diff --git a/.gitattributes b/.gitattributes index 2b65f6fe3cc80..e2211a2af515e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,7 @@ *.bat text eol=crlf *.cmd text eol=crlf +*.java text eol=lf +*.scala text eol=lf +*.xml text eol=lf +*.py text eol=lf +*.R text eol=lf diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index a9f757c3e2413..d53119ad75599 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -4,6 +4,9 @@ on: push: branches: - master + pull_request: + branches: + - master jobs: build: @@ -12,16 +15,105 @@ jobs: strategy: matrix: java: [ '1.8', '11' ] - name: Build Spark with JDK ${{ matrix.java }} + hadoop: [ 'hadoop-2.7', 'hadoop-3.2' ] + hive: [ 'hive-1.2', 'hive-2.3' ] + exclude: + - java: '11' + hive: 'hive-1.2' + - hadoop: 'hadoop-3.2' + hive: 'hive-1.2' + name: Build Spark - JDK${{ matrix.java }}/${{ matrix.hadoop }}/${{ matrix.hive }} steps: - uses: actions/checkout@master + # We split caches because GitHub Action Cache has a 400MB-size limit. 
+ - uses: actions/cache@v1 + with: + path: build + key: build-${{ hashFiles('**/pom.xml') }} + restore-keys: | + build- + - uses: actions/cache@v1 + with: + path: ~/.m2/repository/com + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-maven-com- + - uses: actions/cache@v1 + with: + path: ~/.m2/repository/org + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-maven-org- + - uses: actions/cache@v1 + with: + path: ~/.m2/repository/net + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-maven-net- + - uses: actions/cache@v1 + with: + path: ~/.m2/repository/io + key: ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ matrix.java }}-${{ matrix.hadoop }}-maven-io- - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v1 with: - version: ${{ matrix.java }} + java-version: ${{ matrix.java }} - name: Build with Maven run: | - export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" + export MAVEN_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" export MAVEN_CLI_OPTS="--no-transfer-progress" - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-3.2 -Phadoop-cloud -Djava.version=${{ matrix.java }} package + mkdir -p ~/.m2 + ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -P${{ matrix.hive }} -Phive-thriftserver -P${{ matrix.hadoop }} -Phadoop-cloud -Djava.version=${{ matrix.java }} install + rm -rf ~/.m2/repository/org/apache/spark + + + lint: + runs-on: ubuntu-latest + name: Linters (Java/Scala/Python), licenses, dependencies + steps: + - uses: actions/checkout@master 
+ - uses: actions/setup-java@v1 + with: + java-version: '11' + - uses: actions/setup-python@v1 + with: + python-version: '3.x' + architecture: 'x64' + - name: Scala + run: ./dev/lint-scala + - name: Java + run: ./dev/lint-java + - name: Python + run: | + pip install flake8 sphinx numpy + ./dev/lint-python + - name: License + run: ./dev/check-license + - name: Dependencies + run: ./dev/test-dependencies.sh + + lintr: + runs-on: ubuntu-latest + name: Linter (R) + steps: + - uses: actions/checkout@master + - uses: actions/setup-java@v1 + with: + java-version: '11' + - name: install R + run: | + echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/' | sudo tee -a /etc/apt/sources.list + curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xE298A3A825C0D65DFD57CBB651716619E084DAB9" | sudo apt-key add + sudo apt-get update + sudo apt-get install -y r-base r-base-dev libcurl4-openssl-dev + - name: install R packages + run: | + sudo Rscript -e "install.packages(c('curl', 'xml2', 'httr', 'devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" + sudo Rscript -e "devtools::install_github('jimhester/lintr@v2.0.0')" + - name: package and install SparkR + run: ./R/install-dev.sh + - name: lint-r + run: ./dev/lint-r diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000000000..8e2f5bf3b0818 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,24 @@ +name: Close stale PRs + +on: + schedule: + - cron: "0 0 * * *" + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v1.1.0 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: > + We're closing this PR because it hasn't been updated in a while. + This isn't a judgement on the merit of the PR in any way. It's just + a way of keeping the PR queue manageable. 
+ + If you'd like to revive this PR, please reopen it and ask a + committer to remove the Stale tag! + days-before-stale: 100 + # Setting this to 0 is the same as setting it to 1. + # See: https://github.com/actions/stale/issues/28 + days-before-close: 0 diff --git a/.gitignore b/.gitignore index ae20c85ebe351..798e8acc4d43b 100644 --- a/.gitignore +++ b/.gitignore @@ -45,7 +45,7 @@ dev/create-release/*final dev/create-release/*txt dev/pr-deps/ dist/ -docs/_site +docs/_site/ docs/api sql/docs sql/site @@ -63,6 +63,7 @@ project/plugins/target/ python/lib/pyspark.zip python/.eggs/ python/deps +python/docs/_site/ python/test_coverage/coverage_data python/test_coverage/htmlcov python/pyspark/python diff --git a/LICENSE b/LICENSE index 150ccc54ec6c2..6b169b1447f14 100644 --- a/LICENSE +++ b/LICENSE @@ -216,6 +216,7 @@ core/src/main/resources/org/apache/spark/ui/static/bootstrap* core/src/main/resources/org/apache/spark/ui/static/jsonFormatter* core/src/main/resources/org/apache/spark/ui/static/vis* docs/js/vendor/bootstrap.js +external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java Python Software Foundation License @@ -243,7 +244,7 @@ MIT License core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js core/src/main/resources/org/apache/spark/ui/static/*dataTables* core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js -ore/src/main/resources/org/apache/spark/ui/static/jquery* +core/src/main/resources/org/apache/spark/ui/static/jquery* core/src/main/resources/org/apache/spark/ui/static/sorttable.js docs/js/vendor/anchor.min.js docs/js/vendor/jquery* diff --git a/LICENSE-binary b/LICENSE-binary index ba20eea118687..b50da6be4e697 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -218,13 +218,14 @@ javax.jdo:jdo-api joda-time:joda-time net.sf.opencsv:opencsv org.apache.derby:derby +org.ehcache:ehcache org.objenesis:objenesis org.roaringbitmap:RoaringBitmap org.scalanlp:breeze-macros_2.12 
org.scalanlp:breeze_2.12 org.typelevel:macro-compat_2.12 org.yaml:snakeyaml -org.apache.xbean:xbean-asm5-shaded +org.apache.xbean:xbean-asm7-shaded com.squareup.okhttp3:logging-interceptor com.squareup.okhttp3:okhttp com.squareup.okio:okio @@ -242,10 +243,10 @@ com.vlkan:flatbuffers com.ning:compress-lzf io.airlift:aircompressor io.dropwizard.metrics:metrics-core -io.dropwizard.metrics:metrics-ganglia io.dropwizard.metrics:metrics-graphite io.dropwizard.metrics:metrics-json io.dropwizard.metrics:metrics-jvm +io.dropwizard.metrics:metrics-jmx org.iq80.snappy:snappy com.clearspring.analytics:stream com.jamesmurty.utils:java-xmlbuilder @@ -253,12 +254,14 @@ commons-codec:commons-codec commons-collections:commons-collections io.fabric8:kubernetes-client io.fabric8:kubernetes-model +io.fabric8:kubernetes-model-common io.netty:netty-all net.hydromatic:eigenbase-properties net.sf.supercsv:super-csv org.apache.arrow:arrow-format org.apache.arrow:arrow-memory org.apache.arrow:arrow-vector +org.apache.commons:commons-configuration2 org.apache.commons:commons-crypto org.apache.commons:commons-lang3 org.apache.hadoop:hadoop-annotations @@ -266,6 +269,7 @@ org.apache.hadoop:hadoop-auth org.apache.hadoop:hadoop-client org.apache.hadoop:hadoop-common org.apache.hadoop:hadoop-hdfs +org.apache.hadoop:hadoop-hdfs-client org.apache.hadoop:hadoop-mapreduce-client-app org.apache.hadoop:hadoop-mapreduce-client-common org.apache.hadoop:hadoop-mapreduce-client-core @@ -278,6 +282,21 @@ org.apache.hadoop:hadoop-yarn-server-common org.apache.hadoop:hadoop-yarn-server-web-proxy org.apache.httpcomponents:httpclient org.apache.httpcomponents:httpcore +org.apache.kerby:kerb-admin +org.apache.kerby:kerb-client +org.apache.kerby:kerb-common +org.apache.kerby:kerb-core +org.apache.kerby:kerb-crypto +org.apache.kerby:kerb-identity +org.apache.kerby:kerb-server +org.apache.kerby:kerb-simplekdc +org.apache.kerby:kerb-util +org.apache.kerby:kerby-asn1 +org.apache.kerby:kerby-config 
+org.apache.kerby:kerby-pkix +org.apache.kerby:kerby-util +org.apache.kerby:kerby-xdr +org.apache.kerby:token-provider org.apache.orc:orc-core org.apache.orc:orc-mapreduce org.mortbay.jetty:jetty @@ -292,16 +311,24 @@ com.fasterxml.jackson.core:jackson-annotations com.fasterxml.jackson.core:jackson-core com.fasterxml.jackson.core:jackson-databind com.fasterxml.jackson.dataformat:jackson-dataformat-yaml +com.fasterxml.jackson.jaxrs:jackson-jaxrs-base +com.fasterxml.jackson.jaxrs:jackson-jaxrs-json-provider com.fasterxml.jackson.module:jackson-module-jaxb-annotations com.fasterxml.jackson.module:jackson-module-paranamer com.fasterxml.jackson.module:jackson-module-scala_2.12 +com.fasterxml.woodstox:woodstox-core com.github.mifmif:generex +com.github.stephenc.jcip:jcip-annotations com.google.code.findbugs:jsr305 com.google.code.gson:gson +com.google.flatbuffers:flatbuffers-java +com.google.guava:guava com.google.inject:guice com.google.inject.extensions:guice-servlet +com.nimbusds:nimbus-jose-jwt com.twitter:parquet-hadoop-bundle commons-cli:commons-cli +commons-daemon:commons-daemon commons-dbcp:commons-dbcp commons-io:commons-io commons-lang:commons-lang @@ -313,6 +340,8 @@ javax.inject:javax.inject javax.validation:validation-api log4j:apache-log4j-extras log4j:log4j +net.minidev:accessors-smart +net.minidev:json-smart net.sf.jpam:jpam org.apache.avro:avro org.apache.avro:avro-ipc @@ -328,6 +357,7 @@ org.apache.directory.server:apacheds-i18n org.apache.directory.server:apacheds-kerberos-codec org.apache.htrace:htrace-core org.apache.ivy:ivy +org.apache.geronimo.specs:geronimo-jcache_1.0_spec org.apache.mesos:mesos org.apache.parquet:parquet-column org.apache.parquet:parquet-common @@ -343,11 +373,6 @@ org.datanucleus:datanucleus-api-jdo org.datanucleus:datanucleus-core org.datanucleus:datanucleus-rdbms org.lz4:lz4-java -org.spark-project.hive:hive-beeline -org.spark-project.hive:hive-cli -org.spark-project.hive:hive-exec -org.spark-project.hive:hive-jdbc 
-org.spark-project.hive:hive-metastore org.xerial.snappy:snappy-java stax:stax-api xerces:xercesImpl @@ -368,7 +393,27 @@ org.eclipse.jetty:jetty-util org.eclipse.jetty:jetty-webapp org.eclipse.jetty:jetty-xml org.scala-lang.modules:scala-xml_2.12 -org.opencypher:okapi-shade +com.github.joshelser:dropwizard-metrics-hadoop-metrics2-reporter +com.zaxxer.HikariCP +org.apache.hive:hive-beeline +org.apache.hive:hive-cli +org.apache.hive:hive-common +org.apache.hive:hive-exec +org.apache.hive:hive-jdbc +org.apache.hive:hive-llap-common +org.apache.hive:hive-metastore +org.apache.hive:hive-serde +org.apache.hive:hive-service-rpc +org.apache.hive:hive-shims-0.23 +org.apache.hive:hive-shims +org.apache.hive:hive-common +org.apache.hive:hive-shims-scheduler +org.apache.hive:hive-storage-api +org.apache.hive:hive-vector-code-gen +org.datanucleus:javax.jdo +com.tdunning:json +org.apache.velocity:velocity +org.apache.yetus:audience-annotations core/src/main/java/org/apache/spark/util/collection/TimSort.java core/src/main/resources/org/apache/spark/ui/static/bootstrap* @@ -387,6 +432,7 @@ BSD 2-Clause ------------ com.github.luben:zstd-jni +dnsjava:dnsjava javolution:javolution com.esotericsoftware:kryo-shaded com.esotericsoftware:minlog @@ -394,8 +440,11 @@ com.esotericsoftware:reflectasm com.google.protobuf:protobuf-java org.codehaus.janino:commons-compiler org.codehaus.janino:janino +org.codehaus.woodstox:stax2-api jline:jline org.jodd:jodd-core +com.github.wendykierp:JTransforms +pl.edu.icm:JLargeArrays BSD 3-Clause @@ -408,6 +457,7 @@ org.antlr:stringtemplate org.antlr:antlr4-runtime antlr:antlr com.github.fommil.netlib:core +com.google.re2j:re2j com.thoughtworks.paranamer:paranamer org.scala-lang:scala-compiler org.scala-lang:scala-library @@ -419,6 +469,7 @@ xmlenc:xmlenc net.sf.py4j:py4j org.jpmml:pmml-model org.jpmml:pmml-schema +org.threeten:threeten-extra python/lib/py4j-*-src.zip python/pyspark/cloudpickle.py @@ -433,8 +484,13 @@ is distributed under the 3-Clause BSD 
license. MIT License ----------- -org.spire-math:spire-macros_2.12 -org.spire-math:spire_2.12 +com.microsoft.sqlserver:mssql-jdbc +org.typelevel:spire_2.12 +org.typelevel:spire-macros_2.12 +org.typelevel:spire-platform_2.12 +org.typelevel:spire-util_2.12 +org.typelevel:algebra_2.12:jar +org.typelevel:cats-kernel_2.12 org.typelevel:machinist_2.12 net.razorvine:pyrolite org.slf4j:jcl-over-slf4j @@ -446,7 +502,7 @@ com.github.scopt:scopt_2.12 core/src/main/resources/org/apache/spark/ui/static/dagre-d3.min.js core/src/main/resources/org/apache/spark/ui/static/*dataTables* core/src/main/resources/org/apache/spark/ui/static/graphlib-dot.min.js -ore/src/main/resources/org/apache/spark/ui/static/jquery* +core/src/main/resources/org/apache/spark/ui/static/jquery* core/src/main/resources/org/apache/spark/ui/static/sorttable.js docs/js/vendor/anchor.min.js docs/js/vendor/jquery* @@ -458,6 +514,7 @@ Common Development and Distribution License (CDDL) 1.0 javax.activation:activation http://www.oracle.com/technetwork/java/javase/tech/index-jsp-138795.html javax.xml.stream:stax-api https://jcp.org/en/jsr/detail?id=173 +javax.transaction:javax.transaction-api Common Development and Distribution License (CDDL) 1.1 @@ -465,6 +522,7 @@ Common Development and Distribution License (CDDL) 1.1 javax.el:javax.el-api https://javaee.github.io/uel-ri/ javax.servlet:javax.servlet-api https://javaee.github.io/servlet-spec/ +javax.servlet.jsp:jsp-api javax.transaction:jta http://www.oracle.com/technetwork/java/index.html javax.xml.bind:jaxb-api https://github.com/javaee/jaxb-v2 org.glassfish.hk2:hk2-api https://github.com/javaee/glassfish @@ -486,6 +544,7 @@ Eclipse Distribution License (EDL) 1.0 -------------------------------------- org.glassfish.jaxb:jaxb-runtime +jakarta.activation:jakarta.activation-api jakarta.xml.bind:jakarta.xml.bind-api com.sun.istack:istack-commons-runtime @@ -495,11 +554,7 @@ Eclipse Public License (EPL) 2.0 jakarta.annotation:jakarta-annotation-api 
https://projects.eclipse.org/projects/ee4j.ca jakarta.ws.rs:jakarta.ws.rs-api https://github.com/eclipse-ee4j/jaxrs-api - -Mozilla Public License (MPL) 1.1 --------------------------------- - -com.github.rwl:jtransforms https://sourceforge.net/projects/jtransforms/ +org.glassfish.hk2.external:jakarta.inject Python Software Foundation License diff --git a/NOTICE b/NOTICE index fefe08b38afc5..d5ea8dedb311b 100644 --- a/NOTICE +++ b/NOTICE @@ -26,3 +26,16 @@ The following provides more details on the included cryptographic software: This software uses Apache Commons Crypto (https://commons.apache.org/proper/commons-crypto/) to support authentication, and encryption and decryption of data sent across the network between services. + + +Metrics +Copyright 2010-2013 Coda Hale and Yammer, Inc. + +This product includes software developed by Coda Hale and Yammer, Inc. + +This product includes code derived from the JSR-166 project (ThreadLocalRandom, Striped64, +LongAdder), which was released with the following comments: + + Written by Doug Lea with assistance from members of JCP JSR-166 + Expert Group and released to the public domain, as explained at + http://creativecommons.org/publicdomain/zero/1.0/ \ No newline at end of file diff --git a/NOTICE-binary b/NOTICE-binary index f93e088a9a731..4ce8bf2f86b2a 100644 --- a/NOTICE-binary +++ b/NOTICE-binary @@ -65,8 +65,8 @@ Copyright 2009-2014 The Apache Software Foundation Objenesis Copyright 2006-2013 Joe Walnes, Henri Tremblay, Leonardo Mesquita -Apache XBean :: ASM 5 shaded (repackaged) -Copyright 2005-2015 The Apache Software Foundation +Apache XBean :: ASM shaded (repackaged) +Copyright 2005-2019 The Apache Software Foundation -------------------------------------- @@ -661,6 +661,9 @@ Copyright 2017 The Apache Software Foundation Apache Commons CLI Copyright 2001-2009 The Apache Software Foundation +Apache Commons Daemon +Copyright 1999-2019 The Apache Software Foundation + Google Guice - Extensions - Servlet Copyright 
2006-2011 Google, Inc. @@ -1135,4 +1138,393 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the License. \ No newline at end of file +limitations under the License. + +dropwizard-metrics-hadoop-metrics2-reporter +Copyright 2016 Josh Elser + +Hive Beeline +Copyright 2019 The Apache Software Foundation + +Hive CLI +Copyright 2019 The Apache Software Foundation + +Hive Common +Copyright 2019 The Apache Software Foundation + +Hive JDBC +Copyright 2019 The Apache Software Foundation + +Hive Query Language +Copyright 2019 The Apache Software Foundation + +Hive Llap Common +Copyright 2019 The Apache Software Foundation + +Hive Metastore +Copyright 2019 The Apache Software Foundation + +Hive Serde +Copyright 2019 The Apache Software Foundation + +Hive Service RPC +Copyright 2019 The Apache Software Foundation + +Hive Shims +Copyright 2019 The Apache Software Foundation + +Hive Shims 0.23 +Copyright 2019 The Apache Software Foundation + +Hive Shims Common +Copyright 2019 The Apache Software Foundation + +Hive Shims Scheduler +Copyright 2019 The Apache Software Foundation + +Hive Storage API +Copyright 2018 The Apache Software Foundation + +Hive Vector-Code-Gen Utilities +Copyright 2019 The Apache Software Foundation + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2015-2015 DataNucleus + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Android JSON library +Copyright (C) 2010 The Android Open Source Project + +This product includes software developed by +The Android Open Source Project + +Apache Velocity + +Copyright (C) 2000-2007 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Apache Yetus - Audience Annotations +Copyright 2015-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Ehcache V3 +Copyright 2014-2016 Terracotta, Inc. + +The product includes software from the Apache Commons Lang project, +under the Apache License 2.0 (see: org.ehcache.impl.internal.classes.commonslang) + +Apache Geronimo JCache Spec 1.0 +Copyright 2003-2014 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Admin +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Client +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
+ + +Kerby-kerb Common +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb core +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Crypto +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Identity +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Server +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerb Simple Kdc +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby-kerb Util +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby ASN1 Project +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby Config +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby PKIX Project +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Kerby Util +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). 
+ + +Kerby XDR Project +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Token provider +Copyright 2014-2017 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + + +Metrics +Copyright 2010-2013 Coda Hale and Yammer, Inc. + +This product includes software developed by Coda Hale and Yammer, Inc. + +This product includes code derived from the JSR-166 project (ThreadLocalRandom, Striped64, +LongAdder), which was released with the following comments: + + Written by Doug Lea with assistance from members of JCP JSR-166 + Expert Group and released to the public domain, as explained at + http://creativecommons.org/publicdomain/zero/1.0/ \ No newline at end of file diff --git a/R/check-cran.sh b/R/check-cran.sh index 22cc9c6b601fc..22c8f423cfd12 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -65,6 +65,10 @@ fi echo "Running CRAN check with $CRAN_CHECK_OPTIONS options" +# Remove this environment variable to allow to check suggested packages once +# Jenkins installs arrow. See SPARK-29339. 
+export _R_CHECK_FORCE_SUGGESTS_=FALSE + if [ -n "$NO_TESTS" ] && [ -n "$NO_MANUAL" ] then "$R_SCRIPT_PATH/R" CMD check $CRAN_CHECK_OPTIONS "SparkR_$VERSION.tar.gz" diff --git a/R/pkg/.lintr b/R/pkg/.lintr index c83ad2adfe0ef..67dc1218ea551 100644 --- a/R/pkg/.lintr +++ b/R/pkg/.lintr @@ -1,2 +1,2 @@ -linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, object_name_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE)) +linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, object_name_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE), object_usage_linter = NULL, cyclocomp_linter = NULL) exclusions: list("inst/profile/general.R" = 1, "inst/profile/shell.R") diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index f4780862099d3..c8cb1c3a992ad 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -22,7 +22,8 @@ Suggests: rmarkdown, testthat, e1071, - survival + survival, + arrow Collate: 'schema.R' 'generics.R' @@ -61,3 +62,4 @@ Collate: RoxygenNote: 5.0.1 VignetteBuilder: knitr NeedsCompilation: no +Encoding: UTF-8 diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index f9d9494ca6fa1..7ed2e36d59531 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -335,6 +335,7 @@ exportMethods("%<=>%", "ntile", "otherwise", "over", + "overlay", "percent_rank", "pmod", "posexplode", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 6f3c7c120ba3c..593d3ca16220d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2252,7 +2252,7 @@ setMethod("mutate", # The last column of the same name in the specific columns takes effect deDupCols <- list() - for (i in 1:length(cols)) { + for (i in seq_len(length(cols))) { deDupCols[[ns[[i]]]] <- alias(cols[[i]], ns[[i]]) } @@ -2416,7 +2416,7 @@ setMethod("arrange", # builds a list of columns of type Column # example: 
[[1]] Column Species ASC # [[2]] Column Petal_Length DESC - jcols <- lapply(seq_len(length(decreasing)), function(i){ + jcols <- lapply(seq_len(length(decreasing)), function(i) { if (decreasing[[i]]) { desc(getColumn(x, by[[i]])) } else { @@ -2749,7 +2749,7 @@ genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) { col <- getColumn(x, colName) if (colName %in% intersectedColNames) { newJoin <- paste(colName, suffix, sep = "") - if (newJoin %in% allColNames){ + if (newJoin %in% allColNames) { stop("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", "Please use different suffixes for the intersected columns.") } @@ -3475,7 +3475,7 @@ setMethod("str", cat(paste0("'", class(object), "': ", length(names), " variables:\n")) if (nrow(localDF) > 0) { - for (i in 1 : ncol(localDF)) { + for (i in seq_len(ncol(localDF))) { # Get the first elements for each column firstElements <- if (types[i] == "character") { diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 43ea27b359a9c..c6842912706af 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -148,19 +148,7 @@ getDefaultSqlSource <- function() { } writeToFileInArrow <- function(fileName, rdf, numPartitions) { - requireNamespace1 <- requireNamespace - - # R API in Arrow is not yet released in CRAN. CRAN requires to add the - # package in requireNamespace at DESCRIPTION. Later, CRAN checks if the package is available - # or not. Therefore, it works around by avoiding direct requireNamespace. - # Currently, as of Arrow 0.12.0, it can be installed by install_github. See ARROW-3204. 
- if (requireNamespace1("arrow", quietly = TRUE)) { - record_batch <- get("record_batch", envir = asNamespace("arrow"), inherits = FALSE) - RecordBatchStreamWriter <- get( - "RecordBatchStreamWriter", envir = asNamespace("arrow"), inherits = FALSE) - FileOutputStream <- get( - "FileOutputStream", envir = asNamespace("arrow"), inherits = FALSE) - + if (requireNamespace("arrow", quietly = TRUE)) { numPartitions <- if (!is.null(numPartitions)) { numToInt(numPartitions) } else { @@ -176,11 +164,11 @@ writeToFileInArrow <- function(fileName, rdf, numPartitions) { stream_writer <- NULL tryCatch({ for (rdf_slice in rdf_slices) { - batch <- record_batch(rdf_slice) + batch <- arrow::record_batch(rdf_slice) if (is.null(stream_writer)) { - stream <- FileOutputStream(fileName) + stream <- arrow::FileOutputStream$create(fileName) schema <- batch$schema - stream_writer <- RecordBatchStreamWriter(stream, schema) + stream_writer <- arrow::RecordBatchStreamWriter$create(stream, schema) } stream_writer$write_batch(batch) @@ -209,7 +197,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { as.list(schema) } if (is.null(names)) { - names <- lapply(1:length(firstRow), function(x) { + names <- lapply(seq_len(length(firstRow)), function(x) { paste0("_", as.character(x)) }) } @@ -225,7 +213,7 @@ getSchema <- function(schema, firstRow = NULL, rdd = NULL) { }) types <- lapply(firstRow, infer_type) - fields <- lapply(1:length(firstRow), function(i) { + fields <- lapply(seq_len(length(firstRow)), function(i) { structField(names[[i]], types[[i]], TRUE) }) schema <- do.call(structType, fields) @@ -568,7 +556,6 @@ tableToDF <- function(tableName) { #' stringSchema <- "name STRING, info MAP" #' df4 <- read.df(mapTypeJsonPath, "json", stringSchema, multiLine = TRUE) #' } -#' @name read.df #' @note read.df since 1.4.0 read.df <- function(path = NULL, source = NULL, schema = NULL, na.strings = "NA", ...) 
{ if (!is.null(path) && !is.character(path)) { @@ -699,7 +686,6 @@ read.jdbc <- function(url, tableName, #' stringSchema <- "name STRING, info MAP" #' df1 <- read.stream("json", path = jsonDir, schema = stringSchema, maxFilesPerTrigger = 1) #' } -#' @name read.stream #' @note read.stream since 2.2.0 #' @note experimental read.stream <- function(source = NULL, schema = NULL, ...) { diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 51ae2d2954a9a..d96a287f818a2 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -301,7 +301,7 @@ broadcastRDD <- function(sc, object) { #' Set the checkpoint directory #' #' Set the directory under which RDDs are going to be checkpointed. The -#' directory must be a HDFS path if running on a cluster. +#' directory must be an HDFS path if running on a cluster. #' #' @param sc Spark Context to use #' @param dirName Directory path @@ -416,7 +416,7 @@ spark.getSparkFiles <- function(fileName) { #' @examples #'\dontrun{ #' sparkR.session() -#' doubled <- spark.lapply(1:10, function(x){2 * x}) +#' doubled <- spark.lapply(1:10, function(x) {2 * x}) #'} #' @note spark.lapply since 2.0.0 spark.lapply <- function(list, func) { @@ -446,7 +446,7 @@ setLogLevel <- function(level) { #' Set checkpoint directory #' #' Set the directory under which SparkDataFrame are going to be checkpointed. The directory must be -#' a HDFS path if running on a cluster. +#' an HDFS path if running on a cluster. #' #' @rdname setCheckpointDir #' @param directory Directory path to checkpoint to diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index b38d245a0cca7..ca4a6e342d772 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -232,11 +232,7 @@ readMultipleObjectsWithKeys <- function(inputCon) { } readDeserializeInArrow <- function(inputCon) { - # This is a hack to avoid CRAN check. Arrow is not uploaded into CRAN now. See ARROW-3204. 
- requireNamespace1 <- requireNamespace - if (requireNamespace1("arrow", quietly = TRUE)) { - RecordBatchStreamReader <- get( - "RecordBatchStreamReader", envir = asNamespace("arrow"), inherits = FALSE) + if (requireNamespace("arrow", quietly = TRUE)) { # Arrow drops `as_tibble` since 0.14.0, see ARROW-5190. useAsTibble <- exists("as_tibble", envir = asNamespace("arrow")) @@ -246,7 +242,7 @@ readDeserializeInArrow <- function(inputCon) { # for now. dataLen <- readInt(inputCon) arrowData <- readBin(inputCon, raw(), as.integer(dataLen), endian = "big") - batches <- RecordBatchStreamReader(arrowData)$batches() + batches <- arrow::RecordBatchStreamReader$create(arrowData)$batches() if (useAsTibble) { as_tibble <- get("as_tibble", envir = asNamespace("arrow")) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index eecb84572a30b..48f69d5769620 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -136,6 +136,14 @@ NULL #' format to. See 'Details'. #' } #' @param y Column to compute on. +#' @param pos In \itemize{ +#' \item \code{locate}: a start position of search. +#' \item \code{overlay}: a start position for replacement. +#' } +#' @param len In \itemize{ +#' \item \code{lpad} the maximum length of each output result. +#' \item \code{overlay} a number of bytes to replace. +#' } #' @param ... additional Columns. #' @name column_string_functions #' @rdname column_string_functions @@ -879,8 +887,8 @@ setMethod("factorial", #' #' The function by default returns the first values it sees. It will return the first non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. -#' Note: the function is non-deterministic because its results depends on order of rows which -#' may be non-deterministic after a shuffle. +#' Note: the function is non-deterministic because its results depends on the order of the rows +#' which may be non-deterministic after a shuffle. 
#' #' @param na.rm a logical value indicating whether NA values should be stripped #' before the computation proceeds. @@ -1024,8 +1032,8 @@ setMethod("kurtosis", #' #' The function by default returns the last values it sees. It will return the last non-missing #' value it sees when na.rm is set to true. If all values are missing, then NA is returned. -#' Note: the function is non-deterministic because its results depends on order of rows which -#' may be non-deterministic after a shuffle. +#' Note: the function is non-deterministic because its results depends on the order of the rows +#' which may be non-deterministic after a shuffle. #' #' @param x column to compute on. #' @param na.rm a logical value indicating whether NA values should be stripped @@ -1319,6 +1327,35 @@ setMethod("negate", column(jc) }) +#' @details +#' \code{overlay}: Overlay the specified portion of \code{x} with \code{replace}, +#' starting from byte position \code{pos} of \code{src} and proceeding for +#' \code{len} bytes. +#' +#' @param replace a Column with replacement. +#' +#' @rdname column_string_functions +#' @aliases overlay overlay,Column-method,numericOrColumn-method +#' @note overlay since 3.0.0 +setMethod("overlay", + signature(x = "Column", replace = "Column", pos = "numericOrColumn"), + function(x, replace, pos, len = -1) { + if (is.numeric(pos)) { + pos <- lit(as.integer(pos)) + } + + if (is.numeric(len)) { + len <- lit(as.integer(len)) + } + + jc <- callJStatic( + "org.apache.spark.sql.functions", "overlay", + x@jc, replace@jc, pos@jc, len@jc + ) + + column(jc) + }) + #' @details #' \code{quarter}: Extracts the quarter as an integer from a given date/timestamp/string. #' @@ -2459,7 +2496,6 @@ setMethod("schema_of_csv", signature(x = "characterOrColumn"), #' @note from_utc_timestamp since 1.5.0 setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { - .Deprecated(msg = "from_utc_timestamp is deprecated. 
See SPARK-25496.") jc <- callJStatic("org.apache.spark.sql.functions", "from_utc_timestamp", y@jc, x) column(jc) }) @@ -2518,7 +2554,6 @@ setMethod("next_day", signature(y = "Column", x = "character"), #' @note to_utc_timestamp since 1.5.0 setMethod("to_utc_timestamp", signature(y = "Column", x = "character"), function(y, x) { - .Deprecated(msg = "to_utc_timestamp is deprecated. See SPARK-25496.") jc <- callJStatic("org.apache.spark.sql.functions", "to_utc_timestamp", y@jc, x) column(jc) }) @@ -2819,7 +2854,6 @@ setMethod("window", signature(x = "Column"), #' #' @param substr a character string to be matched. #' @param str a Column where matches are sought for each entry. -#' @param pos start position of search. #' @rdname column_string_functions #' @aliases locate locate,character,Column-method #' @note locate since 1.5.0 @@ -2834,7 +2868,6 @@ setMethod("locate", signature(substr = "character", str = "Column"), #' @details #' \code{lpad}: Left-padded with pad to a length of len. #' -#' @param len maximum length of each output result. #' @param pad a character string to be padded with. #' @rdname column_string_functions #' @aliases lpad lpad,Column,numeric,character-method @@ -3617,11 +3650,11 @@ setMethod("size", #' @details #' \code{slice}: Returns an array containing all the elements in x from the index start -#' (or starting from the end if start is negative) with the specified length. +#' (array indices start at 1, or from the end if start is negative) with the specified length. #' #' @rdname column_collection_functions -#' @param start an index indicating the first element occurring in the result. -#' @param length a number of consecutive elements chosen to the result. +#' @param start the starting index +#' @param length the length of the slice #' @aliases slice slice,Column-method #' @note slice since 2.4.0 setMethod("slice", @@ -3706,7 +3739,7 @@ setMethod("create_map", #' @details #' \code{collect_list}: Creates a list of objects with duplicates. 
#' Note: the function is non-deterministic because the order of collected results depends -#' on order of rows which may be non-deterministic after a shuffle. +#' on the order of the rows which may be non-deterministic after a shuffle. #' #' @rdname column_aggregate_functions #' @aliases collect_list collect_list,Column-method @@ -3727,7 +3760,7 @@ setMethod("collect_list", #' @details #' \code{collect_set}: Creates a list of objects with duplicate elements eliminated. #' Note: the function is non-deterministic because the order of collected results depends -#' on order of rows which may be non-deterministic after a shuffle. +#' on the order of the rows which may be non-deterministic after a shuffle. #' #' @rdname column_aggregate_functions #' @aliases collect_set collect_set,Column-method diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index f849dd172247c..4134d5cecc888 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1149,6 +1149,10 @@ setGeneric("ntile", function(x) { standardGeneric("ntile") }) #' @name NULL setGeneric("n_distinct", function(x, ...) { standardGeneric("n_distinct") }) +#' @rdname column_string_functions +#' @name NULL +setGeneric("overlay", function(x, replace, pos, ...) 
{ standardGeneric("overlay") }) + #' @rdname column_window_functions #' @name NULL setGeneric("percent_rank", function(x = "missing") { standardGeneric("percent_rank") }) diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 6e8f4dc3a7907..2b7995e1e37f6 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -162,7 +162,7 @@ methods <- c("avg", "max", "mean", "min", "sum") #' @note pivot since 2.0.0 setMethod("pivot", signature(x = "GroupedData", colname = "character"), - function(x, colname, values = list()){ + function(x, colname, values = list()) { stopifnot(length(colname) == 1) if (length(values) == 0) { result <- callJMethod(x@sgd, "pivot", colname) diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R index 9a77b07462585..d238ff93ed245 100644 --- a/R/pkg/R/mllib_recommendation.R +++ b/R/pkg/R/mllib_recommendation.R @@ -82,6 +82,12 @@ setClass("ALSModel", representation(jobj = "jobj")) #' statsS <- summary(modelS) #' } #' @note spark.als since 2.1.0 +#' @note the input rating dataframe to the ALS implementation should be deterministic. +#' Nondeterministic data can cause failure during fitting ALS model. For example, +#' an order-sensitive operation like sampling after a repartition makes dataframe output +#' nondeterministic, like \code{sample(repartition(df, 2L), FALSE, 0.5, 1618L)}. +#' Checkpointing sampled dataframe or adding a sort before sampling can help make the +#' dataframe deterministic. 
setMethod("spark.als", signature(data = "SparkDataFrame"), function(data, ratingCol = "rating", userCol = "user", itemCol = "item", rank = 10, regParam = 0.1, maxIter = 10, nonnegative = FALSE, diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index ff16b436217dc..f6aa48f5fa04a 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -393,6 +393,7 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' "error" (throw an error), "keep" (put invalid data in #' a special additional bucket, at index numLabels). Default #' is "error". +#' @param bootstrap Whether bootstrap samples are used when building trees. #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. @@ -428,7 +429,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo featureSubsetStrategy = "auto", seed = NULL, subsamplingRate = 1.0, minInstancesPerNode = 1, minInfoGain = 0.0, checkpointInterval = 10, maxMemoryInMB = 256, cacheNodeIds = FALSE, - handleInvalid = c("error", "keep", "skip")) { + handleInvalid = c("error", "keep", "skip"), + bootstrap = TRUE) { type <- match.arg(type) formula <- paste(deparse(formula), collapse = "") if (!is.null(seed)) { @@ -445,7 +447,8 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo as.numeric(minInfoGain), as.integer(checkpointInterval), as.character(featureSubsetStrategy), seed, as.numeric(subsamplingRate), - as.integer(maxMemoryInMB), as.logical(cacheNodeIds)) + as.integer(maxMemoryInMB), as.logical(cacheNodeIds), + as.logical(bootstrap)) new("RandomForestRegressionModel", jobj = jobj) }, classification = { @@ -460,7 +463,7 @@ setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "fo as.character(featureSubsetStrategy), seed, as.numeric(subsamplingRate), as.integer(maxMemoryInMB), 
as.logical(cacheNodeIds), - handleInvalid) + handleInvalid, as.logical(bootstrap)) new("RandomForestClassificationModel", jobj = jobj) } ) diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 0d6f32c8f7e1f..cb3c1c59d12ed 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -222,15 +222,11 @@ writeArgs <- function(con, args) { } writeSerializeInArrow <- function(conn, df) { - # This is a hack to avoid CRAN check. Arrow is not uploaded into CRAN now. See ARROW-3204. - requireNamespace1 <- requireNamespace - if (requireNamespace1("arrow", quietly = TRUE)) { - write_arrow <- get("write_arrow", envir = asNamespace("arrow"), inherits = FALSE) - + if (requireNamespace("arrow", quietly = TRUE)) { # There looks no way to send each batch in streaming format via socket # connection. See ARROW-4512. # So, it writes the whole Arrow streaming-formatted binary at once for now. - writeRaw(conn, write_arrow(df, raw())) + writeRaw(conn, arrow::write_arrow(df, raw())) } else { stop("'arrow' package should be installed.") } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 31b986c326d0c..cdb59093781fb 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -266,11 +266,12 @@ sparkR.sparkContext <- function( #' df <- read.json(path) #' #' sparkR.session("local[2]", "SparkR", "/home/spark") -#' sparkR.session("yarn-client", "SparkR", "/home/spark", -#' list(spark.executor.memory="4g"), +#' sparkR.session("yarn", "SparkR", "/home/spark", +#' list(spark.executor.memory="4g", spark.submit.deployMode="client"), #' c("one.jar", "two.jar", "three.jar"), #' c("com.databricks:spark-avro_2.12:2.0.1")) -#' sparkR.session(spark.master = "yarn-client", spark.executor.memory = "4g") +#' sparkR.session(spark.master = "yarn", spark.submit.deployMode = "client", +# spark.executor.memory = "4g") #'} #' @note sparkR.session since 2.0.0 sparkR.session <- function( diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index c3501977e64bc..a8c1ddb3dd20b 100644 --- a/R/pkg/R/utils.R +++ 
b/R/pkg/R/utils.R @@ -131,7 +131,7 @@ hashCode <- function(key) { } else { asciiVals <- sapply(charToRaw(key), function(x) { strtoi(x, 16L) }) hashC <- 0 - for (k in 1:length(asciiVals)) { + for (k in seq_len(length(asciiVals))) { hashC <- mult31AndAdd(hashC, asciiVals[k]) } as.integer(hashC) @@ -543,10 +543,14 @@ processClosure <- function(node, oldEnv, defVars, checkedFuncs, newEnv) { funcList <- mget(nodeChar, envir = checkedFuncs, inherits = F, ifnotfound = list(list(NULL)))[[1]] found <- sapply(funcList, function(func) { - ifelse(identical(func, obj), TRUE, FALSE) + ifelse( + identical(func, obj) && + # Also check if the parent environment is identical to current parent + identical(parent.env(environment(func)), func.env), + TRUE, FALSE) }) if (sum(found) > 0) { - # If function has been examined, ignore. + # If function has been examined ignore break } # Function has not been examined, record it and recursively clean its closure. @@ -724,7 +728,7 @@ assignNewEnv <- function(data) { stopifnot(length(cols) > 0) env <- new.env() - for (i in 1:length(cols)) { + for (i in seq_len(length(cols))) { assign(x = cols[i], value = data[, cols[i], drop = F], envir = env) } env @@ -750,7 +754,7 @@ launchScript <- function(script, combinedArgs, wait = FALSE, stdout = "", stderr if (.Platform$OS.type == "windows") { scriptWithArgs <- paste(script, combinedArgs, sep = " ") # on Windows, intern = F seems to mean output to the console. 
(documentation on this is missing) - shell(scriptWithArgs, translate = TRUE, wait = wait, intern = wait) # nolint + shell(scriptWithArgs, translate = TRUE, wait = wait, intern = wait) } else { # http://stat.ethz.ch/R-manual/R-devel/library/base/html/system2.html # stdout = F means discard output diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 80dc4ee634512..1ef05ea621e83 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -50,7 +50,7 @@ compute <- function(mode, partition, serializer, deserializer, key, } else { # Check to see if inputData is a valid data.frame stopifnot(deserializer == "byte" || deserializer == "arrow") - stopifnot(class(inputData) == "data.frame") + stopifnot(is.data.frame(inputData)) } if (mode == 2) { @@ -194,7 +194,7 @@ if (isEmpty != 0) { } else { # gapply mode outputs <- list() - for (i in 1:length(data)) { + for (i in seq_len(length(data))) { # Timing reading input data for execution inputElap <- elapsedSecs() output <- compute(mode, partition, serializer, deserializer, keys[[i]], diff --git a/R/pkg/tests/fulltests/data/test_utils_utf.json b/R/pkg/tests/fulltests/data/test_utils_utf.json new file mode 100644 index 0000000000000..b78352ee52ef1 --- /dev/null +++ b/R/pkg/tests/fulltests/data/test_utils_utf.json @@ -0,0 +1,4 @@ +{"name": "안녕하세요"} +{"name": "您好", "age": 30} +{"name": "こんにちは", "age": 19} +{"name": "Xin chào"} diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index eb8d2a700e1ea..6be04b321e985 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -25,7 +25,8 @@ test_that("Check masked functions", { namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var", "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset", "summary", "transform", "drop", "window", "as.data.frame", "union", "not") - if (as.numeric(R.version$major) >= 3 && 
as.numeric(R.version$minor) >= 3) { + version <- packageVersion("base") + if (as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3) { namesOfMasked <- c("endsWith", "startsWith", namesOfMasked) } masked <- conflicts(detail = TRUE)$`package:SparkR` @@ -84,6 +85,7 @@ test_that("rdd GC across sparkR.stop", { countRDD(rdd3) countRDD(rdd4) sparkR.session.stop() + expect_true(TRUE) }) test_that("job group functions can be called", { @@ -93,6 +95,7 @@ test_that("job group functions can be called", { clearJobGroup() sparkR.session.stop() + expect_true(TRUE) }) test_that("job description and local properties can be set and got", { @@ -131,6 +134,7 @@ test_that("utility function can be called", { sparkR.sparkContext(master = sparkRTestMaster) setLogLevel("ERROR") sparkR.session.stop() + expect_true(TRUE) }) test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", { @@ -234,4 +238,5 @@ test_that("SPARK-25234: parallelize should not have integer overflow", { # 47000 * 47000 exceeds integer range parallelize(sc, 1:47000, 47000) sparkR.session.stop() + expect_true(TRUE) }) diff --git a/R/pkg/tests/fulltests/test_includePackage.R b/R/pkg/tests/fulltests/test_includePackage.R index f4ea0d1b5cb27..1d16b260c4c52 100644 --- a/R/pkg/tests/fulltests/test_includePackage.R +++ b/R/pkg/tests/fulltests/test_includePackage.R @@ -27,8 +27,8 @@ rdd <- parallelize(sc, nums, 2L) test_that("include inside function", { # Only run the test if plyr is installed. 
- if ("plyr" %in% rownames(installed.packages())) { - suppressPackageStartupMessages(library(plyr)) + if ("plyr" %in% rownames(installed.packages()) && + suppressPackageStartupMessages(suppressWarnings(library(plyr, logical.return = TRUE)))) { generateData <- function(x) { suppressPackageStartupMessages(library(plyr)) attach(airquality) @@ -39,12 +39,13 @@ test_that("include inside function", { data <- lapplyPartition(rdd, generateData) actual <- collectRDD(data) } + expect_true(TRUE) }) test_that("use include package", { # Only run the test if plyr is installed. - if ("plyr" %in% rownames(installed.packages())) { - suppressPackageStartupMessages(library(plyr)) + if ("plyr" %in% rownames(installed.packages()) && + suppressPackageStartupMessages(suppressWarnings(library(plyr, logical.return = TRUE)))) { generateData <- function(x) { attach(airquality) result <- transform(Ozone, logOzone = log(Ozone)) @@ -55,6 +56,7 @@ test_that("use include package", { data <- lapplyPartition(rdd, generateData) actual <- collectRDD(data) } + expect_true(TRUE) }) sparkR.session.stop() diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R index d50de4123aeb0..73f6cfd67cee9 100644 --- a/R/pkg/tests/fulltests/test_mllib_recommendation.R +++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R @@ -31,7 +31,8 @@ test_that("spark.als", { stats <- summary(model) expect_equal(stats$rank, 10) test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) - predictions <- collect(predict(model, test)) + result <- predict(model, test) + predictions <- collect(arrange(result, desc(result$item), result$user)) expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263), tolerance = 1e-4) diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R index ad68700c7ff4e..ee5043a744bba 100644 --- a/R/pkg/tests/fulltests/test_mllib_tree.R +++ 
b/R/pkg/tests/fulltests/test_mllib_tree.R @@ -130,7 +130,7 @@ test_that("spark.randomForest", { # regression data <- suppressWarnings(createDataFrame(longley)) model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, - numTrees = 1, seed = 1) + numTrees = 1, seed = 1, bootstrap = FALSE) predictions <- collect(predict(model, data)) expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187, diff --git a/R/pkg/tests/fulltests/test_sparkR.R b/R/pkg/tests/fulltests/test_sparkR.R index f73fc6baeccef..4232f5ec430f6 100644 --- a/R/pkg/tests/fulltests/test_sparkR.R +++ b/R/pkg/tests/fulltests/test_sparkR.R @@ -36,8 +36,8 @@ test_that("sparkCheckInstall", { # "yarn-client, mesos-client" mode, SPARK_HOME was not set sparkHome <- "" - master <- "yarn-client" - deployMode <- "" + master <- "yarn" + deployMode <- "client" expect_error(sparkCheckInstall(sparkHome, master, deployMode)) sparkHome <- "" master <- "" diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 035525a7a849b..c1d277ac84be1 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -172,7 +172,7 @@ test_that("structField type strings", { typeList <- c(primitiveTypes, complexTypes) typeStrings <- names(typeList) - for (i in seq_along(typeStrings)){ + for (i in seq_along(typeStrings)) { typeString <- typeStrings[i] expected <- typeList[[i]] testField <- structField("_col", typeString) @@ -203,7 +203,7 @@ test_that("structField type strings", { errorList <- c(primitiveErrors, complexErrors) typeStrings <- names(errorList) - for (i in seq_along(typeStrings)){ + for (i in seq_along(typeStrings)) { typeString <- typeStrings[i] expected <- paste0("Unsupported type for SparkDataframe: ", errorList[[i]]) expect_error(structField("_col", typeString), expected) @@ -848,24 +848,31 @@ test_that("collect() and take() on a DataFrame return the same number of rows an }) test_that("collect() support 
Unicode characters", { - lines <- c("{\"name\":\"안녕하세요\"}", - "{\"name\":\"您好\", \"age\":30}", - "{\"name\":\"こんにちは\", \"age\":19}", - "{\"name\":\"Xin chào\"}") + jsonPath <- file.path( + Sys.getenv("SPARK_HOME"), + "R", "pkg", "tests", "fulltests", "data", + "test_utils_utf.json" + ) + + lines <- readLines(jsonPath, encoding = "UTF-8") - jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") - writeLines(lines, jsonPath) + expected <- regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE)) df <- read.df(jsonPath, "json") rdf <- collect(df) expect_true(is.data.frame(rdf)) - expect_equal(rdf$name[1], markUtf8("안녕하세요")) - expect_equal(rdf$name[2], markUtf8("您好")) - expect_equal(rdf$name[3], markUtf8("こんにちは")) - expect_equal(rdf$name[4], markUtf8("Xin chào")) + expect_equal(rdf$name[1], expected[[1]]) + expect_equal(rdf$name[2], expected[[2]]) + expect_equal(rdf$name[3], expected[[3]]) + expect_equal(rdf$name[4], expected[[4]]) df1 <- createDataFrame(rdf) - expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好")) + expect_equal( + collect( + where(df1, df1$name == expected[[2]]) + )$name, + expected[[2]] + ) }) test_that("multiple pipeline transformations result in an RDD with the correct values", { @@ -1375,6 +1382,7 @@ test_that("column operators", { c5 <- c2 ^ c3 ^ c4 c6 <- c2 %<=>% c3 c7 <- !c6 + expect_true(TRUE) }) test_that("column functions", { @@ -1405,6 +1413,8 @@ test_that("column functions", { trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm") c24 <- date_trunc("hour", c) + date_trunc("minute", c) + date_trunc("week", c) + date_trunc("quarter", c) + current_date() + current_timestamp() + c25 <- overlay(c1, c2, c3, c3) + overlay(c1, c2, c3) + overlay(c1, c2, 1) + + overlay(c1, c2, 3, 4) # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) @@ -1800,7 +1810,8 @@ test_that("string operators", { expect_true(first(select(df, endsWith(df$name, "el")))[[1]]) 
expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi") expect_equal(first(select(df, substr(df$name, 4, 6)))[[1]], "hae") - if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) { + version <- packageVersion("base") + if (as.numeric(version$major) >= 3 && as.numeric(version$minor) >= 3) { expect_true(startsWith("Hello World", "Hello")) expect_false(endsWith("Hello World", "a")) } @@ -1905,20 +1916,10 @@ test_that("date functions on a DataFrame", { df2 <- createDataFrame(l2) expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24)) expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34)) - conf <- callJMethod(sparkSession, "conf") - isUtcTimestampFuncEnabled <- callJMethod(conf, "get", "spark.sql.legacy.utcTimestampFunc.enabled") - callJMethod(conf, "set", "spark.sql.legacy.utcTimestampFunc.enabled", "true") - tryCatch({ - # Both from_utc_timestamp and to_utc_timestamp are deprecated as of SPARK-25496 - expect_equal(suppressWarnings(collect(select(df2, from_utc_timestamp(df2$b, "JST"))))[, 1], - c(as.POSIXct("2012-12-13 21:34:00 UTC"), as.POSIXct("2014-12-15 10:24:34 UTC"))) - expect_equal(suppressWarnings(collect(select(df2, to_utc_timestamp(df2$b, "JST"))))[, 1], - c(as.POSIXct("2012-12-13 03:34:00 UTC"), as.POSIXct("2014-12-14 16:24:34 UTC"))) - }, - finally = { - # Reverting the conf back - callJMethod(conf, "set", "spark.sql.legacy.utcTimestampFunc.enabled", isUtcTimestampFuncEnabled) - }) + expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1], + c(as.POSIXct("2012-12-13 21:34:00 UTC"), as.POSIXct("2014-12-15 10:24:34 UTC"))) + expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1], + c(as.POSIXct("2012-12-13 03:34:00 UTC"), as.POSIXct("2014-12-14 16:24:34 UTC"))) expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0) expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0) expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0) @@ 
-3238,6 +3239,13 @@ test_that("Histogram", { expect_equal(histogram(df, "x")$counts, c(4, 0, 0, 0, 0, 0, 0, 0, 0, 1)) }) +test_that("dapply() should show error message from R worker", { + df <- createDataFrame(list(list(n = 1))) + expect_error({ + collect(dapply(df, function(x) stop("custom error message"), structType("a double"))) + }, "custom error message") +}) + test_that("dapply() and dapplyCollect() on a DataFrame", { df <- createDataFrame( list(list(1L, 1, "1"), list(2L, 2, "2"), list(3L, 3, "3")), diff --git a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R index 825c7423e1579..97972753a78fa 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL_arrow.R +++ b/R/pkg/tests/fulltests/test_sparkSQL_arrow.R @@ -101,7 +101,7 @@ test_that("dapply() Arrow optimization", { tryCatch({ ret <- dapply(df, function(rdf) { - stopifnot(class(rdf) == "data.frame") + stopifnot(is.data.frame(rdf)) rdf }, schema(df)) @@ -115,7 +115,7 @@ test_that("dapply() Arrow optimization", { tryCatch({ ret <- dapply(df, function(rdf) { - stopifnot(class(rdf) == "data.frame") + stopifnot(is.data.frame(rdf)) # mtcars' hp is more then 50. stopifnot(all(rdf$hp > 50)) rdf @@ -199,7 +199,7 @@ test_that("gapply() Arrow optimization", { if (length(key) > 0) { stopifnot(is.numeric(key[[1]])) } - stopifnot(class(grouped) == "data.frame") + stopifnot(is.data.frame(grouped)) grouped }, schema(df)) @@ -217,7 +217,7 @@ test_that("gapply() Arrow optimization", { if (length(key) > 0) { stopifnot(is.numeric(key[[1]])) } - stopifnot(class(grouped) == "data.frame") + stopifnot(is.data.frame(grouped)) stopifnot(length(colnames(grouped)) == 11) # mtcars' hp is more then 50. 
stopifnot(all(grouped$hp > 50)) diff --git a/R/pkg/tests/fulltests/test_textFile.R b/R/pkg/tests/fulltests/test_textFile.R index be2d2711ff88e..046018c7c2a2d 100644 --- a/R/pkg/tests/fulltests/test_textFile.R +++ b/R/pkg/tests/fulltests/test_textFile.R @@ -75,6 +75,7 @@ test_that("several transformations on RDD created by textFile()", { collectRDD(rdd) unlink(fileName) + expect_true(TRUE) }) test_that("textFile() followed by a saveAsTextFile() returns the same content", { diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R index b2b6f34aaa085..c3fb9046fcda4 100644 --- a/R/pkg/tests/fulltests/test_utils.R +++ b/R/pkg/tests/fulltests/test_utils.R @@ -89,7 +89,10 @@ test_that("cleanClosure on R functions", { lapply(x, g) + 1 # Test for capturing function call "g"'s closure as a argument of lapply. l$field[1, 1] <- 3 # Test for access operators `$`. res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol. - f(res) # Test for recursive calls. + # Enable once SPARK-30629 is fixed + # nolint start + # f(res) # Test for recursive calls. + # nolint end } newF <- cleanClosure(f) env <- environment(newF) @@ -101,7 +104,10 @@ test_that("cleanClosure on R functions", { # nolint end expect_true("g" %in% ls(env)) expect_true("l" %in% ls(env)) - expect_true("f" %in% ls(env)) + # Enable once SPARK-30629 is fixed + # nolint start + # expect_true("f" %in% ls(env)) + # nolint end expect_equal(get("l", envir = env, inherits = FALSE), l) # "y" should be in the environment of g. 
newG <- get("g", envir = env, inherits = FALSE) @@ -110,6 +116,15 @@ test_that("cleanClosure on R functions", { actual <- get("y", envir = env, inherits = FALSE) expect_equal(actual, y) + # Test for combination for nested and sequenctial functions in a closure + f1 <- function(x) x + 1 + f2 <- function(x) f1(x) + 2 + userFunc <- function(x) { f1(x); f2(x) } + cUserFuncEnv <- environment(cleanClosure(userFunc)) + expect_equal(length(cUserFuncEnv), 2) + innerCUserFuncEnv <- environment(cUserFuncEnv$f2) + expect_equal(length(innerCUserFuncEnv), 1) + # Test for function (and variable) definitions. f <- function(x) { g <- function(y) { y * 2 } diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 1e96418558883..bf02ecdad66ff 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -20,7 +20,6 @@ library(SparkR) # SPARK-25572 if (identical(Sys.getenv("NOT_CRAN"), "true")) { - # Turn all warnings into errors options("warn" = 2) @@ -60,11 +59,23 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) { if (identical(Sys.getenv("NOT_CRAN"), "true")) { # set random seed for predictable results. 
mostly for base's sample() in tree and classification set.seed(42) - # for testthat 1.0.2 later, change reporter from "summary" to default_reporter() - testthat:::run_tests("SparkR", - file.path(sparkRDir, "pkg", "tests", "fulltests"), - NULL, - "summary") + + # TODO (SPARK-30663) To be removed once testthat 1.x is removed from all builds + if (grepl("^1\\..*", packageVersion("testthat"))) { + # testthat 1.x + test_runner <- testthat:::run_tests + reporter <- "summary" + + } else { + # testthat >= 2.0.0 + test_runner <- testthat:::test_package_dir + reporter <- testthat::default_reporter() + } + + test_runner("SparkR", + file.path(sparkRDir, "pkg", "tests", "fulltests"), + NULL, + reporter) } SparkR:::uninstallDownloadedSpark() diff --git a/R/run-tests.sh b/R/run-tests.sh index 86bd8aad5f113..51ca7d600caf0 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -23,7 +23,7 @@ FAILED=0 LOGFILE=$FWDIR/unit-tests.out rm -f $LOGFILE -SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE +SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE FAILED=$((PIPESTATUS[0]||$FAILED)) NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)" diff --git a/README.md b/README.md index 29777a5962bc2..d7931263b0fc7 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ and Structured Streaming for stream processing. 
-[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7) +[![Jenkins Build](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3/badge/icon)](https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7-hive-2.3) [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark) [![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site) @@ -29,7 +29,6 @@ To build Spark and its example programs, run: (You do not need to do this if you downloaded a pre-built package.) -You can build Spark using more than one thread by using the -T option with Maven, see ["Parallel builds in Maven 3"](https://cwiki.apache.org/confluence/display/MAVEN/Parallel+builds+in+Maven+3). More detailed documentation is available from the project site, at ["Building Spark"](https://spark.apache.org/docs/latest/building-spark.html). diff --git a/appveyor.yml b/appveyor.yml index a61436c5d2e68..5d98260265b1a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -42,13 +42,9 @@ install: # Install maven and dependencies - ps: .\dev\appveyor-install-dependencies.ps1 # Required package for R unit tests - - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'e1071', 'survival'), repos='https://cloud.r-project.org/')" - # Here, we use the fixed version of testthat. For more details, please see SPARK-22817. - # As of devtools 2.1.0, it requires testthat higher then 2.1.1 as a dependency. SparkR test requires testthat 1.0.2. 
- # Therefore, we don't use devtools but installs it directly from the archive including its dependencies. - - cmd: R -e "install.packages(c('crayon', 'praise', 'R6'), repos='https://cloud.r-project.org/')" - - cmd: R -e "install.packages('https://cloud.r-project.org/src/contrib/Archive/testthat/testthat_1.0.2.tar.gz', repos=NULL, type='source')" - - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival')" + - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'e1071', 'survival', 'arrow'), repos='https://cloud.r-project.org/')" + - cmd: R -e "install.packages(c('crayon', 'praise', 'R6', 'testthat'), repos='https://cloud.r-project.org/')" + - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival'); packageVersion('arrow')" build_script: # '-Djna.nosys=true' is required to avoid kernel32.dll load failure. diff --git a/assembly/pom.xml b/assembly/pom.xml index ef916fb99a04c..193ad3d671bcf 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -64,11 +64,6 @@ spark-graphx_${scala.binary.version} ${project.version} - - org.apache.spark - spark-graph_${scala.binary.version} - ${project.version} - org.apache.spark spark-sql_${scala.binary.version} @@ -122,7 +117,7 @@ - + diff --git a/build/mvn b/build/mvn index f68377b3ddc71..3628be9880253 100755 --- a/build/mvn +++ b/build/mvn @@ -22,7 +22,7 @@ _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # Preserve the calling directory _CALLING_DIR="$(pwd)" # Options used during compilation -_COMPILE_JVM_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=512m" +_COMPILE_JVM_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g" # Installs any application tarball given a URL, the expected tarball name, # and, optionally, a checkable binary path to determine if the binary has diff --git a/build/sbt b/build/sbt index 7d8d0993e57d8..475dfd3b20b43 100755 --- a/build/sbt +++ 
b/build/sbt @@ -66,7 +66,7 @@ Usage: $script_name [options] -sbt-dir path to global settings/plugins directory (default: ~/.sbt) -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) -ivy path to local Ivy repository (default: ~/.ivy2) - -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) + -mem set memory options (default: $sbt_default_mem, which is $(get_mem_opts $sbt_default_mem)) -no-share use all local caches; no sharing -no-global uses global caches, but does not use global ~/.sbt directory. -jvm-debug Turn on JVM debugging, open at the given port. diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index 0ed6f8b6d737b..162bfbf2257c7 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -17,6 +17,7 @@ declare -a java_args declare -a scalac_args declare -a sbt_commands declare -a maven_profiles +declare sbt_default_mem=2048 if test -x "$JAVA_HOME/bin/java"; then echo -e "Using $JAVA_HOME as default JAVA_HOME." @@ -111,11 +112,10 @@ addDebugger () { # a ham-fisted attempt to move some memory settings in concert # so they need not be dicked around with individually. 
get_mem_opts () { - local mem=${1:-2048} - local perm=$(( $mem / 4 )) - (( $perm > 256 )) || perm=256 - (( $perm < 4096 )) || perm=4096 - local codecache=$(( $perm / 2 )) + local mem=${1:-$sbt_default_mem} + local codecache=$(( $mem / 8 )) + (( $codecache > 128 )) || codecache=128 + (( $codecache < 2048 )) || codecache=2048 echo "-Xms${mem}m -Xmx${mem}m -XX:ReservedCodeCacheSize=${codecache}m" } diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index f042a12fda3d2..a1c8a8e6582eb 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -45,7 +45,7 @@ guava - org.fusesource.leveldbjni + ${leveldbjni.group} leveldbjni-all diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java index 6af45aec3c7b2..b33c53871c32f 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java @@ -252,7 +252,7 @@ private static Predicate getPredicate( return (value) -> set.contains(indexValueForEntity(getter, value)); } else { - HashSet set = new HashSet<>(values.size()); + HashSet> set = new HashSet<>(values.size()); for (Object key : values) { set.add(asKey(key)); } diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java index b8c5fab8709ed..d2a26982d8703 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java @@ -124,7 +124,7 @@ interface Accessor { Object get(Object instance) throws ReflectiveOperationException; - Class getType(); + Class getType(); } private class FieldAccessor implements Accessor { @@ -141,7 +141,7 @@ public Object get(Object instance) throws ReflectiveOperationException { } @Override - public Class 
getType() { + public Class getType() { return field.getType(); } } @@ -160,7 +160,7 @@ public Object get(Object instance) throws ReflectiveOperationException { } @Override - public Class getType() { + public Class getType() { return method.getReturnType(); } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/ArrayKeyIndexType.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/ArrayKeyIndexType.java index 32030fb4115c3..dd53fdf0b1b4c 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/ArrayKeyIndexType.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/ArrayKeyIndexType.java @@ -38,7 +38,7 @@ public boolean equals(Object o) { @Override public int hashCode() { - return key.hashCode(); + return Arrays.hashCode(key) ^ Arrays.hashCode(id); } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType1.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType1.java index 92b643b0cb928..ebb5c2c5ed55c 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType1.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/CustomType1.java @@ -17,8 +17,6 @@ package org.apache.spark.util.kvstore; -import com.google.common.base.Objects; - public class CustomType1 { @KVIndex @@ -52,12 +50,7 @@ public int hashCode() { @Override public String toString() { - return Objects.toStringHelper(this) - .add("key", key) - .add("id", id) - .add("name", name) - .add("num", num) - .toString(); + return "CustomType1[key=" + key + ",id=" + id + ",name=" + name + ",num=" + num; } } diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index c107af9ceb415..163c250054e4d 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -35,6 +35,12 @@ + + + org.scala-lang + scala-library + + io.netty @@ -46,7 +52,7 @@ - org.fusesource.leveldbjni + ${leveldbjni.group} leveldbjni-all 1.8 @@ -87,13 +93,6 @@ - - - 
org.scala-lang - scala-library - ${scala.version} - test - log4j log4j diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java index 45fee541a4f5d..66566b67870f3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java @@ -26,9 +26,10 @@ import java.nio.channels.FileChannel; import java.nio.file.StandardOpenOption; -import com.google.common.base.Objects; import com.google.common.io.ByteStreams; import io.netty.channel.DefaultFileRegion; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.util.JavaUtils; import org.apache.spark.network.util.LimitedInputStream; @@ -144,10 +145,10 @@ public Object convertToNetty() throws IOException { @Override public String toString() { - return Objects.toStringHelper(this) - .add("file", file) - .add("offset", offset) - .add("length", length) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("file", file) + .append("offset", offset) + .append("length", length) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java index acc49d968c186..b42977c7cb7f6 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/NettyManagedBuffer.java @@ -21,9 +21,10 @@ import java.io.InputStream; import java.nio.ByteBuffer; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; import io.netty.buffer.ByteBufInputStream; +import 
org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * A {@link ManagedBuffer} backed by a Netty {@link ByteBuf}. @@ -69,8 +70,8 @@ public Object convertToNetty() throws IOException { @Override public String toString() { - return Objects.toStringHelper(this) - .add("buf", buf) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("buf", buf) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/NioManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/NioManagedBuffer.java index 631d767715256..084f89d2611cf 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/NioManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/NioManagedBuffer.java @@ -21,9 +21,10 @@ import java.io.InputStream; import java.nio.ByteBuffer; -import com.google.common.base.Objects; import io.netty.buffer.ByteBufInputStream; import io.netty.buffer.Unpooled; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * A {@link ManagedBuffer} backed by {@link ByteBuffer}. 
@@ -67,8 +68,8 @@ public Object convertToNetty() throws IOException { @Override public String toString() { - return Objects.toStringHelper(this) - .add("buf", buf) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("buf", buf) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java index b018197deaf2e..6dcc703e92669 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClient.java @@ -27,13 +27,14 @@ import javax.annotation.Nullable; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Objects; import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import com.google.common.util.concurrent.SettableFuture; import io.netty.channel.Channel; import io.netty.util.concurrent.Future; import io.netty.util.concurrent.GenericFutureListener; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -301,10 +302,10 @@ public void close() { @Override public String toString() { - return Objects.toStringHelper(this) - .add("remoteAdress", channel.remoteAddress()) - .add("clientId", clientId) - .add("isActive", isActive()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("remoteAdress", channel.remoteAddress()) + .append("clientId", clientId) + .append("isActive", isActive()) .toString(); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java index 53835d8304866..c9ef9f918ffd1 100644 --- 
a/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java +++ b/common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java @@ -293,9 +293,8 @@ public void close() { } connectionPool.clear(); - if (workerGroup != null) { + if (workerGroup != null && !workerGroup.isShuttingDown()) { workerGroup.shutdownGracefully(); - workerGroup = null; } } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java index 77b167d15e911..4428f0f295d6e 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthClientBootstrap.java @@ -78,6 +78,7 @@ public void doBootstrap(TransportClient client, Channel channel) { try { doSparkAuth(client, channel); + client.setClientId(appId); } catch (GeneralSecurityException | IOException e) { throw Throwables.propagate(e); } catch (RuntimeException e) { diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java index fb44dbbb0953b..821cc7a849504 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/AuthRpcHandler.java @@ -125,6 +125,7 @@ public void receive(TransportClient client, ByteBuffer message, RpcResponseCallb response.encode(responseData); callback.onSuccess(responseData.nioBuffer()); engine.sessionCipher().addToChannel(channel); + client.setClientId(challenge.appId); } catch (Exception e) { // This is a fatal error: authentication has failed. Close the channel explicitly. 
LOG.debug("Authentication failed for client {}, closing channel.", channel.remoteAddress()); diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java index 8995bbc940f63..36ca73f6ac0f0 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java @@ -90,7 +90,8 @@ CryptoOutputStream createOutputStream(WritableByteChannel ch) throws IOException return new CryptoOutputStream(cipher, conf, ch, key, new IvParameterSpec(outIv)); } - private CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException { + @VisibleForTesting + CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException { return new CryptoInputStream(cipher, conf, ch, key, new IvParameterSpec(inIv)); } @@ -166,34 +167,45 @@ private static class DecryptionHandler extends ChannelInboundHandlerAdapter { @Override public void channelRead(ChannelHandlerContext ctx, Object data) throws Exception { - if (!isCipherValid) { - throw new IOException("Cipher is in invalid state."); - } - byteChannel.feedData((ByteBuf) data); - - byte[] decryptedData = new byte[byteChannel.readableBytes()]; - int offset = 0; - while (offset < decryptedData.length) { - // SPARK-25535: workaround for CRYPTO-141. - try { - offset += cis.read(decryptedData, offset, decryptedData.length - offset); - } catch (InternalError ie) { - isCipherValid = false; - throw ie; + ByteBuf buffer = (ByteBuf) data; + + try { + if (!isCipherValid) { + throw new IOException("Cipher is in invalid state."); + } + byte[] decryptedData = new byte[buffer.readableBytes()]; + byteChannel.feedData(buffer); + + int offset = 0; + while (offset < decryptedData.length) { + // SPARK-25535: workaround for CRYPTO-141. 
+ try { + offset += cis.read(decryptedData, offset, decryptedData.length - offset); + } catch (InternalError ie) { + isCipherValid = false; + throw ie; + } } - } - ctx.fireChannelRead(Unpooled.wrappedBuffer(decryptedData, 0, decryptedData.length)); + ctx.fireChannelRead(Unpooled.wrappedBuffer(decryptedData, 0, decryptedData.length)); + } finally { + buffer.release(); + } } @Override - public void channelInactive(ChannelHandlerContext ctx) throws Exception { + public void handlerRemoved(ChannelHandlerContext ctx) throws Exception { + // We do the closing of the stream / channel in handlerRemoved(...) as + // this method will be called in all cases: + // + // - when the Channel becomes inactive + // - when the handler is removed from the ChannelPipeline try { if (isCipherValid) { cis.close(); } } finally { - super.channelInactive(ctx); + super.handlerRemoved(ctx); } } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java index a7afbfa8621c8..0f1781cbf1f2c 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchFailure.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Response to {@link ChunkFetchRequest} when there is an error fetching the chunk. 
@@ -54,7 +57,7 @@ public static ChunkFetchFailure decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(streamChunkId, errorString); + return Objects.hash(streamChunkId, errorString); } @Override @@ -68,9 +71,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamChunkId", streamChunkId) - .add("errorString", errorString) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamChunkId", streamChunkId) + .append("errorString", errorString) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java index fe54fcc50dc86..7b034d5c2f595 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchRequest.java @@ -17,8 +17,9 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Request to fetch a sequence of a single chunk of a stream. 
This will correspond to a single @@ -64,8 +65,8 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamChunkId", streamChunkId) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamChunkId", streamChunkId) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java index d5c9a9b3202fb..eaad143fc3f5f 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/ChunkFetchSuccess.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NettyManagedBuffer; @@ -67,7 +70,7 @@ public static ChunkFetchSuccess decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(streamChunkId, body()); + return Objects.hash(streamChunkId, body()); } @Override @@ -81,9 +84,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamChunkId", streamChunkId) - .add("buffer", body()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamChunkId", streamChunkId) + .append("buffer", body()) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java index 736059fdd1f57..490915f6de4b3 100644 --- 
a/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/Encoders.java @@ -112,4 +112,27 @@ public static int[] decode(ByteBuf buf) { return ints; } } + + /** Long integer arrays are encoded with their length followed by long integers. */ + public static class LongArrays { + public static int encodedLength(long[] longs) { + return 4 + 8 * longs.length; + } + + public static void encode(ByteBuf buf, long[] longs) { + buf.writeInt(longs.length); + for (long i : longs) { + buf.writeLong(i); + } + } + + public static long[] decode(ByteBuf buf) { + int numLongs = buf.readInt(); + long[] longs = new long[numLongs]; + for (int i = 0; i < longs.length; i ++) { + longs[i] = buf.readLong(); + } + return longs; + } + } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/OneWayMessage.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/OneWayMessage.java index 1632fb9e03687..719f6c64c5dee 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/OneWayMessage.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/OneWayMessage.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NettyManagedBuffer; @@ -72,8 +75,8 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("body", body()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("body", body()) .toString(); } } diff --git 
a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java index 61061903de23f..6e4f5687d16cd 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcFailure.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** Response to {@link RpcRequest} for a failed RPC. */ public final class RpcFailure extends AbstractMessage implements ResponseMessage { @@ -52,7 +55,7 @@ public static RpcFailure decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(requestId, errorString); + return Objects.hash(requestId, errorString); } @Override @@ -66,9 +69,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("requestId", requestId) - .add("errorString", errorString) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("requestId", requestId) + .append("errorString", errorString) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java index cc1bb95d2d566..f2609ce2dbdb3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcRequest.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import 
org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NettyManagedBuffer; @@ -64,7 +67,7 @@ public static RpcRequest decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(requestId, body()); + return Objects.hash(requestId, body()); } @Override @@ -78,9 +81,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("requestId", requestId) - .add("body", body()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("requestId", requestId) + .append("body", body()) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java index c03291e9c0b23..51b36ea183362 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/RpcResponse.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NettyManagedBuffer; @@ -64,7 +67,7 @@ public static RpcResponse decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(requestId, body()); + return Objects.hash(requestId, body()); } @Override @@ -78,9 +81,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("requestId", requestId) - .add("body", body()) + return new ToStringBuilder(this, 
ToStringStyle.SHORT_PREFIX_STYLE) + .append("requestId", requestId) + .append("body", body()) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamChunkId.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamChunkId.java index d46a263884807..75c6d630b9c33 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamChunkId.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamChunkId.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Encapsulates a request for a particular chunk of a stream. @@ -51,7 +54,7 @@ public static StreamChunkId decode(ByteBuf buffer) { @Override public int hashCode() { - return Objects.hashCode(streamId, chunkIndex); + return Objects.hash(streamId, chunkIndex); } @Override @@ -65,9 +68,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamId", streamId) - .add("chunkIndex", chunkIndex) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamId", streamId) + .append("chunkIndex", chunkIndex) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamFailure.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamFailure.java index 68fcfa7748611..06836f5eea390 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamFailure.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamFailure.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import 
io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Message indicating an error when transferring a stream. @@ -54,7 +57,7 @@ public static StreamFailure decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(streamId, error); + return Objects.hash(streamId, error); } @Override @@ -68,9 +71,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamId", streamId) - .add("error", error) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamId", streamId) + .append("error", error) .toString(); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamRequest.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamRequest.java index 1b135af752bd8..3d035e5c94f23 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamRequest.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamRequest.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; /** * Request to stream data from the remote end. 
@@ -67,8 +70,8 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamId", streamId) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamId", streamId) .toString(); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java index 568108c4fe5e8..f30605ce836fc 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/StreamResponse.java @@ -17,8 +17,11 @@ package org.apache.spark.network.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; @@ -67,7 +70,7 @@ public static StreamResponse decode(ByteBuf buf) { @Override public int hashCode() { - return Objects.hashCode(byteCount, streamId); + return Objects.hash(byteCount, streamId); } @Override @@ -81,10 +84,10 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamId", streamId) - .add("byteCount", byteCount) - .add("body", body()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamId", streamId) + .append("byteCount", byteCount) + .append("body", body()) .toString(); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java index 7d21151e01074..fb50801a51ba3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java +++ 
b/common/network-common/src/main/java/org/apache/spark/network/protocol/UploadStream.java @@ -20,8 +20,9 @@ import java.io.IOException; import java.nio.ByteBuffer; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.buffer.ManagedBuffer; import org.apache.spark.network.buffer.NettyManagedBuffer; @@ -99,9 +100,9 @@ public boolean equals(Object other) { @Override public String toString() { - return Objects.toStringHelper(this) - .add("requestId", requestId) - .add("body", body()) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("requestId", requestId) + .append("body", body()) .toString(); } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java b/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java index 67f64d7962035..1a902a937a176 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/OneForOneStreamManager.java @@ -57,7 +57,7 @@ private static class StreamState { int curChunk = 0; // Used to keep track of the number of chunks being transferred and not finished yet. - volatile long chunksBeingTransferred = 0L; + final AtomicLong chunksBeingTransferred = new AtomicLong(0L); StreamState(String appId, Iterator buffers, Channel channel) { this.appId = appId; @@ -117,21 +117,35 @@ public static Pair parseStreamChunkId(String streamChunkId) { @Override public void connectionTerminated(Channel channel) { + RuntimeException failedToReleaseBufferException = null; + // Close all streams which have been associated with the channel. 
for (Map.Entry entry: streams.entrySet()) { StreamState state = entry.getValue(); if (state.associatedChannel == channel) { streams.remove(entry.getKey()); - // Release all remaining buffers. - while (state.buffers.hasNext()) { - ManagedBuffer buffer = state.buffers.next(); - if (buffer != null) { - buffer.release(); + try { + // Release all remaining buffers. + while (state.buffers.hasNext()) { + ManagedBuffer buffer = state.buffers.next(); + if (buffer != null) { + buffer.release(); + } + } + } catch (RuntimeException e) { + if (failedToReleaseBufferException == null) { + failedToReleaseBufferException = e; + } else { + logger.error("Exception trying to release remaining StreamState buffers", e); } } } } + + if (failedToReleaseBufferException != null) { + throw failedToReleaseBufferException; + } } @Override @@ -153,7 +167,7 @@ public void checkAuthorization(TransportClient client, long streamId) { public void chunkBeingSent(long streamId) { StreamState streamState = streams.get(streamId); if (streamState != null) { - streamState.chunksBeingTransferred++; + streamState.chunksBeingTransferred.incrementAndGet(); } } @@ -167,7 +181,7 @@ public void streamBeingSent(String streamId) { public void chunkSent(long streamId) { StreamState streamState = streams.get(streamId); if (streamState != null) { - streamState.chunksBeingTransferred--; + streamState.chunksBeingTransferred.decrementAndGet(); } } @@ -180,7 +194,7 @@ public void streamSent(String streamId) { public long chunksBeingTransferred() { long sum = 0L; for (StreamState streamState: streams.values()) { - sum += streamState.chunksBeingTransferred; + sum += streamState.chunksBeingTransferred.get(); } return sum; } diff --git a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java index 8396e691e9db1..f0ff9f57e7be5 100644 --- 
a/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/server/TransportServer.java @@ -100,9 +100,10 @@ public int getPort() { private void init(String hostToBind, int portToBind) { IOMode ioMode = IOMode.valueOf(conf.ioMode()); - EventLoopGroup bossGroup = - NettyUtils.createEventLoop(ioMode, conf.serverThreads(), conf.getModuleName() + "-server"); - EventLoopGroup workerGroup = bossGroup; + EventLoopGroup bossGroup = NettyUtils.createEventLoop(ioMode, 1, + conf.getModuleName() + "-boss"); + EventLoopGroup workerGroup = NettyUtils.createEventLoop(ioMode, conf.serverThreads(), + conf.getModuleName() + "-server"); bootstrap = new ServerBootstrap() .group(bossGroup, workerGroup) diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/ByteArrayReadableChannel.java b/common/network-common/src/main/java/org/apache/spark/network/util/ByteArrayReadableChannel.java index 25d103d0e316f..fe461d0b39862 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/ByteArrayReadableChannel.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/ByteArrayReadableChannel.java @@ -19,23 +19,27 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.channels.ClosedChannelException; import java.nio.channels.ReadableByteChannel; import io.netty.buffer.ByteBuf; public class ByteArrayReadableChannel implements ReadableByteChannel { private ByteBuf data; + private boolean closed; - public int readableBytes() { - return data.readableBytes(); - } - - public void feedData(ByteBuf buf) { + public void feedData(ByteBuf buf) throws ClosedChannelException { + if (closed) { + throw new ClosedChannelException(); + } data = buf; } @Override public int read(ByteBuffer dst) throws IOException { + if (closed) { + throw new ClosedChannelException(); + } int totalRead = 0; while (data.readableBytes() > 0 && 
dst.remaining() > 0) { int bytesToRead = Math.min(data.readableBytes(), dst.remaining()); @@ -43,20 +47,16 @@ public int read(ByteBuffer dst) throws IOException { totalRead += bytesToRead; } - if (data.readableBytes() == 0) { - data.release(); - } - return totalRead; } @Override - public void close() throws IOException { + public void close() { + closed = true; } @Override public boolean isOpen() { - return true; + return !closed; } - } diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java index 589dfcbefb6ea..cc0f2919568ac 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -108,8 +108,12 @@ public int numConnectionsPerPeer() { return conf.getInt(SPARK_NETWORK_IO_NUMCONNECTIONSPERPEER_KEY, 1); } - /** Requested maximum length of the queue of incoming connections. Default is 64. */ - public int backLog() { return conf.getInt(SPARK_NETWORK_IO_BACKLOG_KEY, 64); } + /** + * Requested maximum length of the queue of incoming connections. If < 1, + * the default Netty value of {@link io.netty.util.NetUtil#SOMAXCONN} will be used. + * Default to -1. + */ + public int backLog() { return conf.getInt(SPARK_NETWORK_IO_BACKLOG_KEY, -1); } /** Number of threads used in the server thread pool. Default to 0, which is 2x#cores. 
*/ public int serverThreads() { return conf.getInt(SPARK_NETWORK_IO_SERVERTHREADS_KEY, 0); } diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java index 1980361a15523..cef0e415aa40a 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java @@ -184,8 +184,12 @@ private ByteBuf decodeNext() { return null; } - // Reset buf and size for next frame. + return consumeCurrentFrameBuf(); + } + + private ByteBuf consumeCurrentFrameBuf() { ByteBuf frame = frameBuf; + // Reset buf and size for next frame. frameBuf = null; consolidatedFrameBufSize = 0; consolidatedNumComponents = 0; @@ -215,13 +219,9 @@ private ByteBuf nextBufferForFrame(int bytesToRead) { @Override public void channelInactive(ChannelHandlerContext ctx) throws Exception { - for (ByteBuf b : buffers) { - b.release(); - } if (interceptor != null) { interceptor.channelInactive(); } - frameLenBuf.release(); super.channelInactive(ctx); } @@ -233,6 +233,24 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws E super.exceptionCaught(ctx, cause); } + @Override + public void handlerRemoved(ChannelHandlerContext ctx) throws Exception { + // Release all buffers that are still in our ownership. + // Doing this in handlerRemoved(...) 
guarantees that this will happen in all cases: + // - When the Channel becomes inactive + // - When the decoder is removed from the ChannelPipeline + for (ByteBuf b : buffers) { + b.release(); + } + buffers.clear(); + frameLenBuf.release(); + ByteBuf frame = consumeCurrentFrameBuf(); + if (frame != null) { + frame.release(); + } + super.handlerRemoved(ctx); + } + public void setInterceptor(Interceptor interceptor) { Preconditions.checkState(this.interceptor == null, "Already have an interceptor."); this.interceptor = interceptor; diff --git a/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java b/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java index 498dc51cdc81a..916c140621671 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/RpcIntegrationSuite.java @@ -260,14 +260,14 @@ public void onFailure(Throwable e) { @Test public void singleRPC() throws Exception { RpcResult res = sendRPC("hello/Aaron"); - assertEquals(res.successMessages, Sets.newHashSet("Hello, Aaron!")); + assertEquals(Sets.newHashSet("Hello, Aaron!"), res.successMessages); assertTrue(res.errorMessages.isEmpty()); } @Test public void doubleRPC() throws Exception { RpcResult res = sendRPC("hello/Aaron", "hello/Reynold"); - assertEquals(res.successMessages, Sets.newHashSet("Hello, Aaron!", "Hello, Reynold!")); + assertEquals(Sets.newHashSet("Hello, Aaron!", "Hello, Reynold!"), res.successMessages); assertTrue(res.errorMessages.isEmpty()); } @@ -295,7 +295,7 @@ public void doubleTrouble() throws Exception { @Test public void sendSuccessAndFailure() throws Exception { RpcResult res = sendRPC("hello/Bob", "throw error/the", "hello/Builder", "return error/!"); - assertEquals(res.successMessages, Sets.newHashSet("Hello, Bob!", "Hello, Builder!")); + assertEquals(Sets.newHashSet("Hello, Bob!", "Hello, Builder!"), 
res.successMessages); assertErrorsContain(res.errorMessages, Sets.newHashSet("Thrown: the", "Returned: !")); } diff --git a/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java b/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java index 2aec4a33bbe43..9b76981c31c57 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/TransportClientFactorySuite.java @@ -217,4 +217,11 @@ public Iterable> getAll() { assertFalse(c1.isActive()); } } + + @Test(expected = IOException.class) + public void closeFactoryBeforeCreateClient() throws IOException, InterruptedException { + TransportClientFactory factory = context.createClientFactory(); + factory.close(); + factory.createClient(TestUtils.getLocalHost(), server1.getPort()); + } } diff --git a/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java new file mode 100644 index 0000000000000..6b2186f73cd0c --- /dev/null +++ b/common/network-common/src/test/java/org/apache/spark/network/crypto/TransportCipherSuite.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.network.crypto; + +import javax.crypto.spec.SecretKeySpec; +import java.io.IOException; +import java.nio.channels.ReadableByteChannel; +import java.nio.channels.WritableByteChannel; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.Unpooled; +import io.netty.channel.embedded.EmbeddedChannel; +import org.apache.commons.crypto.stream.CryptoInputStream; +import org.apache.commons.crypto.stream.CryptoOutputStream; +import org.apache.spark.network.util.MapConfigProvider; +import org.apache.spark.network.util.TransportConf; +import org.hamcrest.CoreMatchers; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class TransportCipherSuite { + + @Test + public void testBufferNotLeaksOnInternalError() throws IOException { + String algorithm = "TestAlgorithm"; + TransportConf conf = new TransportConf("Test", MapConfigProvider.EMPTY); + TransportCipher cipher = new TransportCipher(conf.cryptoConf(), conf.cipherTransformation(), + new SecretKeySpec(new byte[256], algorithm), new byte[0], new byte[0]) { + + @Override + CryptoOutputStream createOutputStream(WritableByteChannel ch) { + return null; + } + + @Override + CryptoInputStream createInputStream(ReadableByteChannel ch) throws IOException 
{ + CryptoInputStream mockInputStream = mock(CryptoInputStream.class); + when(mockInputStream.read(any(byte[].class), anyInt(), anyInt())) + .thenThrow(new InternalError()); + return mockInputStream; + } + }; + + EmbeddedChannel channel = new EmbeddedChannel(); + cipher.addToChannel(channel); + + ByteBuf buffer = Unpooled.wrappedBuffer(new byte[] { 1, 2 }); + ByteBuf buffer2 = Unpooled.wrappedBuffer(new byte[] { 1, 2 }); + + try { + channel.writeInbound(buffer); + fail("Should have raised InternalError"); + } catch (InternalError expected) { + // expected + assertEquals(0, buffer.refCnt()); + } + + try { + channel.writeInbound(buffer2); + fail("Should have raised an exception"); + } catch (Throwable expected) { + assertThat(expected, CoreMatchers.instanceOf(IOException.class)); + assertEquals(0, buffer2.refCnt()); + } + + // Simulate closing the connection + assertFalse(channel.finish()); + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java index fb3503b783e54..45e1836da641f 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/server/OneForOneStreamManagerSuite.java @@ -18,6 +18,7 @@ package org.apache.spark.network.server; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import io.netty.channel.Channel; @@ -96,4 +97,42 @@ public void managedBuffersAreFreedWhenConnectionIsClosed() { Mockito.verify(buffer2, Mockito.times(1)).release(); Assert.assertEquals(0, manager.numStreamStates()); } + + @Test + public void streamStatesAreFreedWhenConnectionIsClosedEvenIfBufferIteratorThrowsException() { + OneForOneStreamManager manager = new OneForOneStreamManager(); + + Iterator buffers = Mockito.mock(Iterator.class); + 
Mockito.when(buffers.hasNext()).thenReturn(true); + Mockito.when(buffers.next()).thenThrow(RuntimeException.class); + + ManagedBuffer mockManagedBuffer = Mockito.mock(ManagedBuffer.class); + + Iterator buffers2 = Mockito.mock(Iterator.class); + Mockito.when(buffers2.hasNext()).thenReturn(true).thenReturn(true); + Mockito.when(buffers2.next()).thenReturn(mockManagedBuffer).thenThrow(RuntimeException.class); + + Channel dummyChannel = Mockito.mock(Channel.class, Mockito.RETURNS_SMART_NULLS); + manager.registerStream("appId", buffers, dummyChannel); + manager.registerStream("appId", buffers2, dummyChannel); + + Assert.assertEquals(2, manager.numStreamStates()); + + try { + manager.connectionTerminated(dummyChannel); + Assert.fail("connectionTerminated should throw exception when fails to release all buffers"); + + } catch (RuntimeException e) { + + Mockito.verify(buffers, Mockito.times(1)).hasNext(); + Mockito.verify(buffers, Mockito.times(1)).next(); + + Mockito.verify(buffers2, Mockito.times(2)).hasNext(); + Mockito.verify(buffers2, Mockito.times(2)).next(); + + Mockito.verify(mockManagedBuffer, Mockito.times(1)).release(); + + Assert.assertEquals(0, manager.numStreamStates()); + } + } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java index 037e5cf7e5222..8c05288fb4111 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockHandler.java @@ -102,11 +102,15 @@ protected void handleMessage( FetchShuffleBlocks msg = (FetchShuffleBlocks) msgObj; checkAuth(client, msg.appId); numBlockIds = 0; - for (int[] ids: msg.reduceIds) { - numBlockIds += ids.length; + if (msg.batchFetchEnabled) { + numBlockIds = msg.mapIds.length; + } else { + for (int[] ids: msg.reduceIds) { + 
numBlockIds += ids.length; + } } streamId = streamManager.registerStream(client.getClientId(), - new ManagedBufferIterator(msg, numBlockIds), client.getChannel()); + new ShuffleManagedBufferIterator(msg), client.getChannel()); } else { // For the compatibility with the old version, still keep the support for OpenBlocks. OpenBlocks msg = (OpenBlocks) msgObj; @@ -146,6 +150,12 @@ protected void handleMessage( int numRemovedBlocks = blockManager.removeBlocks(msg.appId, msg.execId, msg.blockIds); callback.onSuccess(new BlocksRemoved(numRemovedBlocks).toByteBuffer()); + } else if (msgObj instanceof GetLocalDirsForExecutors) { + GetLocalDirsForExecutors msg = (GetLocalDirsForExecutors) msgObj; + checkAuth(client, msg.appId); + Map localDirs = blockManager.getLocalDirs(msg.appId, msg.execIds); + callback.onSuccess(new LocalDirsForExecutors(localDirs).toByteBuffer()); + } else { throw new UnsupportedOperationException("Unexpected message: " + msgObj); } @@ -299,21 +309,6 @@ private int[] shuffleMapIdAndReduceIds(String[] blockIds, int shuffleId) { return mapIdAndReduceIds; } - ManagedBufferIterator(FetchShuffleBlocks msg, int numBlockIds) { - final int[] mapIdAndReduceIds = new int[2 * numBlockIds]; - int idx = 0; - for (int i = 0; i < msg.mapIds.length; i++) { - for (int reduceId : msg.reduceIds[i]) { - mapIdAndReduceIds[idx++] = msg.mapIds[i]; - mapIdAndReduceIds[idx++] = reduceId; - } - } - assert(idx == 2 * numBlockIds); - size = mapIdAndReduceIds.length; - blockDataForIndexFn = index -> blockManager.getBlockData(msg.appId, msg.execId, - msg.shuffleId, mapIdAndReduceIds[index], mapIdAndReduceIds[index + 1]); - } - @Override public boolean hasNext() { return index < size; @@ -328,6 +323,59 @@ public ManagedBuffer next() { } } + private class ShuffleManagedBufferIterator implements Iterator { + + private int mapIdx = 0; + private int reduceIdx = 0; + + private final String appId; + private final String execId; + private final int shuffleId; + private final long[] mapIds; 
+ private final int[][] reduceIds; + private final boolean batchFetchEnabled; + + ShuffleManagedBufferIterator(FetchShuffleBlocks msg) { + appId = msg.appId; + execId = msg.execId; + shuffleId = msg.shuffleId; + mapIds = msg.mapIds; + reduceIds = msg.reduceIds; + batchFetchEnabled = msg.batchFetchEnabled; + } + + @Override + public boolean hasNext() { + // mapIds.length must equal to reduceIds.length, and the passed in FetchShuffleBlocks + // must have non-empty mapIds and reduceIds, see the checking logic in + // OneForOneBlockFetcher. + assert(mapIds.length != 0 && mapIds.length == reduceIds.length); + return mapIdx < mapIds.length && reduceIdx < reduceIds[mapIdx].length; + } + + @Override + public ManagedBuffer next() { + ManagedBuffer block; + if (!batchFetchEnabled) { + block = blockManager.getBlockData( + appId, execId, shuffleId, mapIds[mapIdx], reduceIds[mapIdx][reduceIdx]); + if (reduceIdx < reduceIds[mapIdx].length - 1) { + reduceIdx += 1; + } else { + reduceIdx = 0; + mapIdx += 1; + } + } else { + assert(reduceIds[mapIdx].length == 2); + block = blockManager.getContinuousBlocksData(appId, execId, shuffleId, mapIds[mapIdx], + reduceIds[mapIdx][0], reduceIds[mapIdx][1]); + mapIdx += 1; + } + metrics.blockTransferRateBytes.mark(block != null ? 
block.size() : 0); + return block; + } + } + @Override public void channelActive(TransportClient client) { metrics.activeConnections.inc(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java index b8e52c8621fb6..d6185f089d3c0 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalBlockStoreClient.java @@ -21,20 +21,21 @@ import java.nio.ByteBuffer; import java.util.Arrays; import java.util.List; +import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.Future; import com.codahale.metrics.MetricSet; import com.google.common.collect.Lists; import org.apache.spark.network.client.RpcResponseCallback; +import org.apache.spark.network.client.TransportClient; +import org.apache.spark.network.client.TransportClientBootstrap; +import org.apache.spark.network.client.TransportClientFactory; import org.apache.spark.network.shuffle.protocol.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.network.TransportContext; -import org.apache.spark.network.client.TransportClient; -import org.apache.spark.network.client.TransportClientBootstrap; -import org.apache.spark.network.client.TransportClientFactory; import org.apache.spark.network.crypto.AuthClientBootstrap; import org.apache.spark.network.sasl.SecretKeyHolder; import org.apache.spark.network.server.NoOpRpcHandler; @@ -53,7 +54,7 @@ public class ExternalBlockStoreClient extends BlockStoreClient { private final SecretKeyHolder secretKeyHolder; private final long registrationTimeoutMs; - protected TransportClientFactory clientFactory; + protected volatile TransportClientFactory clientFactory; protected String appId; /** @@ -102,9 +103,14 @@ public void 
fetchBlocks( try { RetryingBlockFetcher.BlockFetchStarter blockFetchStarter = (blockIds1, listener1) -> { - TransportClient client = clientFactory.createClient(host, port); - new OneForOneBlockFetcher(client, appId, execId, - blockIds1, listener1, conf, downloadFileManager).start(); + // Unless this client is closed. + if (clientFactory != null) { + TransportClient client = clientFactory.createClient(host, port); + new OneForOneBlockFetcher(client, appId, execId, + blockIds1, listener1, conf, downloadFileManager).start(); + } else { + logger.info("This clientFactory was closed. Skipping further block fetch retries."); + } }; int maxRetries = conf.maxIORetries(); @@ -177,7 +183,7 @@ public void onSuccess(ByteBuffer response) { @Override public void onFailure(Throwable e) { logger.warn("Error trying to remove RDD blocks " + Arrays.toString(blockIds) + - " via external shuffle service from executor: " + execId, e); + " via external shuffle service from executor: " + execId, e); numRemovedBlocksFuture.complete(0); client.close(); } @@ -185,6 +191,46 @@ public void onFailure(Throwable e) { return numRemovedBlocksFuture; } + public void getHostLocalDirs( + String host, + int port, + String[] execIds, + CompletableFuture> hostLocalDirsCompletable) { + checkInit(); + GetLocalDirsForExecutors getLocalDirsMessage = new GetLocalDirsForExecutors(appId, execIds); + try { + TransportClient client = clientFactory.createClient(host, port); + client.sendRpc(getLocalDirsMessage.toByteBuffer(), new RpcResponseCallback() { + @Override + public void onSuccess(ByteBuffer response) { + try { + BlockTransferMessage msgObj = BlockTransferMessage.Decoder.fromByteBuffer(response); + hostLocalDirsCompletable.complete( + ((LocalDirsForExecutors) msgObj).getLocalDirsByExec()); + } catch (Throwable t) { + logger.warn("Error trying to get the host local dirs for " + + Arrays.toString(getLocalDirsMessage.execIds) + " via external shuffle service", + t.getCause()); + 
hostLocalDirsCompletable.completeExceptionally(t); + } finally { + client.close(); + } + } + + @Override + public void onFailure(Throwable t) { + logger.warn("Error trying to get the host local dirs for " + + Arrays.toString(getLocalDirsMessage.execIds) + " via external shuffle service", + t.getCause()); + hostLocalDirsCompletable.completeExceptionally(t); + client.close(); + } + }); + } catch (IOException | InterruptedException e) { + hostLocalDirsCompletable.completeExceptionally(e); + } + } + @Override public void close() { checkInit(); diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java index 50f16fc700f12..ba1a17bf7e5ea 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolver.java @@ -25,12 +25,15 @@ import java.util.concurrent.Executor; import java.util.concurrent.Executors; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.apache.commons.lang3.tuple.Pair; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Objects; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; @@ -165,21 +168,34 @@ public void registerExecutor( } /** - * Obtains a FileSegmentManagedBuffer from (shuffleId, mapId, reduceId). We make assumptions - * about how the hash and sort based shuffles store their data. 
+ * Obtains a FileSegmentManagedBuffer from a single block (shuffleId, mapId, reduceId). */ public ManagedBuffer getBlockData( String appId, String execId, int shuffleId, - int mapId, + long mapId, int reduceId) { + return getContinuousBlocksData(appId, execId, shuffleId, mapId, reduceId, reduceId + 1); + } + + /** + * Obtains a FileSegmentManagedBuffer from (shuffleId, mapId, [startReduceId, endReduceId)). + * We make assumptions about how the hash and sort based shuffles store their data. + */ + public ManagedBuffer getContinuousBlocksData( + String appId, + String execId, + int shuffleId, + long mapId, + int startReduceId, + int endReduceId) { ExecutorShuffleInfo executor = executors.get(new AppExecId(appId, execId)); if (executor == null) { throw new RuntimeException( String.format("Executor is not registered (appId=%s, execId=%s)", appId, execId)); } - return getSortBasedShuffleBlockData(executor, shuffleId, mapId, reduceId); + return getSortBasedShuffleBlockData(executor, shuffleId, mapId, startReduceId, endReduceId); } public ManagedBuffer getRddBlockData( @@ -296,13 +312,14 @@ private void deleteNonShuffleServiceServedFiles(String[] dirs) { * and the block id format is from ShuffleDataBlockId and ShuffleIndexBlockId. 
*/ private ManagedBuffer getSortBasedShuffleBlockData( - ExecutorShuffleInfo executor, int shuffleId, int mapId, int reduceId) { + ExecutorShuffleInfo executor, int shuffleId, long mapId, int startReduceId, int endReduceId) { File indexFile = ExecutorDiskUtils.getFile(executor.localDirs, executor.subDirsPerLocalDir, "shuffle_" + shuffleId + "_" + mapId + "_0.index"); try { ShuffleIndexInformation shuffleIndexInformation = shuffleIndexCache.get(indexFile); - ShuffleIndexRecord shuffleIndexRecord = shuffleIndexInformation.getIndex(reduceId); + ShuffleIndexRecord shuffleIndexRecord = shuffleIndexInformation.getIndex( + startReduceId, endReduceId); return new FileSegmentManagedBuffer( conf, ExecutorDiskUtils.getFile(executor.localDirs, executor.subDirsPerLocalDir, @@ -355,6 +372,19 @@ public int removeBlocks(String appId, String execId, String[] blockIds) { return numRemovedBlocks; } + public Map getLocalDirs(String appId, String[] execIds) { + return Arrays.stream(execIds) + .map(exec -> { + ExecutorShuffleInfo info = executors.get(new AppExecId(appId, exec)); + if (info == null) { + throw new RuntimeException( + String.format("Executor is not registered (appId=%s, execId=%s)", appId, exec)); + } + return Pair.of(exec, info.localDirs); + }) + .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); + } + /** Simply encodes an executor's full ID, which is appId + execId. 
*/ public static class AppExecId { public final String appId; @@ -372,19 +402,19 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; AppExecId appExecId = (AppExecId) o; - return Objects.equal(appId, appExecId.appId) && Objects.equal(execId, appExecId.execId); + return Objects.equals(appId, appExecId.appId) && Objects.equals(execId, appExecId.execId); } @Override public int hashCode() { - return Objects.hashCode(appId, execId); + return Objects.hash(appId, execId); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) .toString(); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index cc11e92067375..ec2e3dce661d9 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -24,6 +24,7 @@ import java.util.HashMap; import com.google.common.primitives.Ints; +import com.google.common.primitives.Longs; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -111,40 +112,49 @@ private boolean isShuffleBlocks(String[] blockIds) { */ private FetchShuffleBlocks createFetchShuffleBlocksMsg( String appId, String execId, String[] blockIds) { - int shuffleId = splitBlockId(blockIds[0])[0]; - HashMap> mapIdToReduceIds = new HashMap<>(); + String[] firstBlock = splitBlockId(blockIds[0]); + int shuffleId = Integer.parseInt(firstBlock[1]); + boolean batchFetchEnabled = firstBlock.length == 5; + + HashMap> mapIdToReduceIds = new HashMap<>(); for (String blockId : blockIds) { - int[] blockIdParts = splitBlockId(blockId); - if 
(blockIdParts[0] != shuffleId) { + String[] blockIdParts = splitBlockId(blockId); + if (Integer.parseInt(blockIdParts[1]) != shuffleId) { throw new IllegalArgumentException("Expected shuffleId=" + shuffleId + ", got:" + blockId); } - int mapId = blockIdParts[1]; + long mapId = Long.parseLong(blockIdParts[2]); + if (!mapIdToReduceIds.containsKey(mapId)) { + mapIdToReduceIds.put(mapId, new ArrayList<>()); + } - mapIdToReduceIds.get(mapId).add(blockIdParts[2]); + mapIdToReduceIds.get(mapId).add(Integer.parseInt(blockIdParts[3])); + if (batchFetchEnabled) { + // When we read continuous shuffle blocks in batch, we will reuse reduceIds in + // FetchShuffleBlocks to store the start and end reduce id for range + // [startReduceId, endReduceId). + assert(blockIdParts.length == 5); + mapIdToReduceIds.get(mapId).add(Integer.parseInt(blockIdParts[4])); + } } - int[] mapIds = Ints.toArray(mapIdToReduceIds.keySet()); + long[] mapIds = Longs.toArray(mapIdToReduceIds.keySet()); int[][] reduceIdArr = new int[mapIds.length][]; for (int i = 0; i < mapIds.length; i++) { reduceIdArr[i] = Ints.toArray(mapIdToReduceIds.get(mapIds[i])); } - return new FetchShuffleBlocks(appId, execId, shuffleId, mapIds, reduceIdArr); + return new FetchShuffleBlocks( + appId, execId, shuffleId, mapIds, reduceIdArr, batchFetchEnabled); } - /** Split the shuffleBlockId and return shuffleId, mapId and reduceId. */ - private int[] splitBlockId(String blockId) { + /** Split the shuffleBlockId and return shuffleId, mapId and reduceIds. */ + private String[] splitBlockId(String blockId) { String[] blockIdParts = blockId.split("_"); - if (blockIdParts.length != 4 || !blockIdParts[0].equals("shuffle")) { + // For batch block id, the format contains shuffleId, mapId, begin reduceId, end reduceId. + // For single block id, the format contains shuffleId, mapId, reduceId. 
+ if (blockIdParts.length < 4 || blockIdParts.length > 5 || !blockIdParts[0].equals("shuffle")) { throw new IllegalArgumentException( "Unexpected shuffle block id format: " + blockId); } - return new int[] { - Integer.parseInt(blockIdParts[1]), - Integer.parseInt(blockIdParts[2]), - Integer.parseInt(blockIdParts[3]) - }; + return blockIdParts; } /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. */ diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java index 371149bef3974..b65aacfcc4b9e 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java @@ -54,8 +54,15 @@ public int getSize() { * Get index offset for a particular reducer. */ public ShuffleIndexRecord getIndex(int reduceId) { - long offset = offsets.get(reduceId); - long nextOffset = offsets.get(reduceId + 1); + return getIndex(reduceId, reduceId + 1); + } + + /** + * Get index offset for the reducer range of [startReduceId, endReduceId). 
+ */ + public ShuffleIndexRecord getIndex(int startReduceId, int endReduceId) { + long offset = offsets.get(startReduceId); + long nextOffset = offsets.get(endReduceId); return new ShuffleIndexRecord(offset, nextOffset - offset); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlockTransferMessage.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlockTransferMessage.java index 41dd55847ebdb..89d8dfe8716b8 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlockTransferMessage.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlockTransferMessage.java @@ -47,7 +47,7 @@ public abstract class BlockTransferMessage implements Encodable { public enum Type { OPEN_BLOCKS(0), UPLOAD_BLOCK(1), REGISTER_EXECUTOR(2), STREAM_HANDLE(3), REGISTER_DRIVER(4), HEARTBEAT(5), UPLOAD_BLOCK_STREAM(6), REMOVE_BLOCKS(7), BLOCKS_REMOVED(8), - FETCH_SHUFFLE_BLOCKS(9); + FETCH_SHUFFLE_BLOCKS(9), GET_LOCAL_DIRS_FOR_EXECUTORS(10), LOCAL_DIRS_FOR_EXECUTORS(11); private final byte id; @@ -76,6 +76,8 @@ public static BlockTransferMessage fromByteBuffer(ByteBuffer msg) { case 7: return RemoveBlocks.decode(buf); case 8: return BlocksRemoved.decode(buf); case 9: return FetchShuffleBlocks.decode(buf); + case 10: return GetLocalDirsForExecutors.decode(buf); + case 11: return LocalDirsForExecutors.decode(buf); default: throw new IllegalArgumentException("Unknown message type: " + type); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlocksRemoved.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlocksRemoved.java index 3f04443871b68..a4d6035df807c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlocksRemoved.java +++ 
b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/BlocksRemoved.java @@ -17,8 +17,11 @@ package org.apache.spark.network.shuffle.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; // Needed by ScalaDoc. See SPARK-7726 import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; @@ -41,8 +44,8 @@ public int hashCode() { @Override public String toString() { - return Objects.toStringHelper(this) - .add("numRemovedBlocks", numRemovedBlocks) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("numRemovedBlocks", numRemovedBlocks) .toString(); } @@ -50,7 +53,7 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof BlocksRemoved) { BlocksRemoved o = (BlocksRemoved) other; - return Objects.equal(numRemovedBlocks, o.numRemovedBlocks); + return numRemovedBlocks == o.numRemovedBlocks; } return false; } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java index 93758bdc58fb0..b4e7bc409d3b8 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/ExecutorShuffleInfo.java @@ -18,11 +18,13 @@ package org.apache.spark.network.shuffle.protocol; import java.util.Arrays; +import java.util.Objects; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import 
org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encodable; import org.apache.spark.network.protocol.Encoders; @@ -48,15 +50,15 @@ public ExecutorShuffleInfo( @Override public int hashCode() { - return Objects.hashCode(subDirsPerLocalDir, shuffleManager) * 41 + Arrays.hashCode(localDirs); + return Objects.hash(subDirsPerLocalDir, shuffleManager) * 41 + Arrays.hashCode(localDirs); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("localDirs", Arrays.toString(localDirs)) - .add("subDirsPerLocalDir", subDirsPerLocalDir) - .add("shuffleManager", shuffleManager) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("localDirs", Arrays.toString(localDirs)) + .append("subDirsPerLocalDir", subDirsPerLocalDir) + .append("shuffleManager", shuffleManager) .toString(); } @@ -65,8 +67,8 @@ public boolean equals(Object other) { if (other != null && other instanceof ExecutorShuffleInfo) { ExecutorShuffleInfo o = (ExecutorShuffleInfo) other; return Arrays.equals(localDirs, o.localDirs) - && Objects.equal(subDirsPerLocalDir, o.subDirsPerLocalDir) - && Objects.equal(shuffleManager, o.shuffleManager); + && subDirsPerLocalDir == o.subDirsPerLocalDir + && Objects.equals(shuffleManager, o.shuffleManager); } return false; } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/FetchShuffleBlocks.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/FetchShuffleBlocks.java index 466eeb3e048a8..98057d58f7ab5 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/FetchShuffleBlocks.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/FetchShuffleBlocks.java @@ -19,8 +19,9 @@ import java.util.Arrays; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import 
org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encoders; @@ -34,21 +35,33 @@ public class FetchShuffleBlocks extends BlockTransferMessage { public final int shuffleId; // The length of mapIds must equal to reduceIds.size(), for the i-th mapId in mapIds, // it corresponds to the i-th int[] in reduceIds, which contains all reduce id for this map id. - public final int[] mapIds; + public final long[] mapIds; + // When batchFetchEnabled=true, reduceIds[i] contains 2 elements: startReduceId (inclusive) and + // endReduceId (exclusive) for the mapper mapIds[i]. + // When batchFetchEnabled=false, reduceIds[i] contains all the reduce IDs that mapper mapIds[i] + // needs to fetch. public final int[][] reduceIds; + public final boolean batchFetchEnabled; public FetchShuffleBlocks( String appId, String execId, int shuffleId, - int[] mapIds, - int[][] reduceIds) { + long[] mapIds, + int[][] reduceIds, + boolean batchFetchEnabled) { this.appId = appId; this.execId = execId; this.shuffleId = shuffleId; this.mapIds = mapIds; this.reduceIds = reduceIds; assert(mapIds.length == reduceIds.length); + this.batchFetchEnabled = batchFetchEnabled; + if (batchFetchEnabled) { + for (int[] ids: reduceIds) { + assert(ids.length == 2); + } + } } @Override @@ -56,12 +69,13 @@ public FetchShuffleBlocks( @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) - .add("shuffleId", shuffleId) - .add("mapIds", Arrays.toString(mapIds)) - .add("reduceIds", Arrays.deepToString(reduceIds)) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) + .append("shuffleId", shuffleId) + .append("mapIds", Arrays.toString(mapIds)) + .append("reduceIds", Arrays.deepToString(reduceIds)) + .append("batchFetchEnabled", batchFetchEnabled) .toString(); } @@ -73,6 +87,7 @@ public boolean equals(Object o) { FetchShuffleBlocks that = 
(FetchShuffleBlocks) o; if (shuffleId != that.shuffleId) return false; + if (batchFetchEnabled != that.batchFetchEnabled) return false; if (!appId.equals(that.appId)) return false; if (!execId.equals(that.execId)) return false; if (!Arrays.equals(mapIds, that.mapIds)) return false; @@ -86,6 +101,7 @@ public int hashCode() { result = 31 * result + shuffleId; result = 31 * result + Arrays.hashCode(mapIds); result = 31 * result + Arrays.deepHashCode(reduceIds); + result = 31 * result + (batchFetchEnabled ? 1 : 0); return result; } @@ -98,9 +114,10 @@ public int encodedLength() { return Encoders.Strings.encodedLength(appId) + Encoders.Strings.encodedLength(execId) + 4 /* encoded length of shuffleId */ - + Encoders.IntArrays.encodedLength(mapIds) + + Encoders.LongArrays.encodedLength(mapIds) + 4 /* encoded length of reduceIds.size() */ - + encodedLengthOfReduceIds; + + encodedLengthOfReduceIds + + 1; /* encoded length of batchFetchEnabled */ } @Override @@ -108,23 +125,25 @@ public void encode(ByteBuf buf) { Encoders.Strings.encode(buf, appId); Encoders.Strings.encode(buf, execId); buf.writeInt(shuffleId); - Encoders.IntArrays.encode(buf, mapIds); + Encoders.LongArrays.encode(buf, mapIds); buf.writeInt(reduceIds.length); for (int[] ids: reduceIds) { Encoders.IntArrays.encode(buf, ids); } + buf.writeBoolean(batchFetchEnabled); } public static FetchShuffleBlocks decode(ByteBuf buf) { String appId = Encoders.Strings.decode(buf); String execId = Encoders.Strings.decode(buf); int shuffleId = buf.readInt(); - int[] mapIds = Encoders.IntArrays.decode(buf); + long[] mapIds = Encoders.LongArrays.decode(buf); int reduceIdsSize = buf.readInt(); int[][] reduceIds = new int[reduceIdsSize][]; for (int i = 0; i < reduceIdsSize; i++) { reduceIds[i] = Encoders.IntArrays.decode(buf); } - return new FetchShuffleBlocks(appId, execId, shuffleId, mapIds, reduceIds); + boolean batchFetchEnabled = buf.readBoolean(); + return new FetchShuffleBlocks(appId, execId, shuffleId, mapIds, reduceIds, 
batchFetchEnabled); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/GetLocalDirsForExecutors.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/GetLocalDirsForExecutors.java new file mode 100644 index 0000000000000..47f617c5e0a0a --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/GetLocalDirsForExecutors.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle.protocol; + +import java.util.Arrays; +import java.util.Objects; + +import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +import org.apache.spark.network.protocol.Encoders; + +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + +/** Request to get the local dirs for the given executors. 
*/ +public class GetLocalDirsForExecutors extends BlockTransferMessage { + public final String appId; + public final String[] execIds; + + public GetLocalDirsForExecutors(String appId, String[] execIds) { + this.appId = appId; + this.execIds = execIds; + } + + @Override + protected Type type() { return Type.GET_LOCAL_DIRS_FOR_EXECUTORS; } + + @Override + public int hashCode() { + return Objects.hashCode(appId) * 41 + Arrays.hashCode(execIds); + } + + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execIds", Arrays.toString(execIds)) + .toString(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof GetLocalDirsForExecutors) { + GetLocalDirsForExecutors o = (GetLocalDirsForExecutors) other; + return appId.equals(o.appId) && Arrays.equals(execIds, o.execIds); + } + return false; + } + + @Override + public int encodedLength() { + return Encoders.Strings.encodedLength(appId) + Encoders.StringArrays.encodedLength(execIds); + } + + @Override + public void encode(ByteBuf buf) { + Encoders.Strings.encode(buf, appId); + Encoders.StringArrays.encode(buf, execIds); + } + + public static GetLocalDirsForExecutors decode(ByteBuf buf) { + String appId = Encoders.Strings.decode(buf); + String[] execIds = Encoders.StringArrays.decode(buf); + return new GetLocalDirsForExecutors(appId, execIds); + } +} diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/LocalDirsForExecutors.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/LocalDirsForExecutors.java new file mode 100644 index 0000000000000..9e2f0668cbd24 --- /dev/null +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/LocalDirsForExecutors.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle.protocol; + +import java.util.*; + +import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +import org.apache.spark.network.protocol.Encoders; + +// Needed by ScalaDoc. See SPARK-7726 +import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; + +/** The reply to get local dirs giving back the dirs for each of the requested executors. 
*/ +public class LocalDirsForExecutors extends BlockTransferMessage { + private final String[] execIds; + private final int[] numLocalDirsByExec; + private final String[] allLocalDirs; + + public LocalDirsForExecutors(Map localDirsByExec) { + this.execIds = new String[localDirsByExec.size()]; + this.numLocalDirsByExec = new int[localDirsByExec.size()]; + ArrayList localDirs = new ArrayList<>(); + int index = 0; + for (Map.Entry e: localDirsByExec.entrySet()) { + execIds[index] = e.getKey(); + numLocalDirsByExec[index] = e.getValue().length; + Collections.addAll(localDirs, e.getValue()); + index++; + } + this.allLocalDirs = localDirs.toArray(new String[0]); + } + + private LocalDirsForExecutors(String[] execIds, int[] numLocalDirsByExec, String[] allLocalDirs) { + this.execIds = execIds; + this.numLocalDirsByExec = numLocalDirsByExec; + this.allLocalDirs = allLocalDirs; + } + + @Override + protected Type type() { return Type.LOCAL_DIRS_FOR_EXECUTORS; } + + @Override + public int hashCode() { + return Arrays.hashCode(execIds); + } + + @Override + public String toString() { + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("execIds", Arrays.toString(execIds)) + .append("numLocalDirsByExec", Arrays.toString(numLocalDirsByExec)) + .append("allLocalDirs", Arrays.toString(allLocalDirs)) + .toString(); + } + + @Override + public boolean equals(Object other) { + if (other instanceof LocalDirsForExecutors) { + LocalDirsForExecutors o = (LocalDirsForExecutors) other; + return Arrays.equals(execIds, o.execIds) + && Arrays.equals(numLocalDirsByExec, o.numLocalDirsByExec) + && Arrays.equals(allLocalDirs, o.allLocalDirs); + } + return false; + } + + @Override + public int encodedLength() { + return Encoders.StringArrays.encodedLength(execIds) + + Encoders.IntArrays.encodedLength(numLocalDirsByExec) + + Encoders.StringArrays.encodedLength(allLocalDirs); + } + + @Override + public void encode(ByteBuf buf) { + Encoders.StringArrays.encode(buf, execIds); 
+ Encoders.IntArrays.encode(buf, numLocalDirsByExec); + Encoders.StringArrays.encode(buf, allLocalDirs); + } + + public static LocalDirsForExecutors decode(ByteBuf buf) { + String[] execIds = Encoders.StringArrays.decode(buf); + int[] numLocalDirsByExec = Encoders.IntArrays.decode(buf); + String[] allLocalDirs = Encoders.StringArrays.decode(buf); + return new LocalDirsForExecutors(execIds, numLocalDirsByExec, allLocalDirs); + } + + public Map getLocalDirsByExec() { + Map localDirsByExec = new HashMap<>(); + int index = 0; + int localDirsIndex = 0; + for (int length: numLocalDirsByExec) { + localDirsByExec.put(execIds[index], + Arrays.copyOfRange(allLocalDirs, localDirsIndex, localDirsIndex + length)); + localDirsIndex += length; + index++; + } + return localDirsByExec; + } +} diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java index ce954b8a289e4..771e17b3233ec 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/OpenBlocks.java @@ -18,9 +18,11 @@ package org.apache.spark.network.shuffle.protocol; import java.util.Arrays; +import java.util.Objects; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encoders; @@ -44,15 +46,15 @@ public OpenBlocks(String appId, String execId, String[] blockIds) { @Override public int hashCode() { - return Objects.hashCode(appId, execId) * 41 + Arrays.hashCode(blockIds); + return Objects.hash(appId, execId) * 41 + Arrays.hashCode(blockIds); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) - 
.add("blockIds", Arrays.toString(blockIds)) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) + .append("blockIds", Arrays.toString(blockIds)) .toString(); } @@ -60,8 +62,8 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof OpenBlocks) { OpenBlocks o = (OpenBlocks) other; - return Objects.equal(appId, o.appId) - && Objects.equal(execId, o.execId) + return Objects.equals(appId, o.appId) + && Objects.equals(execId, o.execId) && Arrays.equals(blockIds, o.blockIds); } return false; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java index 167ef33104227..f6af755cd9cd5 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RegisterExecutor.java @@ -17,8 +17,11 @@ package org.apache.spark.network.shuffle.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encoders; @@ -48,15 +51,15 @@ public RegisterExecutor( @Override public int hashCode() { - return Objects.hashCode(appId, execId, executorInfo); + return Objects.hash(appId, execId, executorInfo); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) - .add("executorInfo", executorInfo) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) + .append("executorInfo", executorInfo) .toString(); } @@ -64,9 +67,9 @@ public String toString() { public 
boolean equals(Object other) { if (other != null && other instanceof RegisterExecutor) { RegisterExecutor o = (RegisterExecutor) other; - return Objects.equal(appId, o.appId) - && Objects.equal(execId, o.execId) - && Objects.equal(executorInfo, o.executorInfo); + return Objects.equals(appId, o.appId) + && Objects.equals(execId, o.execId) + && Objects.equals(executorInfo, o.executorInfo); } return false; } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RemoveBlocks.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RemoveBlocks.java index 1c718d307753f..ade838bd4286c 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RemoveBlocks.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/RemoveBlocks.java @@ -17,11 +17,14 @@ package org.apache.spark.network.shuffle.protocol; -import com.google.common.base.Objects; +import java.util.Arrays; +import java.util.Objects; + import io.netty.buffer.ByteBuf; -import org.apache.spark.network.protocol.Encoders; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; -import java.util.Arrays; +import org.apache.spark.network.protocol.Encoders; // Needed by ScalaDoc. 
See SPARK-7726 import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; @@ -43,15 +46,15 @@ public RemoveBlocks(String appId, String execId, String[] blockIds) { @Override public int hashCode() { - return Objects.hashCode(appId, execId) * 41 + Arrays.hashCode(blockIds); + return Objects.hash(appId, execId) * 41 + Arrays.hashCode(blockIds); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) - .add("blockIds", Arrays.toString(blockIds)) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) + .append("blockIds", Arrays.toString(blockIds)) .toString(); } @@ -59,8 +62,8 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof RemoveBlocks) { RemoveBlocks o = (RemoveBlocks) other; - return Objects.equal(appId, o.appId) - && Objects.equal(execId, o.execId) + return Objects.equals(appId, o.appId) + && Objects.equals(execId, o.execId) && Arrays.equals(blockIds, o.blockIds); } return false; diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java index 1915295aa6cc2..dd7715a4e82d4 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/StreamHandle.java @@ -17,8 +17,11 @@ package org.apache.spark.network.shuffle.protocol; -import com.google.common.base.Objects; +import java.util.Objects; + import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; // Needed by ScalaDoc. 
See SPARK-7726 import static org.apache.spark.network.shuffle.protocol.BlockTransferMessage.Type; @@ -41,14 +44,14 @@ public StreamHandle(long streamId, int numChunks) { @Override public int hashCode() { - return Objects.hashCode(streamId, numChunks); + return Objects.hash(streamId, numChunks); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("streamId", streamId) - .add("numChunks", numChunks) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("streamId", streamId) + .append("numChunks", numChunks) .toString(); } @@ -56,8 +59,8 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof StreamHandle) { StreamHandle o = (StreamHandle) other; - return Objects.equal(streamId, o.streamId) - && Objects.equal(numChunks, o.numChunks); + return Objects.equals(streamId, o.streamId) + && Objects.equals(numChunks, o.numChunks); } return false; } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java index 3caed59d508fd..a5bc3f7009b46 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlock.java @@ -18,9 +18,11 @@ package org.apache.spark.network.shuffle.protocol; import java.util.Arrays; +import java.util.Objects; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encoders; @@ -60,18 +62,18 @@ public UploadBlock( @Override public int hashCode() { - int objectsHashCode = Objects.hashCode(appId, execId, blockId); + int objectsHashCode = Objects.hash(appId, execId, blockId); return (objectsHashCode * 41 
+ Arrays.hashCode(metadata)) * 41 + Arrays.hashCode(blockData); } @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) - .add("execId", execId) - .add("blockId", blockId) - .add("metadata size", metadata.length) - .add("block size", blockData.length) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) + .append("execId", execId) + .append("blockId", blockId) + .append("metadata size", metadata.length) + .append("block size", blockData.length) .toString(); } @@ -79,9 +81,9 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof UploadBlock) { UploadBlock o = (UploadBlock) other; - return Objects.equal(appId, o.appId) - && Objects.equal(execId, o.execId) - && Objects.equal(blockId, o.blockId) + return Objects.equals(appId, o.appId) + && Objects.equals(execId, o.execId) + && Objects.equals(blockId, o.blockId) && Arrays.equals(metadata, o.metadata) && Arrays.equals(blockData, o.blockData); } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlockStream.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlockStream.java index 9df30967d5bb2..958a84e516c81 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlockStream.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/protocol/UploadBlockStream.java @@ -18,9 +18,11 @@ package org.apache.spark.network.shuffle.protocol; import java.util.Arrays; +import java.util.Objects; -import com.google.common.base.Objects; import io.netty.buffer.ByteBuf; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.spark.network.protocol.Encoders; @@ -53,9 +55,9 @@ public int hashCode() { @Override public String toString() { - return Objects.toStringHelper(this) - 
.add("blockId", blockId) - .add("metadata size", metadata.length) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("blockId", blockId) + .append("metadata size", metadata.length) .toString(); } @@ -63,7 +65,7 @@ public String toString() { public boolean equals(Object other) { if (other != null && other instanceof UploadBlockStream) { UploadBlockStream o = (UploadBlockStream) other; - return Objects.equal(blockId, o.blockId) + return Objects.equals(blockId, o.blockId) && Arrays.equals(metadata, o.metadata); } return false; diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java index e8e766d3fb3ab..96dfc3b7cae61 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/sasl/SaslIntegrationSuite.java @@ -21,8 +21,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicReference; import org.junit.After; import org.junit.AfterClass; @@ -34,8 +32,6 @@ import org.apache.spark.network.TestUtils; import org.apache.spark.network.TransportContext; -import org.apache.spark.network.buffer.ManagedBuffer; -import org.apache.spark.network.client.ChunkReceivedCallback; import org.apache.spark.network.client.RpcResponseCallback; import org.apache.spark.network.client.TransportClient; import org.apache.spark.network.client.TransportClientFactory; @@ -44,15 +40,6 @@ import org.apache.spark.network.server.StreamManager; import org.apache.spark.network.server.TransportServer; import org.apache.spark.network.server.TransportServerBootstrap; -import org.apache.spark.network.shuffle.BlockFetchingListener; -import org.apache.spark.network.shuffle.ExternalBlockHandler; -import 
org.apache.spark.network.shuffle.ExternalShuffleBlockResolver; -import org.apache.spark.network.shuffle.OneForOneBlockFetcher; -import org.apache.spark.network.shuffle.protocol.BlockTransferMessage; -import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; -import org.apache.spark.network.shuffle.protocol.OpenBlocks; -import org.apache.spark.network.shuffle.protocol.RegisterExecutor; -import org.apache.spark.network.shuffle.protocol.StreamHandle; import org.apache.spark.network.util.JavaUtils; import org.apache.spark.network.util.MapConfigProvider; import org.apache.spark.network.util.TransportConf; @@ -165,93 +152,6 @@ public void testNoSaslServer() { } } - /** - * This test is not actually testing SASL behavior, but testing that the shuffle service - * performs correct authorization checks based on the SASL authentication data. - */ - @Test - public void testAppIsolation() throws Exception { - // Start a new server with the correct RPC handler to serve block data. - ExternalShuffleBlockResolver blockResolver = mock(ExternalShuffleBlockResolver.class); - ExternalBlockHandler blockHandler = new ExternalBlockHandler( - new OneForOneStreamManager(), blockResolver); - TransportServerBootstrap bootstrap = new SaslServerBootstrap(conf, secretKeyHolder); - - try ( - TransportContext blockServerContext = new TransportContext(conf, blockHandler); - TransportServer blockServer = blockServerContext.createServer(Arrays.asList(bootstrap)); - // Create a client, and make a request to fetch blocks from a different app. 
- TransportClientFactory clientFactory1 = blockServerContext.createClientFactory( - Arrays.asList(new SaslClientBootstrap(conf, "app-1", secretKeyHolder))); - TransportClient client1 = clientFactory1.createClient( - TestUtils.getLocalHost(), blockServer.getPort())) { - - AtomicReference exception = new AtomicReference<>(); - - CountDownLatch blockFetchLatch = new CountDownLatch(1); - BlockFetchingListener listener = new BlockFetchingListener() { - @Override - public void onBlockFetchSuccess(String blockId, ManagedBuffer data) { - blockFetchLatch.countDown(); - } - @Override - public void onBlockFetchFailure(String blockId, Throwable t) { - exception.set(t); - blockFetchLatch.countDown(); - } - }; - - String[] blockIds = { "shuffle_0_1_2", "shuffle_0_3_4" }; - OneForOneBlockFetcher fetcher = - new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener, conf); - fetcher.start(); - blockFetchLatch.await(); - checkSecurityException(exception.get()); - - // Register an executor so that the next steps work. - ExecutorShuffleInfo executorInfo = new ExecutorShuffleInfo( - new String[] { System.getProperty("java.io.tmpdir") }, 1, - "org.apache.spark.shuffle.sort.SortShuffleManager"); - RegisterExecutor regmsg = new RegisterExecutor("app-1", "0", executorInfo); - client1.sendRpcSync(regmsg.toByteBuffer(), TIMEOUT_MS); - - // Make a successful request to fetch blocks, which creates a new stream. But do not actually - // fetch any blocks, to keep the stream open. - OpenBlocks openMessage = new OpenBlocks("app-1", "0", blockIds); - ByteBuffer response = client1.sendRpcSync(openMessage.toByteBuffer(), TIMEOUT_MS); - StreamHandle stream = (StreamHandle) BlockTransferMessage.Decoder.fromByteBuffer(response); - long streamId = stream.streamId; - - try ( - // Create a second client, authenticated with a different app ID, and try to read from - // the stream created for the previous app. 
- TransportClientFactory clientFactory2 = blockServerContext.createClientFactory( - Arrays.asList(new SaslClientBootstrap(conf, "app-2", secretKeyHolder))); - TransportClient client2 = clientFactory2.createClient( - TestUtils.getLocalHost(), blockServer.getPort()) - ) { - CountDownLatch chunkReceivedLatch = new CountDownLatch(1); - ChunkReceivedCallback callback = new ChunkReceivedCallback() { - @Override - public void onSuccess(int chunkIndex, ManagedBuffer buffer) { - chunkReceivedLatch.countDown(); - } - - @Override - public void onFailure(int chunkIndex, Throwable t) { - exception.set(t); - chunkReceivedLatch.countDown(); - } - }; - - exception.set(null); - client2.fetchChunk(streamId, 0, callback); - chunkReceivedLatch.await(); - checkSecurityException(exception.get()); - } - } - } - /** RPC handler which simply responds with the message it received. */ public static class TestRpcHandler extends RpcHandler { @Override @@ -264,10 +164,4 @@ public StreamManager getStreamManager() { return new OneForOneStreamManager(); } } - - private static void checkSecurityException(Throwable t) { - assertNotNull("No exception was caught.", t); - assertTrue("Expected SecurityException.", - t.getMessage().contains(SecurityException.class.getName())); - } } diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/AppIsolationSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/AppIsolationSuite.java new file mode 100644 index 0000000000000..92e75222d0391 --- /dev/null +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/AppIsolationSuite.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.network.shuffle; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.Supplier; + +import org.junit.BeforeClass; +import org.junit.Test; + +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; + +import org.apache.spark.network.TestUtils; +import org.apache.spark.network.TransportContext; +import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.client.ChunkReceivedCallback; +import org.apache.spark.network.client.TransportClient; +import org.apache.spark.network.client.TransportClientBootstrap; +import org.apache.spark.network.client.TransportClientFactory; +import org.apache.spark.network.crypto.AuthClientBootstrap; +import org.apache.spark.network.crypto.AuthServerBootstrap; +import org.apache.spark.network.sasl.SaslClientBootstrap; +import org.apache.spark.network.sasl.SaslServerBootstrap; +import org.apache.spark.network.sasl.SecretKeyHolder; +import org.apache.spark.network.server.OneForOneStreamManager; +import org.apache.spark.network.server.TransportServer; +import org.apache.spark.network.server.TransportServerBootstrap; +import 
org.apache.spark.network.shuffle.protocol.BlockTransferMessage; +import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; +import org.apache.spark.network.shuffle.protocol.OpenBlocks; +import org.apache.spark.network.shuffle.protocol.RegisterExecutor; +import org.apache.spark.network.shuffle.protocol.StreamHandle; +import org.apache.spark.network.util.MapConfigProvider; +import org.apache.spark.network.util.TransportConf; + +public class AppIsolationSuite { + + // Use a long timeout to account for slow / overloaded build machines. In the normal case, + // tests should finish way before the timeout expires. + private static final long TIMEOUT_MS = 10_000; + + private static SecretKeyHolder secretKeyHolder; + private static TransportConf conf; + + @BeforeClass + public static void beforeAll() { + Map confMap = new HashMap<>(); + confMap.put("spark.network.crypto.enabled", "true"); + confMap.put("spark.network.crypto.saslFallback", "false"); + conf = new TransportConf("shuffle", new MapConfigProvider(confMap)); + + secretKeyHolder = mock(SecretKeyHolder.class); + when(secretKeyHolder.getSaslUser(eq("app-1"))).thenReturn("app-1"); + when(secretKeyHolder.getSecretKey(eq("app-1"))).thenReturn("app-1"); + when(secretKeyHolder.getSaslUser(eq("app-2"))).thenReturn("app-2"); + when(secretKeyHolder.getSecretKey(eq("app-2"))).thenReturn("app-2"); + } + + @Test + public void testSaslAppIsolation() throws Exception { + testAppIsolation( + () -> new SaslServerBootstrap(conf, secretKeyHolder), + appId -> new SaslClientBootstrap(conf, appId, secretKeyHolder)); + } + + @Test + public void testAuthEngineAppIsolation() throws Exception { + testAppIsolation( + () -> new AuthServerBootstrap(conf, secretKeyHolder), + appId -> new AuthClientBootstrap(conf, appId, secretKeyHolder)); + } + + private void testAppIsolation( + Supplier serverBootstrap, + Function clientBootstrapFactory) throws Exception { + // Start a new server with the correct RPC handler to serve block data. 
+ ExternalShuffleBlockResolver blockResolver = mock(ExternalShuffleBlockResolver.class); + ExternalBlockHandler blockHandler = new ExternalBlockHandler( + new OneForOneStreamManager(), blockResolver); + TransportServerBootstrap bootstrap = serverBootstrap.get(); + + try ( + TransportContext blockServerContext = new TransportContext(conf, blockHandler); + TransportServer blockServer = blockServerContext.createServer(Arrays.asList(bootstrap)); + // Create a client, and make a request to fetch blocks from a different app. + TransportClientFactory clientFactory1 = blockServerContext.createClientFactory( + Arrays.asList(clientBootstrapFactory.apply("app-1"))); + TransportClient client1 = clientFactory1.createClient( + TestUtils.getLocalHost(), blockServer.getPort())) { + + AtomicReference exception = new AtomicReference<>(); + + CountDownLatch blockFetchLatch = new CountDownLatch(1); + BlockFetchingListener listener = new BlockFetchingListener() { + @Override + public void onBlockFetchSuccess(String blockId, ManagedBuffer data) { + blockFetchLatch.countDown(); + } + @Override + public void onBlockFetchFailure(String blockId, Throwable t) { + exception.set(t); + blockFetchLatch.countDown(); + } + }; + + String[] blockIds = { "shuffle_0_1_2", "shuffle_0_3_4" }; + OneForOneBlockFetcher fetcher = + new OneForOneBlockFetcher(client1, "app-2", "0", blockIds, listener, conf); + fetcher.start(); + blockFetchLatch.await(); + checkSecurityException(exception.get()); + + // Register an executor so that the next steps work. + ExecutorShuffleInfo executorInfo = new ExecutorShuffleInfo( + new String[] { System.getProperty("java.io.tmpdir") }, 1, + "org.apache.spark.shuffle.sort.SortShuffleManager"); + RegisterExecutor regmsg = new RegisterExecutor("app-1", "0", executorInfo); + client1.sendRpcSync(regmsg.toByteBuffer(), TIMEOUT_MS); + + // Make a successful request to fetch blocks, which creates a new stream. But do not actually + // fetch any blocks, to keep the stream open. 
+ OpenBlocks openMessage = new OpenBlocks("app-1", "0", blockIds); + ByteBuffer response = client1.sendRpcSync(openMessage.toByteBuffer(), TIMEOUT_MS); + StreamHandle stream = (StreamHandle) BlockTransferMessage.Decoder.fromByteBuffer(response); + long streamId = stream.streamId; + + try ( + // Create a second client, authenticated with a different app ID, and try to read from + // the stream created for the previous app. + TransportClientFactory clientFactory2 = blockServerContext.createClientFactory( + Arrays.asList(clientBootstrapFactory.apply("app-2"))); + TransportClient client2 = clientFactory2.createClient( + TestUtils.getLocalHost(), blockServer.getPort()) + ) { + CountDownLatch chunkReceivedLatch = new CountDownLatch(1); + ChunkReceivedCallback callback = new ChunkReceivedCallback() { + @Override + public void onSuccess(int chunkIndex, ManagedBuffer buffer) { + chunkReceivedLatch.countDown(); + } + + @Override + public void onFailure(int chunkIndex, Throwable t) { + exception.set(t); + chunkReceivedLatch.countDown(); + } + }; + + exception.set(null); + client2.fetchChunk(streamId, 0, callback); + chunkReceivedLatch.await(); + checkSecurityException(exception.get()); + } + } + } + + private static void checkSecurityException(Throwable t) { + assertNotNull("No exception was caught.", t); + assertTrue("Expected SecurityException.", + t.getMessage().contains(SecurityException.class.getName())); + } +} diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/BlockTransferMessagesSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/BlockTransferMessagesSuite.java index 649c471dc1679..67229371c3a4a 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/BlockTransferMessagesSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/BlockTransferMessagesSuite.java @@ -21,6 +21,9 @@ import static org.junit.Assert.*; +import java.util.HashMap; +import 
java.util.Map; + import org.apache.spark.network.shuffle.protocol.*; /** Verifies that all BlockTransferMessages can be serialized correctly. */ @@ -29,8 +32,11 @@ public class BlockTransferMessagesSuite { public void serializeOpenShuffleBlocks() { checkSerializeDeserialize(new OpenBlocks("app-1", "exec-2", new String[] { "b1", "b2" })); checkSerializeDeserialize(new FetchShuffleBlocks( - "app-1", "exec-2", 0, new int[] {0, 1}, - new int[][] {{ 0, 1 }, { 0, 1, 2 }})); + "app-1", "exec-2", 0, new long[] {0, 1}, + new int[][] {{ 0, 1 }, { 0, 1, 2 }}, false)); + checkSerializeDeserialize(new FetchShuffleBlocks( + "app-1", "exec-2", 0, new long[] {0, 1}, + new int[][] {{ 0, 1 }, { 0, 2 }}, true)); checkSerializeDeserialize(new RegisterExecutor("app-1", "exec-2", new ExecutorShuffleInfo( new String[] { "/local1", "/local2" }, 32, "MyShuffleManager"))); checkSerializeDeserialize(new UploadBlock("app-1", "exec-2", "block-3", new byte[] { 1, 2 }, @@ -38,10 +44,29 @@ public void serializeOpenShuffleBlocks() { checkSerializeDeserialize(new StreamHandle(12345, 16)); } - private void checkSerializeDeserialize(BlockTransferMessage msg) { + @Test + public void testLocalDirsMessages() { + checkSerializeDeserialize( + new GetLocalDirsForExecutors("app-1", new String[]{"exec-1", "exec-2"})); + + Map map = new HashMap<>(); + map.put("exec-1", new String[]{"loc1.1"}); + map.put("exec-22", new String[]{"loc2.1", "loc2.2"}); + LocalDirsForExecutors localDirsForExecs = new LocalDirsForExecutors(map); + Map resultMap = + ((LocalDirsForExecutors)checkSerializeDeserialize(localDirsForExecs)).getLocalDirsByExec(); + assertEquals(resultMap.size(), map.keySet().size()); + for (Map.Entry e: map.entrySet()) { + assertTrue(resultMap.containsKey(e.getKey())); + assertArrayEquals(e.getValue(), resultMap.get(e.getKey())); + } + } + + private BlockTransferMessage checkSerializeDeserialize(BlockTransferMessage msg) { BlockTransferMessage msg2 = 
BlockTransferMessage.Decoder.fromByteBuffer(msg.toByteBuffer()); assertEquals(msg, msg2); assertEquals(msg.hashCode(), msg2.hashCode()); assertEquals(msg.toString(), msg2.toString()); + return msg2; } } diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/CleanupNonShuffleServiceServedFilesSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/CleanupNonShuffleServiceServedFilesSuite.java index e38442327e22d..b37d8620a57f4 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/CleanupNonShuffleServiceServedFilesSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/CleanupNonShuffleServiceServedFilesSuite.java @@ -30,7 +30,6 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.google.common.util.concurrent.MoreExecutors; import org.junit.Test; import static org.junit.Assert.assertEquals; @@ -42,7 +41,7 @@ public class CleanupNonShuffleServiceServedFilesSuite { // Same-thread Executor used to ensure cleanup happens synchronously in test thread. 
- private Executor sameThreadExecutor = MoreExecutors.sameThreadExecutor(); + private Executor sameThreadExecutor = Runnable::run; private static final String SORT_MANAGER = "org.apache.spark.shuffle.sort.SortShuffleManager"; diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalBlockHandlerSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalBlockHandlerSuite.java index 9c623a70424b6..455351fcf767c 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalBlockHandlerSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalBlockHandlerSuite.java @@ -101,7 +101,7 @@ public void testFetchShuffleBlocks() { when(blockResolver.getBlockData("app0", "exec1", 0, 0, 1)).thenReturn(blockMarkers[1]); FetchShuffleBlocks fetchShuffleBlocks = new FetchShuffleBlocks( - "app0", "exec1", 0, new int[] { 0 }, new int[][] {{ 0, 1 }}); + "app0", "exec1", 0, new long[] { 0 }, new int[][] {{ 0, 1 }}, false); checkOpenBlocksReceive(fetchShuffleBlocks, blockMarkers); verify(blockResolver, times(1)).getBlockData("app0", "exec1", 0, 0, 0); @@ -109,6 +109,22 @@ public void testFetchShuffleBlocks() { verifyOpenBlockLatencyMetrics(); } + @Test + public void testFetchShuffleBlocksInBatch() { + ManagedBuffer[] batchBlockMarkers = { + new NioManagedBuffer(ByteBuffer.wrap(new byte[10])) + }; + when(blockResolver.getContinuousBlocksData( + "app0", "exec1", 0, 0, 0, 1)).thenReturn(batchBlockMarkers[0]); + + FetchShuffleBlocks fetchShuffleBlocks = new FetchShuffleBlocks( + "app0", "exec1", 0, new long[] { 0 }, new int[][] {{ 0, 1 }}, true); + checkOpenBlocksReceive(fetchShuffleBlocks, batchBlockMarkers); + + verify(blockResolver, times(1)).getContinuousBlocksData("app0", "exec1", 0, 0, 0, 1); + verifyOpenBlockLatencyMetrics(); + } + @Test public void testOpenDiskPersistedRDDBlocks() { when(blockResolver.getRddBlockData("app0", "exec1", 0, 
0)).thenReturn(blockMarkers[0]); @@ -154,7 +170,7 @@ private void checkOpenBlocksReceive(BlockTransferMessage msg, ManagedBuffer[] bl StreamHandle handle = (StreamHandle) BlockTransferMessage.Decoder.fromByteBuffer(response.getValue()); - assertEquals(2, handle.numChunks); + assertEquals(blockMarkers.length, handle.numChunks); @SuppressWarnings("unchecked") ArgumentCaptor> stream = (ArgumentCaptor>) @@ -162,8 +178,9 @@ private void checkOpenBlocksReceive(BlockTransferMessage msg, ManagedBuffer[] bl verify(streamManager, times(1)).registerStream(anyString(), stream.capture(), any()); Iterator buffers = stream.getValue(); - assertEquals(blockMarkers[0], buffers.next()); - assertEquals(blockMarkers[1], buffers.next()); + for (ManagedBuffer blockMarker : blockMarkers) { + assertEquals(blockMarker, buffers.next()); + } assertFalse(buffers.hasNext()); } diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index 09eb699be305a..09b31430b1eb9 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -111,6 +111,13 @@ public void testSortShuffleBlocks() throws IOException { CharStreams.toString(new InputStreamReader(block1Stream, StandardCharsets.UTF_8)); assertEquals(sortBlock1, block1); } + + try (InputStream blocksStream = resolver.getContinuousBlocksData( + "app0", "exec0", 0, 0, 0, 2).createInputStream()) { + String blocks = + CharStreams.toString(new InputStreamReader(blocksStream, StandardCharsets.UTF_8)); + assertEquals(sortBlock0 + sortBlock1, blocks); + } } @Test diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java 
b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java index 47c087088a8a2..48b73e32216ce 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleCleanupSuite.java @@ -24,7 +24,6 @@ import java.util.concurrent.Executor; import java.util.concurrent.atomic.AtomicBoolean; -import com.google.common.util.concurrent.MoreExecutors; import org.junit.Test; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -35,7 +34,7 @@ public class ExternalShuffleCleanupSuite { // Same-thread Executor used to ensure cleanup happens synchronously in test thread. - private Executor sameThreadExecutor = MoreExecutors.sameThreadExecutor(); + private Executor sameThreadExecutor = Runnable::run; private TransportConf conf = new TransportConf("shuffle", MapConfigProvider.EMPTY); private static final String SORT_MANAGER = "org.apache.spark.shuffle.sort.SortShuffleManager"; diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java index 66633cc7a3595..285eedb39c65c 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java @@ -64,7 +64,7 @@ public void testFetchOne() { BlockFetchingListener listener = fetchBlocks( blocks, blockIds, - new FetchShuffleBlocks("app-id", "exec-id", 0, new int[] { 0 }, new int[][] {{ 0 }}), + new FetchShuffleBlocks("app-id", "exec-id", 0, new long[] { 0 }, new int[][] {{ 0 }}, false), conf); verify(listener).onBlockFetchSuccess("shuffle_0_0_0", blocks.get("shuffle_0_0_0")); @@ -100,7 +100,8 @@ public void 
testFetchThreeShuffleBlocks() { BlockFetchingListener listener = fetchBlocks( blocks, blockIds, - new FetchShuffleBlocks("app-id", "exec-id", 0, new int[] { 0 }, new int[][] {{ 0, 1, 2 }}), + new FetchShuffleBlocks( + "app-id", "exec-id", 0, new long[] { 0 }, new int[][] {{ 0, 1, 2 }}, false), conf); for (int i = 0; i < 3; i ++) { @@ -109,6 +110,23 @@ public void testFetchThreeShuffleBlocks() { } } + @Test + public void testBatchFetchThreeShuffleBlocks() { + LinkedHashMap blocks = Maps.newLinkedHashMap(); + blocks.put("shuffle_0_0_0_3", new NioManagedBuffer(ByteBuffer.wrap(new byte[58]))); + String[] blockIds = blocks.keySet().toArray(new String[blocks.size()]); + + BlockFetchingListener listener = fetchBlocks( + blocks, + blockIds, + new FetchShuffleBlocks( + "app-id", "exec-id", 0, new long[] { 0 }, new int[][] {{ 0, 3 }}, true), + conf); + + verify(listener, times(1)).onBlockFetchSuccess( + "shuffle_0_0_0_3", blocks.get("shuffle_0_0_0_3")); + } + @Test public void testFetchThree() { LinkedHashMap blocks = Maps.newLinkedHashMap(); diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java index 457805feeac45..fb67d7220a0b4 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/TestShuffleDataContext.java @@ -28,6 +28,7 @@ import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo; import org.apache.spark.network.util.JavaUtils; +import org.junit.Assert; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -122,7 +123,7 @@ private void insertFile(String filename) throws IOException { private void insertFile(String filename, byte[] block) throws IOException { OutputStream dataStream = null; File file = ExecutorDiskUtils.getFile(localDirs, subDirsPerLocalDir, filename); 
- assert(!file.exists()) : "this test file has been already generated"; + Assert.assertFalse("this test file has been already generated", file.exists()); try { dataStream = new FileOutputStream( ExecutorDiskUtils.getFile(localDirs, subDirsPerLocalDir, filename)); diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index c170f99b112c0..815a56d765b6a 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -23,14 +23,16 @@ import java.nio.ByteBuffer; import java.util.List; import java.util.Map; +import java.util.Objects; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Objects; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -417,7 +419,7 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; AppId appExecId = (AppId) o; - return Objects.equal(appId, appExecId.appId); + return Objects.equals(appId, appExecId.appId); } @Override @@ -427,8 +429,8 @@ public int hashCode() { @Override public String toString() { - return Objects.toStringHelper(this) - .add("appId", appId) + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("appId", appId) .toString(); } } diff --git a/common/tags/src/test/java/org/apache/spark/tags/ExtendedSQLTest.java 
b/common/tags/src/test/java/org/apache/spark/tags/ExtendedSQLTest.java new file mode 100644 index 0000000000000..1c0fff1b4045d --- /dev/null +++ b/common/tags/src/test/java/org/apache/spark/tags/ExtendedSQLTest.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.tags; + +import org.scalatest.TagAnnotation; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@TagAnnotation +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.METHOD, ElementType.TYPE}) +public @interface ExtendedSQLTest { } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java new file mode 100644 index 0000000000000..84a0156ebfb66 --- /dev/null +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/DateTimeConstants.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.util; + +public class DateTimeConstants { + + public static final int YEARS_PER_DECADE = 10; + public static final int YEARS_PER_CENTURY = 100; + public static final int YEARS_PER_MILLENNIUM = 1000; + + public static final byte MONTHS_PER_QUARTER = 3; + public static final int MONTHS_PER_YEAR = 12; + + public static final byte DAYS_PER_WEEK = 7; + public static final long DAYS_PER_MONTH = 30L; + + public static final long HOURS_PER_DAY = 24L; + + public static final long MINUTES_PER_HOUR = 60L; + + public static final long SECONDS_PER_MINUTE = 60L; + public static final long SECONDS_PER_HOUR = MINUTES_PER_HOUR * SECONDS_PER_MINUTE; + public static final long SECONDS_PER_DAY = HOURS_PER_DAY * SECONDS_PER_HOUR; + + public static final long MILLIS_PER_SECOND = 1000L; + public static final long MILLIS_PER_MINUTE = SECONDS_PER_MINUTE * MILLIS_PER_SECOND; + public static final long MILLIS_PER_HOUR = MINUTES_PER_HOUR * MILLIS_PER_MINUTE; + public static final long MILLIS_PER_DAY = HOURS_PER_DAY * MILLIS_PER_HOUR; + + public static final long MICROS_PER_MILLIS = 1000L; + public static final long MICROS_PER_SECOND = MILLIS_PER_SECOND * MICROS_PER_MILLIS; + public static final long MICROS_PER_MINUTE = SECONDS_PER_MINUTE * MICROS_PER_SECOND; + 
public static final long MICROS_PER_HOUR = MINUTES_PER_HOUR * MICROS_PER_MINUTE; + public static final long MICROS_PER_DAY = HOURS_PER_DAY * MICROS_PER_HOUR; + public static final long MICROS_PER_MONTH = DAYS_PER_MONTH * MICROS_PER_DAY; + /* 365.25 days per year assumes leap year every four years */ + public static final long MICROS_PER_YEAR = (36525L * MICROS_PER_DAY) / 100; + + public static final long NANOS_PER_MICROS = 1000L; + public static final long NANOS_PER_MILLIS = MICROS_PER_MILLIS * NANOS_PER_MICROS; + public static final long NANOS_PER_SECOND = MILLIS_PER_SECOND * NANOS_PER_MILLIS; +} diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java index 908ff1983e6be..f2d06e793f9dd 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/CalendarInterval.java @@ -17,387 +17,114 @@ package org.apache.spark.unsafe.types; +import org.apache.spark.annotation.Unstable; + import java.io.Serializable; -import java.util.Locale; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.math.BigDecimal; +import java.time.Duration; +import java.time.Period; +import java.time.temporal.ChronoUnit; +import java.util.Objects; + +import static org.apache.spark.sql.catalyst.util.DateTimeConstants.*; /** - * The internal representation of interval type. + * The class representing calendar intervals. The calendar interval is stored internally in + * three components: + *
    + *
  • an integer value representing the number of `months` in this interval,
  • + *
  • an integer value representing the number of `days` in this interval,
  • + *
  • a long value representing the number of `microseconds` in this interval.
  • + *
+ * + * The `months` and `days` are not units of time with a constant length (unlike hours, seconds), so + * they are two separated fields from microseconds. One month may be equal to 28, 29, 30 or 31 days + * and one day may be equal to 23, 24 or 25 hours (daylight saving). + * + * @since 3.0.0 */ +@Unstable public final class CalendarInterval implements Serializable { - public static final long MICROS_PER_MILLI = 1000L; - public static final long MICROS_PER_SECOND = MICROS_PER_MILLI * 1000; - public static final long MICROS_PER_MINUTE = MICROS_PER_SECOND * 60; - public static final long MICROS_PER_HOUR = MICROS_PER_MINUTE * 60; - public static final long MICROS_PER_DAY = MICROS_PER_HOUR * 24; - public static final long MICROS_PER_WEEK = MICROS_PER_DAY * 7; - - /** - * A function to generate regex which matches interval string's unit part like "3 years". - * - * First, we can leave out some units in interval string, and we only care about the value of - * unit, so here we use non-capturing group to wrap the actual regex. - * At the beginning of the actual regex, we should match spaces before the unit part. - * Next is the number part, starts with an optional "-" to represent negative value. We use - * capturing group to wrap this part as we need the value later. - * Finally is the unit name, ends with an optional "s". 
- */ - private static String unitRegex(String unit) { - return "(?:\\s+(-?\\d+)\\s+" + unit + "s?)?"; - } - - private static Pattern p = Pattern.compile("interval" + unitRegex("year") + unitRegex("month") + - unitRegex("week") + unitRegex("day") + unitRegex("hour") + unitRegex("minute") + - unitRegex("second") + unitRegex("millisecond") + unitRegex("microsecond")); - - private static Pattern yearMonthPattern = - Pattern.compile("^(?:['|\"])?([+|-])?(\\d+)-(\\d+)(?:['|\"])?$"); - - private static Pattern dayTimePattern = Pattern.compile( - "^(?:['|\"])?([+|-])?((\\d+) )?((\\d+):)?(\\d+):(\\d+)(\\.(\\d+))?(?:['|\"])?$"); - - private static Pattern quoteTrimPattern = Pattern.compile("^(?:['|\"])?(.*?)(?:['|\"])?$"); - - private static long toLong(String s) { - if (s == null) { - return 0; - } else { - return Long.parseLong(s); - } - } - - /** - * Convert a string to CalendarInterval. Return null if the input string is not a valid interval. - * This method is case-sensitive and all characters in the input string should be in lower case. - */ - public static CalendarInterval fromString(String s) { - if (s == null) { - return null; - } - s = s.trim(); - Matcher m = p.matcher(s); - if (!m.matches() || s.equals("interval")) { - return null; - } else { - long months = toLong(m.group(1)) * 12 + toLong(m.group(2)); - long microseconds = toLong(m.group(3)) * MICROS_PER_WEEK; - microseconds += toLong(m.group(4)) * MICROS_PER_DAY; - microseconds += toLong(m.group(5)) * MICROS_PER_HOUR; - microseconds += toLong(m.group(6)) * MICROS_PER_MINUTE; - microseconds += toLong(m.group(7)) * MICROS_PER_SECOND; - microseconds += toLong(m.group(8)) * MICROS_PER_MILLI; - microseconds += toLong(m.group(9)); - return new CalendarInterval((int) months, microseconds); - } - } - - /** - * Convert a string to CalendarInterval. Unlike fromString, this method is case-insensitive and - * will throw IllegalArgumentException when the input string is not a valid interval. 
- * - * @throws IllegalArgumentException if the string is not a valid internal. - */ - public static CalendarInterval fromCaseInsensitiveString(String s) { - if (s == null || s.trim().isEmpty()) { - throw new IllegalArgumentException("Interval cannot be null or blank."); - } - String sInLowerCase = s.trim().toLowerCase(Locale.ROOT); - String interval = - sInLowerCase.startsWith("interval ") ? sInLowerCase : "interval " + sInLowerCase; - CalendarInterval cal = fromString(interval); - if (cal == null) { - throw new IllegalArgumentException("Invalid interval: " + s); - } - return cal; - } - - public static long toLongWithRange(String fieldName, - String s, long minValue, long maxValue) throws IllegalArgumentException { - long result = 0; - if (s != null) { - result = Long.parseLong(s); - if (result < minValue || result > maxValue) { - throw new IllegalArgumentException(String.format("%s %d outside range [%d, %d]", - fieldName, result, minValue, maxValue)); - } - } - return result; - } - - /** - * Parse YearMonth string in form: [-]YYYY-MM - * - * adapted from HiveIntervalYearMonth.valueOf - */ - public static CalendarInterval fromYearMonthString(String s) throws IllegalArgumentException { - CalendarInterval result = null; - if (s == null) { - throw new IllegalArgumentException("Interval year-month string was null"); - } - s = s.trim(); - Matcher m = yearMonthPattern.matcher(s); - if (!m.matches()) { - throw new IllegalArgumentException( - "Interval string does not match year-month format of 'y-m': " + s); - } else { - try { - int sign = m.group(1) != null && m.group(1).equals("-") ? 
-1 : 1; - int years = (int) toLongWithRange("year", m.group(2), 0, Integer.MAX_VALUE); - int months = (int) toLongWithRange("month", m.group(3), 0, 11); - result = new CalendarInterval(sign * (years * 12 + months), 0); - } catch (Exception e) { - throw new IllegalArgumentException( - "Error parsing interval year-month string: " + e.getMessage(), e); - } - } - return result; - } - - /** - * Parse dayTime string in form: [-]d HH:mm:ss.nnnnnnnnn and [-]HH:mm:ss.nnnnnnnnn - * - * adapted from HiveIntervalDayTime.valueOf - */ - public static CalendarInterval fromDayTimeString(String s) throws IllegalArgumentException { - return fromDayTimeString(s, "day", "second"); - } - - /** - * Parse dayTime string in form: [-]d HH:mm:ss.nnnnnnnnn and [-]HH:mm:ss.nnnnnnnnn - * - * adapted from HiveIntervalDayTime.valueOf. - * Below interval conversion patterns are supported: - * - DAY TO (HOUR|MINUTE|SECOND) - * - HOUR TO (MINUTE|SECOND) - * - MINUTE TO SECOND - */ - public static CalendarInterval fromDayTimeString(String s, String from, String to) - throws IllegalArgumentException { - CalendarInterval result = null; - if (s == null) { - throw new IllegalArgumentException("Interval day-time string was null"); - } - s = s.trim(); - Matcher m = dayTimePattern.matcher(s); - if (!m.matches()) { - throw new IllegalArgumentException( - "Interval string does not match day-time format of 'd h:m:s.n': " + s); - } else { - try { - int sign = m.group(1) != null && m.group(1).equals("-") ? -1 : 1; - long days = m.group(2) == null ? 
0 : toLongWithRange("day", m.group(3), - 0, Integer.MAX_VALUE); - long hours = 0; - long minutes; - long seconds = 0; - if (m.group(5) != null || from.equals("minute")) { // 'HH:mm:ss' or 'mm:ss minute' - hours = toLongWithRange("hour", m.group(5), 0, 23); - minutes = toLongWithRange("minute", m.group(6), 0, 59); - seconds = toLongWithRange("second", m.group(7), 0, 59); - } else if (m.group(8) != null){ // 'mm:ss.nn' - minutes = toLongWithRange("minute", m.group(6), 0, 59); - seconds = toLongWithRange("second", m.group(7), 0, 59); - } else { // 'HH:mm' - hours = toLongWithRange("hour", m.group(6), 0, 23); - minutes = toLongWithRange("second", m.group(7), 0, 59); - } - // Hive allow nanosecond precision interval - String nanoStr = m.group(9) == null ? null : (m.group(9) + "000000000").substring(0, 9); - long nanos = toLongWithRange("nanosecond", nanoStr, 0L, 999999999L); - switch (to) { - case "hour": - minutes = 0; - seconds = 0; - nanos = 0; - break; - case "minute": - seconds = 0; - nanos = 0; - break; - case "second": - // No-op - break; - default: - throw new IllegalArgumentException( - String.format("Cannot support (interval '%s' %s to %s) expression", s, from, to)); - } - result = new CalendarInterval(0, sign * ( - days * MICROS_PER_DAY + hours * MICROS_PER_HOUR + minutes * MICROS_PER_MINUTE + - seconds * MICROS_PER_SECOND + nanos / 1000L)); - } catch (Exception e) { - throw new IllegalArgumentException( - "Error parsing interval day-time string: " + e.getMessage(), e); - } - } - return result; - } - - public static CalendarInterval fromSingleUnitString(String unit, String s) - throws IllegalArgumentException { - - CalendarInterval result = null; - if (s == null) { - throw new IllegalArgumentException(String.format("Interval %s string was null", unit)); - } - s = s.trim(); - Matcher m = quoteTrimPattern.matcher(s); - if (!m.matches()) { - throw new IllegalArgumentException( - "Interval string does not match day-time format of 'd h:m:s.n': " + s); - } else { - 
try { - switch (unit) { - case "year": - int year = (int) toLongWithRange("year", m.group(1), - Integer.MIN_VALUE / 12, Integer.MAX_VALUE / 12); - result = new CalendarInterval(year * 12, 0L); - break; - case "month": - int month = (int) toLongWithRange("month", m.group(1), - Integer.MIN_VALUE, Integer.MAX_VALUE); - result = new CalendarInterval(month, 0L); - break; - case "week": - long week = toLongWithRange("week", m.group(1), - Long.MIN_VALUE / MICROS_PER_WEEK, Long.MAX_VALUE / MICROS_PER_WEEK); - result = new CalendarInterval(0, week * MICROS_PER_WEEK); - break; - case "day": - long day = toLongWithRange("day", m.group(1), - Long.MIN_VALUE / MICROS_PER_DAY, Long.MAX_VALUE / MICROS_PER_DAY); - result = new CalendarInterval(0, day * MICROS_PER_DAY); - break; - case "hour": - long hour = toLongWithRange("hour", m.group(1), - Long.MIN_VALUE / MICROS_PER_HOUR, Long.MAX_VALUE / MICROS_PER_HOUR); - result = new CalendarInterval(0, hour * MICROS_PER_HOUR); - break; - case "minute": - long minute = toLongWithRange("minute", m.group(1), - Long.MIN_VALUE / MICROS_PER_MINUTE, Long.MAX_VALUE / MICROS_PER_MINUTE); - result = new CalendarInterval(0, minute * MICROS_PER_MINUTE); - break; - case "second": { - long micros = parseSecondNano(m.group(1)); - result = new CalendarInterval(0, micros); - break; - } - case "millisecond": - long millisecond = toLongWithRange("millisecond", m.group(1), - Long.MIN_VALUE / MICROS_PER_MILLI, Long.MAX_VALUE / MICROS_PER_MILLI); - result = new CalendarInterval(0, millisecond * MICROS_PER_MILLI); - break; - case "microsecond": { - long micros = Long.parseLong(m.group(1)); - result = new CalendarInterval(0, micros); - break; - } - } - } catch (Exception e) { - throw new IllegalArgumentException("Error parsing interval string: " + e.getMessage(), e); - } - } - return result; - } - - /** - * Parse second_nano string in ss.nnnnnnnnn format to microseconds - */ - public static long parseSecondNano(String secondNano) throws IllegalArgumentException 
{ - String[] parts = secondNano.split("\\."); - if (parts.length == 1) { - return toLongWithRange("second", parts[0], Long.MIN_VALUE / MICROS_PER_SECOND, - Long.MAX_VALUE / MICROS_PER_SECOND) * MICROS_PER_SECOND; - - } else if (parts.length == 2) { - long seconds = parts[0].equals("") ? 0L : toLongWithRange("second", parts[0], - Long.MIN_VALUE / MICROS_PER_SECOND, Long.MAX_VALUE / MICROS_PER_SECOND); - long nanos = toLongWithRange("nanosecond", parts[1], 0L, 999999999L); - return seconds * MICROS_PER_SECOND + nanos / 1000L; - - } else { - throw new IllegalArgumentException( - "Interval string does not match second-nano format of ss.nnnnnnnnn"); - } - } - + // NOTE: If you're moving or renaming this file, you should also update Unidoc configuration + // specified in 'SparkBuild.scala'. public final int months; + public final int days; public final long microseconds; - public long milliseconds() { - return this.microseconds / MICROS_PER_MILLI; - } - - public CalendarInterval(int months, long microseconds) { + // CalendarInterval is represented by months, days and microseconds. Months and days are not + // units of time with a constant length (unlike hours, seconds), so they are two separated fields + // from microseconds. 
One month may be equal to 29, 30 or 31 days and one day may be equal to + // 23, 24 or 25 hours (daylight saving) + public CalendarInterval(int months, int days, long microseconds) { this.months = months; + this.days = days; this.microseconds = microseconds; } - public CalendarInterval add(CalendarInterval that) { - int months = this.months + that.months; - long microseconds = this.microseconds + that.microseconds; - return new CalendarInterval(months, microseconds); - } - - public CalendarInterval subtract(CalendarInterval that) { - int months = this.months - that.months; - long microseconds = this.microseconds - that.microseconds; - return new CalendarInterval(months, microseconds); - } - - public CalendarInterval negate() { - return new CalendarInterval(-this.months, -this.microseconds); - } - @Override - public boolean equals(Object other) { - if (this == other) return true; - if (other == null || !(other instanceof CalendarInterval)) return false; - - CalendarInterval o = (CalendarInterval) other; - return this.months == o.months && this.microseconds == o.microseconds; + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + CalendarInterval that = (CalendarInterval) o; + return months == that.months && + days == that.days && + microseconds == that.microseconds; } @Override public int hashCode() { - return 31 * months + (int) microseconds; + return Objects.hash(months, days, microseconds); } @Override public String toString() { - StringBuilder sb = new StringBuilder("interval"); + if (months == 0 && days == 0 && microseconds == 0) { + return "0 seconds"; + } + + StringBuilder sb = new StringBuilder(); if (months != 0) { - appendUnit(sb, months / 12, "year"); - appendUnit(sb, months % 12, "month"); + appendUnit(sb, months / 12, "years"); + appendUnit(sb, months % 12, "months"); } + appendUnit(sb, days, "days"); + if (microseconds != 0) { long rest = microseconds; - appendUnit(sb, rest / 
MICROS_PER_WEEK, "week"); - rest %= MICROS_PER_WEEK; - appendUnit(sb, rest / MICROS_PER_DAY, "day"); - rest %= MICROS_PER_DAY; - appendUnit(sb, rest / MICROS_PER_HOUR, "hour"); + appendUnit(sb, rest / MICROS_PER_HOUR, "hours"); rest %= MICROS_PER_HOUR; - appendUnit(sb, rest / MICROS_PER_MINUTE, "minute"); + appendUnit(sb, rest / MICROS_PER_MINUTE, "minutes"); rest %= MICROS_PER_MINUTE; - appendUnit(sb, rest / MICROS_PER_SECOND, "second"); - rest %= MICROS_PER_SECOND; - appendUnit(sb, rest / MICROS_PER_MILLI, "millisecond"); - rest %= MICROS_PER_MILLI; - appendUnit(sb, rest, "microsecond"); - } else if (months == 0) { - sb.append(" 0 microseconds"); + if (rest != 0) { + String s = BigDecimal.valueOf(rest, 6).stripTrailingZeros().toPlainString(); + sb.append(s).append(" seconds "); + } } + sb.setLength(sb.length() - 1); return sb.toString(); } private void appendUnit(StringBuilder sb, long value, String unit) { if (value != 0) { - sb.append(' ').append(value).append(' ').append(unit).append('s'); + sb.append(value).append(' ').append(unit).append(' '); } } + + /** + * Extracts the date part of the interval. + * @return an instance of {@code java.time.Period} based on the months and days fields + * of the given interval, not null. + */ + public Period extractAsPeriod() { return Period.of(0, months, days); } + + /** + * Extracts the time part of the interval. + * @return an instance of {@code java.time.Duration} based on the microseconds field + * of the given interval, not null. 
+ * @throws ArithmeticException if a numeric overflow occurs + */ + public Duration extractAsDuration() { return Duration.of(microseconds, ChronoUnit.MICROS); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 30b884c5fa9c6..c5384669eb922 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -370,7 +370,7 @@ private byte getByte(int i) { return Platform.getByte(base, offset + i); } - private boolean matchAt(final UTF8String s, int pos) { + public boolean matchAt(final UTF8String s, int pos) { if (s.numBytes + pos > numBytes || pos < 0) { return false; } @@ -538,14 +538,42 @@ private UTF8String copyUTF8String(int start, int end) { public UTF8String trim() { int s = 0; // skip all of the space (0x20) in the left side - while (s < this.numBytes && getByte(s) == 0x20) s++; + while (s < this.numBytes && getByte(s) == ' ') s++; if (s == this.numBytes) { // Everything trimmed return EMPTY_UTF8; } // skip all of the space (0x20) in the right side int e = this.numBytes - 1; - while (e > s && getByte(e) == 0x20) e--; + while (e > s && getByte(e) == ' ') e--; + if (s == 0 && e == numBytes - 1) { + // Nothing trimmed + return this; + } + return copyUTF8String(s, e); + } + + /** + * Trims whitespaces (<= ASCII 32) from both ends of this string. + * + * Note that, this method is the same as java's {@link String#trim}, and different from + * {@link UTF8String#trim()} which remove only spaces(= ASCII 32) from both ends. + * + * @return A UTF8String whose value is this UTF8String, with any leading and trailing white + * space removed, or this UTF8String if it has no leading or trailing whitespace. 
+ * + */ + public UTF8String trimAll() { + int s = 0; + // skip all of the whitespaces (<=0x20) in the left side + while (s < this.numBytes && getByte(s) <= ' ') s++; + if (s == this.numBytes) { + // Everything trimmed + return EMPTY_UTF8; + } + // skip all of the whitespaces (<=0x20) in the right side + int e = this.numBytes - 1; + while (e > s && getByte(e) <= ' ') e--; if (s == 0 && e == numBytes - 1) { // Nothing trimmed return this; @@ -1063,7 +1091,7 @@ public static class IntWrapper implements Serializable { } /** - * Parses this UTF8String to long. + * Parses this UTF8String(trimmed if needed) to long. * * Note that, in this method we accumulate the result in negative format, and convert it to * positive format at the end, if this string is not started with '-'. This is because min value @@ -1077,18 +1105,20 @@ public static class IntWrapper implements Serializable { * @return true if the parsing was successful else false */ public boolean toLong(LongWrapper toLongResult) { - if (numBytes == 0) { - return false; - } + int offset = 0; + while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + if (offset == this.numBytes) return false; - byte b = getByte(0); + int end = this.numBytes - 1; + while (end > offset && getByte(end) <= ' ') end--; + + byte b = getByte(offset); final boolean negative = b == '-'; - int offset = 0; if (negative || b == '+') { - offset++; - if (numBytes == 1) { + if (end - offset == 0) { return false; } + offset++; } final byte separator = '.'; @@ -1096,7 +1126,7 @@ public boolean toLong(LongWrapper toLongResult) { final long stopValue = Long.MIN_VALUE / radix; long result = 0; - while (offset < numBytes) { + while (offset <= end) { b = getByte(offset); offset++; if (b == separator) { @@ -1131,7 +1161,7 @@ public boolean toLong(LongWrapper toLongResult) { // This is the case when we've encountered a decimal separator. 
The fractional // part will not change the number, but we will verify that the fractional part // is well formed. - while (offset < numBytes) { + while (offset <= end) { byte currentByte = getByte(offset); if (currentByte < '0' || currentByte > '9') { return false; @@ -1151,7 +1181,7 @@ public boolean toLong(LongWrapper toLongResult) { } /** - * Parses this UTF8String to int. + * Parses this UTF8String(trimmed if needed) to int. * * Note that, in this method we accumulate the result in negative format, and convert it to * positive format at the end, if this string is not started with '-'. This is because min value @@ -1168,18 +1198,20 @@ public boolean toLong(LongWrapper toLongResult) { * @return true if the parsing was successful else false */ public boolean toInt(IntWrapper intWrapper) { - if (numBytes == 0) { - return false; - } + int offset = 0; + while (offset < this.numBytes && getByte(offset) <= ' ') offset++; + if (offset == this.numBytes) return false; - byte b = getByte(0); + int end = this.numBytes - 1; + while (end > offset && getByte(end) <= ' ') end--; + + byte b = getByte(offset); final boolean negative = b == '-'; - int offset = 0; if (negative || b == '+') { - offset++; - if (numBytes == 1) { + if (end - offset == 0) { return false; } + offset++; } final byte separator = '.'; @@ -1187,7 +1219,7 @@ public boolean toInt(IntWrapper intWrapper) { final int stopValue = Integer.MIN_VALUE / radix; int result = 0; - while (offset < numBytes) { + while (offset <= end) { b = getByte(offset); offset++; if (b == separator) { @@ -1222,7 +1254,7 @@ public boolean toInt(IntWrapper intWrapper) { // This is the case when we've encountered a decimal separator. The fractional // part will not change the number, but we will verify that the fractional part // is well formed. 
- while (offset < numBytes) { + while (offset <= end) { byte currentByte = getByte(offset); if (currentByte < '0' || currentByte > '9') { return false; @@ -1262,6 +1294,52 @@ public boolean toByte(IntWrapper intWrapper) { return false; } + /** + * Parses UTF8String(trimmed if needed) to long. This method is used when ANSI is enabled. + * + * @return If string contains valid numeric value then it returns the long value otherwise a + * NumberFormatException is thrown. + */ + public long toLongExact() { + LongWrapper result = new LongWrapper(); + if (toLong(result)) { + return result.value; + } + throw new NumberFormatException("invalid input syntax for type numeric: " + this); + } + + /** + * Parses UTF8String(trimmed if needed) to int. This method is used when ANSI is enabled. + * + * @return If string contains valid numeric value then it returns the int value otherwise a + * NumberFormatException is thrown. + */ + public int toIntExact() { + IntWrapper result = new IntWrapper(); + if (toInt(result)) { + return result.value; + } + throw new NumberFormatException("invalid input syntax for type numeric: " + this); + } + + public short toShortExact() { + int value = this.toIntExact(); + short result = (short) value; + if (result == value) { + return result; + } + throw new NumberFormatException("invalid input syntax for type numeric: " + this); + } + + public byte toByteExact() { + int value = this.toIntExact(); + byte result = (byte) value; + if (result == value) { + return result; + } + throw new NumberFormatException("invalid input syntax for type numeric: " + this); + } + @Override public String toString() { return new String(getBytes(), StandardCharsets.UTF_8); diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 3ad9ac7b4de9c..19e4182b38a4e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ 
b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -114,25 +114,25 @@ public void memoryDebugFillEnabledInTest() { Assert.assertTrue(MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED); MemoryBlock onheap = MemoryAllocator.HEAP.allocate(1); Assert.assertEquals( - Platform.getByte(onheap.getBaseObject(), onheap.getBaseOffset()), - MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE, + Platform.getByte(onheap.getBaseObject(), onheap.getBaseOffset())); MemoryBlock onheap1 = MemoryAllocator.HEAP.allocate(1024 * 1024); Object onheap1BaseObject = onheap1.getBaseObject(); long onheap1BaseOffset = onheap1.getBaseOffset(); MemoryAllocator.HEAP.free(onheap1); Assert.assertEquals( - Platform.getByte(onheap1BaseObject, onheap1BaseOffset), - MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); + MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE, + Platform.getByte(onheap1BaseObject, onheap1BaseOffset)); MemoryBlock onheap2 = MemoryAllocator.HEAP.allocate(1024 * 1024); Assert.assertEquals( - Platform.getByte(onheap2.getBaseObject(), onheap2.getBaseOffset()), - MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE, + Platform.getByte(onheap2.getBaseObject(), onheap2.getBaseOffset())); MemoryBlock offheap = MemoryAllocator.UNSAFE.allocate(1); Assert.assertEquals( - Platform.getByte(offheap.getBaseObject(), offheap.getBaseOffset()), - MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); + MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE, + Platform.getByte(offheap.getBaseObject(), offheap.getBaseOffset())); MemoryAllocator.UNSAFE.free(offheap); } @@ -150,11 +150,11 @@ public void heapMemoryReuse() { // The size is greater than `HeapMemoryAllocator.POOLING_THRESHOLD_BYTES`, // reuse the previous memory which has released. 
MemoryBlock onheap3 = heapMem.allocate(1024 * 1024 + 1); - Assert.assertEquals(onheap3.size(), 1024 * 1024 + 1); + Assert.assertEquals(1024 * 1024 + 1, onheap3.size()); Object obj3 = onheap3.getBaseObject(); heapMem.free(onheap3); MemoryBlock onheap4 = heapMem.allocate(1024 * 1024 + 7); - Assert.assertEquals(onheap4.size(), 1024 * 1024 + 7); + Assert.assertEquals(1024 * 1024 + 7, onheap4.size()); Assert.assertEquals(obj3, onheap4.getBaseObject()); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java index c307d74e0ba07..6397f26c02f3a 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java @@ -19,259 +19,67 @@ import org.junit.Test; +import java.time.Duration; +import java.time.Period; + import static org.junit.Assert.*; -import static org.apache.spark.unsafe.types.CalendarInterval.*; +import static org.apache.spark.sql.catalyst.util.DateTimeConstants.*; public class CalendarIntervalSuite { @Test public void equalsTest() { - CalendarInterval i1 = new CalendarInterval(3, 123); - CalendarInterval i2 = new CalendarInterval(3, 321); - CalendarInterval i3 = new CalendarInterval(1, 123); - CalendarInterval i4 = new CalendarInterval(3, 123); + CalendarInterval i1 = new CalendarInterval(3, 2, 123); + CalendarInterval i2 = new CalendarInterval(3, 2,321); + CalendarInterval i3 = new CalendarInterval(3, 4,123); + CalendarInterval i4 = new CalendarInterval(1, 2, 123); + CalendarInterval i5 = new CalendarInterval(1, 4, 321); + CalendarInterval i6 = new CalendarInterval(3, 2, 123); assertNotSame(i1, i2); assertNotSame(i1, i3); + assertNotSame(i1, i4); assertNotSame(i2, i3); - assertEquals(i1, i4); + assertNotSame(i2, i4); + assertNotSame(i3, i4); + assertNotSame(i1, i5); + assertEquals(i1, i6); } @Test public void 
toStringTest() { CalendarInterval i; - i = new CalendarInterval(0, 0); - assertEquals("interval 0 microseconds", i.toString()); - - i = new CalendarInterval(34, 0); - assertEquals("interval 2 years 10 months", i.toString()); - - i = new CalendarInterval(-34, 0); - assertEquals("interval -2 years -10 months", i.toString()); - - i = new CalendarInterval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123); - assertEquals("interval 3 weeks 13 hours 123 microseconds", i.toString()); - - i = new CalendarInterval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123); - assertEquals("interval -3 weeks -13 hours -123 microseconds", i.toString()); - - i = new CalendarInterval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123); - assertEquals("interval 2 years 10 months 3 weeks 13 hours 123 microseconds", i.toString()); - } - - @Test - public void fromStringTest() { - testSingleUnit("year", 3, 36, 0); - testSingleUnit("month", 3, 3, 0); - testSingleUnit("week", 3, 0, 3 * MICROS_PER_WEEK); - testSingleUnit("day", 3, 0, 3 * MICROS_PER_DAY); - testSingleUnit("hour", 3, 0, 3 * MICROS_PER_HOUR); - testSingleUnit("minute", 3, 0, 3 * MICROS_PER_MINUTE); - testSingleUnit("second", 3, 0, 3 * MICROS_PER_SECOND); - testSingleUnit("millisecond", 3, 0, 3 * MICROS_PER_MILLI); - testSingleUnit("microsecond", 3, 0, 3); - - String input; - - input = "interval -5 years 23 month"; - CalendarInterval result = new CalendarInterval(-5 * 12 + 23, 0); - assertEquals(fromString(input), result); - - input = "interval -5 years 23 month "; - assertEquals(fromString(input), result); - - input = " interval -5 years 23 month "; - assertEquals(fromString(input), result); - - // Error cases - input = "interval 3month 1 hour"; - assertNull(fromString(input)); - - input = "interval 3 moth 1 hour"; - assertNull(fromString(input)); - - input = "interval"; - assertNull(fromString(input)); - - input = "int"; - assertNull(fromString(input)); - - input = ""; - assertNull(fromString(input)); - - input = null; - 
assertNull(fromString(input)); - } - - @Test - public void fromCaseInsensitiveStringTest() { - for (String input : new String[]{"5 MINUTES", "5 minutes", "5 Minutes"}) { - assertEquals(fromCaseInsensitiveString(input), new CalendarInterval(0, 5L * 60 * 1_000_000)); - } - - for (String input : new String[]{null, "", " "}) { - try { - fromCaseInsensitiveString(input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("cannot be null or blank")); - } - } - - for (String input : new String[]{"interval", "interval1 day", "foo", "foo 1 day"}) { - try { - fromCaseInsensitiveString(input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("Invalid interval")); - } - } - } - - @Test - public void fromYearMonthStringTest() { - String input; - CalendarInterval i; - - input = "99-10"; - i = new CalendarInterval(99 * 12 + 10, 0L); - assertEquals(fromYearMonthString(input), i); + i = new CalendarInterval(0, 0, 0); + assertEquals("0 seconds", i.toString()); - input = "-8-10"; - i = new CalendarInterval(-8 * 12 - 10, 0L); - assertEquals(fromYearMonthString(input), i); + i = new CalendarInterval(34, 0, 0); + assertEquals("2 years 10 months", i.toString()); - try { - input = "99-15"; - fromYearMonthString(input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("month 15 outside range")); - } - } - - @Test - public void fromDayTimeStringTest() { - String input; - CalendarInterval i; + i = new CalendarInterval(-34, 0, 0); + assertEquals("-2 years -10 months", i.toString()); - input = "5 12:40:30.999999999"; - i = new CalendarInterval(0, 5 * MICROS_PER_DAY + 12 * MICROS_PER_HOUR + - 40 * MICROS_PER_MINUTE + 30 * MICROS_PER_SECOND + 999999L); - assertEquals(fromDayTimeString(input), i); + i = new 
CalendarInterval(0, 31, 0); + assertEquals("31 days", i.toString()); - input = "10 0:12:0.888"; - i = new CalendarInterval(0, 10 * MICROS_PER_DAY + 12 * MICROS_PER_MINUTE + - 888 * MICROS_PER_MILLI); - assertEquals(fromDayTimeString(input), i); + i = new CalendarInterval(0, -31, 0); + assertEquals("-31 days", i.toString()); - input = "-3 0:0:0"; - i = new CalendarInterval(0, -3 * MICROS_PER_DAY); - assertEquals(fromDayTimeString(input), i); + i = new CalendarInterval(0, 0, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123); + assertEquals("3 hours 13 minutes 0.000123 seconds", i.toString()); - try { - input = "5 30:12:20"; - fromDayTimeString(input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("hour 30 outside range")); - } + i = new CalendarInterval(0, 0, -3 * MICROS_PER_HOUR - 13 * MICROS_PER_MINUTE - 123); + assertEquals("-3 hours -13 minutes -0.000123 seconds", i.toString()); - try { - input = "5 30-12"; - fromDayTimeString(input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("not match day-time format")); - } - - try { - input = "5 1:12:20"; - fromDayTimeString(input, "hour", "microsecond"); - fail("Expected to throw an exception for the invalid convention type"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("Cannot support (interval")); - } + i = new CalendarInterval(34, 31, 3 * MICROS_PER_HOUR + 13 * MICROS_PER_MINUTE + 123); + assertEquals("2 years 10 months 31 days 3 hours 13 minutes 0.000123 seconds", + i.toString()); } @Test - public void fromSingleUnitStringTest() { - String input; - CalendarInterval i; - - input = "12"; - i = new CalendarInterval(12 * 12, 0L); - assertEquals(fromSingleUnitString("year", input), i); - - input = "100"; - i = new CalendarInterval(0, 100 * MICROS_PER_DAY); - assertEquals(fromSingleUnitString("day", 
input), i); - - input = "1999.38888"; - i = new CalendarInterval(0, 1999 * MICROS_PER_SECOND + 38); - assertEquals(fromSingleUnitString("second", input), i); - - try { - input = String.valueOf(Integer.MAX_VALUE); - fromSingleUnitString("year", input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("outside range")); - } - - try { - input = String.valueOf(Long.MAX_VALUE / MICROS_PER_HOUR + 1); - fromSingleUnitString("hour", input); - fail("Expected to throw an exception for the invalid input"); - } catch (IllegalArgumentException e) { - assertTrue(e.getMessage().contains("outside range")); - } - } - - @Test - public void addTest() { - String input = "interval 3 month 1 hour"; - String input2 = "interval 2 month 100 hour"; - - CalendarInterval interval = fromString(input); - CalendarInterval interval2 = fromString(input2); - - assertEquals(interval.add(interval2), new CalendarInterval(5, 101 * MICROS_PER_HOUR)); - - input = "interval -10 month -81 hour"; - input2 = "interval 75 month 200 hour"; - - interval = fromString(input); - interval2 = fromString(input2); - - assertEquals(interval.add(interval2), new CalendarInterval(65, 119 * MICROS_PER_HOUR)); - } - - @Test - public void subtractTest() { - String input = "interval 3 month 1 hour"; - String input2 = "interval 2 month 100 hour"; - - CalendarInterval interval = fromString(input); - CalendarInterval interval2 = fromString(input2); - - assertEquals(interval.subtract(interval2), new CalendarInterval(1, -99 * MICROS_PER_HOUR)); - - input = "interval -10 month -81 hour"; - input2 = "interval 75 month 200 hour"; - - interval = fromString(input); - interval2 = fromString(input2); - - assertEquals(interval.subtract(interval2), new CalendarInterval(-85, -281 * MICROS_PER_HOUR)); - } - - private static void testSingleUnit(String unit, int number, int months, long microseconds) { - String input1 = "interval " + number + " " + 
unit; - String input2 = "interval " + number + " " + unit + "s"; - CalendarInterval result = new CalendarInterval(months, microseconds); - assertEquals(fromString(input1), result); - assertEquals(fromString(input2), result); + public void periodAndDurationTest() { + CalendarInterval interval = new CalendarInterval(120, -40, 123456); + assertEquals(Period.of(0, 120, -40), interval.extractAsPeriod()); + assertEquals(Duration.ofNanos(123456000), interval.extractAsDuration()); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index cd253c0cbc904..8f933877f82e6 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -38,11 +38,11 @@ public class UTF8StringSuite { private static void checkBasic(String str, int len) { UTF8String s1 = fromString(str); UTF8String s2 = fromBytes(str.getBytes(StandardCharsets.UTF_8)); - assertEquals(s1.numChars(), len); - assertEquals(s2.numChars(), len); + assertEquals(len, s1.numChars()); + assertEquals(len, s2.numChars()); - assertEquals(s1.toString(), str); - assertEquals(s2.toString(), str); + assertEquals(str, s1.toString()); + assertEquals(str, s2.toString()); assertEquals(s1, s2); assertEquals(s1.hashCode(), s2.hashCode()); @@ -227,6 +227,7 @@ public void substring() { @Test public void trims() { assertEquals(fromString("1"), fromString("1").trim()); + assertEquals(fromString("1"), fromString("1\t").trimAll()); assertEquals(fromString("hello"), fromString(" hello ").trim()); assertEquals(fromString("hello "), fromString(" hello ").trimLeft()); @@ -375,20 +376,20 @@ public void pad() { @Test public void substringSQL() { UTF8String e = fromString("example"); - assertEquals(e.substringSQL(0, 2), fromString("ex")); - assertEquals(e.substringSQL(1, 2), fromString("ex")); - 
assertEquals(e.substringSQL(0, 7), fromString("example")); - assertEquals(e.substringSQL(1, 2), fromString("ex")); - assertEquals(e.substringSQL(0, 100), fromString("example")); - assertEquals(e.substringSQL(1, 100), fromString("example")); - assertEquals(e.substringSQL(2, 2), fromString("xa")); - assertEquals(e.substringSQL(1, 6), fromString("exampl")); - assertEquals(e.substringSQL(2, 100), fromString("xample")); - assertEquals(e.substringSQL(0, 0), fromString("")); - assertEquals(e.substringSQL(100, 4), EMPTY_UTF8); - assertEquals(e.substringSQL(0, Integer.MAX_VALUE), fromString("example")); - assertEquals(e.substringSQL(1, Integer.MAX_VALUE), fromString("example")); - assertEquals(e.substringSQL(2, Integer.MAX_VALUE), fromString("xample")); + assertEquals(fromString("ex"), e.substringSQL(0, 2)); + assertEquals(fromString("ex"), e.substringSQL(1, 2)); + assertEquals(fromString("example"), e.substringSQL(0, 7)); + assertEquals(fromString("ex"), e.substringSQL(1, 2)); + assertEquals(fromString("example"), e.substringSQL(0, 100)); + assertEquals(fromString("example"), e.substringSQL(1, 100)); + assertEquals(fromString("xa"), e.substringSQL(2, 2)); + assertEquals(fromString("exampl"), e.substringSQL(1, 6)); + assertEquals(fromString("xample"), e.substringSQL(2, 100)); + assertEquals(fromString(""), e.substringSQL(0, 0)); + assertEquals(EMPTY_UTF8, e.substringSQL(100, 4)); + assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE)); + assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE)); + assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE)); } @Test @@ -506,50 +507,50 @@ public void findInSet() { @Test public void soundex() { - assertEquals(fromString("Robert").soundex(), fromString("R163")); - assertEquals(fromString("Rupert").soundex(), fromString("R163")); - assertEquals(fromString("Rubin").soundex(), fromString("R150")); - assertEquals(fromString("Ashcraft").soundex(), fromString("A261")); - 
assertEquals(fromString("Ashcroft").soundex(), fromString("A261")); - assertEquals(fromString("Burroughs").soundex(), fromString("B620")); - assertEquals(fromString("Burrows").soundex(), fromString("B620")); - assertEquals(fromString("Ekzampul").soundex(), fromString("E251")); - assertEquals(fromString("Example").soundex(), fromString("E251")); - assertEquals(fromString("Ellery").soundex(), fromString("E460")); - assertEquals(fromString("Euler").soundex(), fromString("E460")); - assertEquals(fromString("Ghosh").soundex(), fromString("G200")); - assertEquals(fromString("Gauss").soundex(), fromString("G200")); - assertEquals(fromString("Gutierrez").soundex(), fromString("G362")); - assertEquals(fromString("Heilbronn").soundex(), fromString("H416")); - assertEquals(fromString("Hilbert").soundex(), fromString("H416")); - assertEquals(fromString("Jackson").soundex(), fromString("J250")); - assertEquals(fromString("Kant").soundex(), fromString("K530")); - assertEquals(fromString("Knuth").soundex(), fromString("K530")); - assertEquals(fromString("Lee").soundex(), fromString("L000")); - assertEquals(fromString("Lukasiewicz").soundex(), fromString("L222")); - assertEquals(fromString("Lissajous").soundex(), fromString("L222")); - assertEquals(fromString("Ladd").soundex(), fromString("L300")); - assertEquals(fromString("Lloyd").soundex(), fromString("L300")); - assertEquals(fromString("Moses").soundex(), fromString("M220")); - assertEquals(fromString("O'Hara").soundex(), fromString("O600")); - assertEquals(fromString("Pfister").soundex(), fromString("P236")); - assertEquals(fromString("Rubin").soundex(), fromString("R150")); - assertEquals(fromString("Robert").soundex(), fromString("R163")); - assertEquals(fromString("Rupert").soundex(), fromString("R163")); - assertEquals(fromString("Soundex").soundex(), fromString("S532")); - assertEquals(fromString("Sownteks").soundex(), fromString("S532")); - assertEquals(fromString("Tymczak").soundex(), fromString("T522")); - 
assertEquals(fromString("VanDeusen").soundex(), fromString("V532")); - assertEquals(fromString("Washington").soundex(), fromString("W252")); - assertEquals(fromString("Wheaton").soundex(), fromString("W350")); - - assertEquals(fromString("a").soundex(), fromString("A000")); - assertEquals(fromString("ab").soundex(), fromString("A100")); - assertEquals(fromString("abc").soundex(), fromString("A120")); - assertEquals(fromString("abcd").soundex(), fromString("A123")); - assertEquals(fromString("").soundex(), fromString("")); - assertEquals(fromString("123").soundex(), fromString("123")); - assertEquals(fromString("世界千世").soundex(), fromString("世界千世")); + assertEquals(fromString("R163"), fromString("Robert").soundex()); + assertEquals(fromString("R163"), fromString("Rupert").soundex()); + assertEquals(fromString("R150"), fromString("Rubin").soundex()); + assertEquals(fromString("A261"), fromString("Ashcraft").soundex()); + assertEquals(fromString("A261"), fromString("Ashcroft").soundex()); + assertEquals(fromString("B620"), fromString("Burroughs").soundex()); + assertEquals(fromString("B620"), fromString("Burrows").soundex()); + assertEquals(fromString("E251"), fromString("Ekzampul").soundex()); + assertEquals(fromString("E251"), fromString("Example").soundex()); + assertEquals(fromString("E460"), fromString("Ellery").soundex()); + assertEquals(fromString("E460"), fromString("Euler").soundex()); + assertEquals(fromString("G200"), fromString("Ghosh").soundex()); + assertEquals(fromString("G200"), fromString("Gauss").soundex()); + assertEquals(fromString("G362"), fromString("Gutierrez").soundex()); + assertEquals(fromString("H416"), fromString("Heilbronn").soundex()); + assertEquals(fromString("H416"), fromString("Hilbert").soundex()); + assertEquals(fromString("J250"), fromString("Jackson").soundex()); + assertEquals(fromString("K530"), fromString("Kant").soundex()); + assertEquals(fromString("K530"), fromString("Knuth").soundex()); + assertEquals(fromString("L000"), 
fromString("Lee").soundex()); + assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex()); + assertEquals(fromString("L222"), fromString("Lissajous").soundex()); + assertEquals(fromString("L300"), fromString("Ladd").soundex()); + assertEquals(fromString("L300"), fromString("Lloyd").soundex()); + assertEquals(fromString("M220"), fromString("Moses").soundex()); + assertEquals(fromString("O600"), fromString("O'Hara").soundex()); + assertEquals(fromString("P236"), fromString("Pfister").soundex()); + assertEquals(fromString("R150"), fromString("Rubin").soundex()); + assertEquals(fromString("R163"), fromString("Robert").soundex()); + assertEquals(fromString("R163"), fromString("Rupert").soundex()); + assertEquals(fromString("S532"), fromString("Soundex").soundex()); + assertEquals(fromString("S532"), fromString("Sownteks").soundex()); + assertEquals(fromString("T522"), fromString("Tymczak").soundex()); + assertEquals(fromString("V532"), fromString("VanDeusen").soundex()); + assertEquals(fromString("W252"), fromString("Washington").soundex()); + assertEquals(fromString("W350"), fromString("Wheaton").soundex()); + + assertEquals(fromString("A000"), fromString("a").soundex()); + assertEquals(fromString("A100"), fromString("ab").soundex()); + assertEquals(fromString("A120"), fromString("abc").soundex()); + assertEquals(fromString("A123"), fromString("abcd").soundex()); + assertEquals(fromString(""), fromString("").soundex()); + assertEquals(fromString("123"), fromString("123").soundex()); + assertEquals(fromString("世界千世"), fromString("世界千世").soundex()); } @Test @@ -849,7 +850,7 @@ public void skipWrongFirstByte() { for (int i = 0; i < wrongFirstBytes.length; ++i) { c[0] = (byte)wrongFirstBytes[i]; - assertEquals(fromBytes(c).numChars(), 1); + assertEquals(1, fromBytes(c).numChars()); } } } diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala 
b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index fdb81a06d41c9..72aa682bb95bc 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.unsafe.types import org.apache.commons.text.similarity.LevenshteinDistance import org.scalacheck.{Arbitrary, Gen} -import org.scalatest.prop.GeneratorDrivenPropertyChecks +import org.scalatestplus.scalacheck.ScalaCheckDrivenPropertyChecks // scalastyle:off import org.scalatest.{FunSuite, Matchers} @@ -28,7 +28,7 @@ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8} /** * This TestSuite utilize ScalaCheck to generate randomized inputs for UTF8String testing. */ -class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers { +class UTF8StringPropertyCheckSuite extends FunSuite with ScalaCheckDrivenPropertyChecks with Matchers { // scalastyle:on test("toString") { diff --git a/conf/metrics.properties.template b/conf/metrics.properties.template index da0b06d295252..f52d33fd64223 100644 --- a/conf/metrics.properties.template +++ b/conf/metrics.properties.template @@ -113,6 +113,15 @@ # /metrics/applications/json # App information # /metrics/master/json # Master information +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. 
The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + # org.apache.spark.metrics.sink.GraphiteSink # Name: Default: Description: # host NONE Hostname of the Graphite server, must be set @@ -192,4 +201,10 @@ #driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource -#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource \ No newline at end of file +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus +#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..e944111ff9e93 --- /dev/null +++ b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt @@ -0,0 +1,40 @@ +================================================================================================ +Coalesced RDD , large scale +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Coalesce Num Partitions: 100 Num Hosts: 1 344 360 14 0.3 3441.4 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 283 301 22 0.4 2825.1 1.2X +Coalesce Num Partitions: 100 Num Hosts: 10 270 271 2 0.4 2700.5 1.3X +Coalesce Num Partitions: 100 Num Hosts: 20 272 273 1 0.4 2721.1 1.3X +Coalesce Num Partitions: 100 Num Hosts: 40 
271 272 1 0.4 2710.0 1.3X +Coalesce Num Partitions: 100 Num Hosts: 80 266 267 2 0.4 2656.3 1.3X +Coalesce Num Partitions: 500 Num Hosts: 1 609 619 15 0.2 6089.0 0.6X +Coalesce Num Partitions: 500 Num Hosts: 5 338 343 6 0.3 3383.0 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 303 306 3 0.3 3029.4 1.1X +Coalesce Num Partitions: 500 Num Hosts: 20 286 288 2 0.4 2855.9 1.2X +Coalesce Num Partitions: 500 Num Hosts: 40 279 282 4 0.4 2793.3 1.2X +Coalesce Num Partitions: 500 Num Hosts: 80 273 275 3 0.4 2725.9 1.3X +Coalesce Num Partitions: 1000 Num Hosts: 1 951 955 4 0.1 9514.1 0.4X +Coalesce Num Partitions: 1000 Num Hosts: 5 421 429 8 0.2 4211.3 0.8X +Coalesce Num Partitions: 1000 Num Hosts: 10 347 352 4 0.3 3473.5 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 309 312 5 0.3 3087.5 1.1X +Coalesce Num Partitions: 1000 Num Hosts: 40 290 294 6 0.3 2896.4 1.2X +Coalesce Num Partitions: 1000 Num Hosts: 80 281 286 5 0.4 2811.3 1.2X +Coalesce Num Partitions: 5000 Num Hosts: 1 3928 3950 27 0.0 39278.0 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 1373 1389 27 0.1 13725.2 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 10 812 827 13 0.1 8123.3 0.4X +Coalesce Num Partitions: 5000 Num Hosts: 20 530 540 9 0.2 5299.1 0.6X +Coalesce Num Partitions: 5000 Num Hosts: 40 421 425 5 0.2 4210.5 0.8X +Coalesce Num Partitions: 5000 Num Hosts: 80 335 344 12 0.3 3353.7 1.0X +Coalesce Num Partitions: 10000 Num Hosts: 1 7116 7120 4 0.0 71159.0 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 2539 2598 51 0.0 25390.1 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 1393 1432 34 0.1 13928.1 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 20 833 1009 303 0.1 8329.2 0.4X +Coalesce Num Partitions: 10000 Num Hosts: 40 562 563 3 0.2 5615.2 0.6X +Coalesce Num Partitions: 10000 Num Hosts: 80 420 426 7 0.2 4204.0 0.8X + + diff --git a/core/benchmarks/CoalescedRDDBenchmark-results.txt b/core/benchmarks/CoalescedRDDBenchmark-results.txt index dd63b0adea4f2..f1b867951a074 100644 --- 
a/core/benchmarks/CoalescedRDDBenchmark-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_201-b09 on Windows 10 10.0 -Intel64 Family 6 Model 63 Stepping 2, GenuineIntel +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Coalesce Num Partitions: 100 Num Hosts: 1 346 364 24 0.3 3458.9 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 258 264 6 0.4 2579.0 1.3X -Coalesce Num Partitions: 100 Num Hosts: 10 242 249 7 0.4 2415.2 1.4X -Coalesce Num Partitions: 100 Num Hosts: 20 237 242 7 0.4 2371.7 1.5X -Coalesce Num Partitions: 100 Num Hosts: 40 230 231 1 0.4 2299.8 1.5X -Coalesce Num Partitions: 100 Num Hosts: 80 222 233 14 0.4 2223.0 1.6X -Coalesce Num Partitions: 500 Num Hosts: 1 659 665 5 0.2 6590.4 0.5X -Coalesce Num Partitions: 500 Num Hosts: 5 340 381 47 0.3 3395.2 1.0X -Coalesce Num Partitions: 500 Num Hosts: 10 279 307 47 0.4 2788.3 1.2X -Coalesce Num Partitions: 500 Num Hosts: 20 259 261 2 0.4 2591.9 1.3X -Coalesce Num Partitions: 500 Num Hosts: 40 241 250 15 0.4 2406.5 1.4X -Coalesce Num Partitions: 500 Num Hosts: 80 235 237 3 0.4 2349.9 1.5X -Coalesce Num Partitions: 1000 Num Hosts: 1 1050 1053 4 0.1 10503.2 0.3X -Coalesce Num Partitions: 1000 Num Hosts: 5 405 407 2 0.2 4049.5 0.9X -Coalesce Num Partitions: 1000 Num Hosts: 10 320 322 2 0.3 3202.7 1.1X -Coalesce Num Partitions: 1000 Num Hosts: 20 276 277 0 0.4 2762.3 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 40 257 260 5 0.4 2571.2 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 80 245 252 13 0.4 2448.9 1.4X -Coalesce Num Partitions: 5000 Num 
Hosts: 1 3099 3145 55 0.0 30988.6 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 1037 1050 20 0.1 10374.4 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 10 626 633 8 0.2 6261.8 0.6X -Coalesce Num Partitions: 5000 Num Hosts: 20 426 431 5 0.2 4258.6 0.8X -Coalesce Num Partitions: 5000 Num Hosts: 40 328 341 22 0.3 3275.4 1.1X -Coalesce Num Partitions: 5000 Num Hosts: 80 272 275 4 0.4 2721.4 1.3X -Coalesce Num Partitions: 10000 Num Hosts: 1 5516 5526 9 0.0 55156.8 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 5 1956 1992 48 0.1 19560.9 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 10 1045 1057 18 0.1 10447.4 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 20 637 658 24 0.2 6373.2 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 40 431 448 15 0.2 4312.9 0.8X -Coalesce Num Partitions: 10000 Num Hosts: 80 326 328 2 0.3 3263.4 1.1X +Coalesce Num Partitions: 100 Num Hosts: 1 395 401 9 0.3 3952.3 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 296 344 42 0.3 2963.2 1.3X +Coalesce Num Partitions: 100 Num Hosts: 10 294 308 15 0.3 2941.7 1.3X +Coalesce Num Partitions: 100 Num Hosts: 20 316 328 13 0.3 3155.2 1.3X +Coalesce Num Partitions: 100 Num Hosts: 40 294 316 36 0.3 2940.3 1.3X +Coalesce Num Partitions: 100 Num Hosts: 80 292 324 30 0.3 2922.2 1.4X +Coalesce Num Partitions: 500 Num Hosts: 1 629 687 61 0.2 6292.4 0.6X +Coalesce Num Partitions: 500 Num Hosts: 5 354 378 42 0.3 3541.7 1.1X +Coalesce Num Partitions: 500 Num Hosts: 10 318 338 29 0.3 3179.8 1.2X +Coalesce Num Partitions: 500 Num Hosts: 20 306 317 11 0.3 3059.2 1.3X +Coalesce Num Partitions: 500 Num Hosts: 40 294 311 28 0.3 2941.6 1.3X +Coalesce Num Partitions: 500 Num Hosts: 80 288 309 34 0.3 2883.9 1.4X +Coalesce Num Partitions: 1000 Num Hosts: 1 956 978 20 0.1 9562.2 0.4X +Coalesce Num Partitions: 1000 Num Hosts: 5 431 452 36 0.2 4306.2 0.9X +Coalesce Num Partitions: 1000 Num Hosts: 10 358 379 23 0.3 3581.1 1.1X +Coalesce Num Partitions: 1000 Num Hosts: 20 324 347 20 0.3 3236.7 1.2X +Coalesce Num Partitions: 
1000 Num Hosts: 40 312 333 20 0.3 3116.8 1.3X +Coalesce Num Partitions: 1000 Num Hosts: 80 307 342 32 0.3 3068.4 1.3X +Coalesce Num Partitions: 5000 Num Hosts: 1 3895 3906 12 0.0 38946.8 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 1388 1401 19 0.1 13881.7 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 10 806 839 57 0.1 8063.7 0.5X +Coalesce Num Partitions: 5000 Num Hosts: 20 546 573 44 0.2 5462.6 0.7X +Coalesce Num Partitions: 5000 Num Hosts: 40 413 418 5 0.2 4134.7 1.0X +Coalesce Num Partitions: 5000 Num Hosts: 80 345 365 23 0.3 3448.1 1.1X +Coalesce Num Partitions: 10000 Num Hosts: 1 6933 6966 55 0.0 69328.8 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 5 2455 2499 69 0.0 24551.7 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 10 1352 1392 34 0.1 13520.2 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 20 815 853 50 0.1 8147.5 0.5X +Coalesce Num Partitions: 10000 Num Hosts: 40 558 581 28 0.2 5578.0 0.7X +Coalesce Num Partitions: 10000 Num Hosts: 80 416 423 5 0.2 4163.3 0.9X diff --git a/core/benchmarks/KryoBenchmark-jdk11-results.txt b/core/benchmarks/KryoBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..27f0b8f59f47a --- /dev/null +++ b/core/benchmarks/KryoBenchmark-jdk11-results.txt @@ -0,0 +1,28 @@ +================================================================================================ +Benchmark Kryo Unsafe vs safe Serialization +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +basicTypes: Int with unsafe:true 275 288 14 3.6 275.2 1.0X +basicTypes: Long with unsafe:true 331 336 13 3.0 330.9 0.8X +basicTypes: Float with 
unsafe:true 304 305 1 3.3 304.4 0.9X +basicTypes: Double with unsafe:true 328 332 3 3.0 328.1 0.8X +Array: Int with unsafe:true 4 4 0 252.8 4.0 69.6X +Array: Long with unsafe:true 6 6 0 161.5 6.2 44.5X +Array: Float with unsafe:true 4 4 0 264.6 3.8 72.8X +Array: Double with unsafe:true 6 7 0 160.5 6.2 44.2X +Map of string->Double with unsafe:true 52 52 0 19.3 51.8 5.3X +basicTypes: Int with unsafe:false 344 345 1 2.9 344.3 0.8X +basicTypes: Long with unsafe:false 372 373 1 2.7 372.3 0.7X +basicTypes: Float with unsafe:false 333 334 1 3.0 333.4 0.8X +basicTypes: Double with unsafe:false 344 345 0 2.9 344.3 0.8X +Array: Int with unsafe:false 25 25 0 40.8 24.5 11.2X +Array: Long with unsafe:false 37 37 1 27.3 36.7 7.5X +Array: Float with unsafe:false 11 11 0 92.1 10.9 25.4X +Array: Double with unsafe:false 17 18 0 58.3 17.2 16.0X +Map of string->Double with unsafe:false 51 52 1 19.4 51.5 5.3X + + diff --git a/core/benchmarks/KryoBenchmark-results.txt b/core/benchmarks/KryoBenchmark-results.txt index 91e22f3afc14f..49791e6e87e3a 100644 --- a/core/benchmarks/KryoBenchmark-results.txt +++ b/core/benchmarks/KryoBenchmark-results.txt @@ -2,28 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_131-b11 on Mac OS X 10.13.6 -Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz - -Benchmark Kryo Unsafe vs safe Serialization: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 138 / 149 7.2 138.0 1.0X -basicTypes: Long with unsafe:true 168 / 173 6.0 167.7 0.8X -basicTypes: Float with unsafe:true 153 / 174 6.5 153.1 0.9X -basicTypes: Double with unsafe:true 161 / 185 6.2 161.1 0.9X -Array: Int with unsafe:true 2 / 3 409.7 2.4 56.5X -Array: Long with unsafe:true 4 / 5 232.5 4.3 32.1X -Array: Float with unsafe:true 3 / 4 367.3 
2.7 50.7X -Array: Double with unsafe:true 4 / 5 228.5 4.4 31.5X -Map of string->Double with unsafe:true 38 / 45 26.5 37.8 3.7X -basicTypes: Int with unsafe:false 176 / 187 5.7 175.9 0.8X -basicTypes: Long with unsafe:false 191 / 203 5.2 191.2 0.7X -basicTypes: Float with unsafe:false 166 / 176 6.0 166.2 0.8X -basicTypes: Double with unsafe:false 174 / 190 5.7 174.3 0.8X -Array: Int with unsafe:false 19 / 26 52.9 18.9 7.3X -Array: Long with unsafe:false 27 / 31 37.7 26.5 5.2X -Array: Float with unsafe:false 8 / 10 124.3 8.0 17.2X -Array: Double with unsafe:false 12 / 13 83.6 12.0 11.5X -Map of string->Double with unsafe:false 38 / 42 26.1 38.3 3.6X +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +basicTypes: Int with unsafe:true 269 290 23 3.7 269.0 1.0X +basicTypes: Long with unsafe:true 294 295 1 3.4 293.8 0.9X +basicTypes: Float with unsafe:true 300 301 1 3.3 300.4 0.9X +basicTypes: Double with unsafe:true 304 305 1 3.3 304.0 0.9X +Array: Int with unsafe:true 5 6 1 193.5 5.2 52.0X +Array: Long with unsafe:true 8 9 1 131.2 7.6 35.3X +Array: Float with unsafe:true 6 6 0 163.5 6.1 44.0X +Array: Double with unsafe:true 9 10 0 108.8 9.2 29.3X +Map of string->Double with unsafe:true 54 54 1 18.7 53.6 5.0X +basicTypes: Int with unsafe:false 326 327 1 3.1 326.2 0.8X +basicTypes: Long with unsafe:false 353 354 1 2.8 353.3 0.8X +basicTypes: Float with unsafe:false 325 327 1 3.1 325.1 0.8X +basicTypes: Double with unsafe:false 335 336 1 3.0 335.0 0.8X +Array: Int with unsafe:false 27 28 1 36.7 27.2 9.9X +Array: Long with unsafe:false 40 41 1 25.0 40.0 6.7X +Array: Float with unsafe:false 12 13 1 80.8 12.4 21.7X +Array: Double with unsafe:false 21 21 1 48.6 20.6 13.1X +Map 
of string->Double with unsafe:false 56 57 1 17.8 56.1 4.8X diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..6b148bde12d36 --- /dev/null +++ b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt @@ -0,0 +1,12 @@ +================================================================================================ +Benchmark KryoPool vs old"pool of 1" implementation +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +KryoPool:true 6208 8374 NaN 0.0 12416876.6 1.0X +KryoPool:false 9084 11577 724 0.0 18168947.4 0.7X + + diff --git a/core/benchmarks/KryoSerializerBenchmark-results.txt b/core/benchmarks/KryoSerializerBenchmark-results.txt index c3ce336d93241..609f3298cbc00 100644 --- a/core/benchmarks/KryoSerializerBenchmark-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-results.txt @@ -1,12 +1,12 @@ ================================================================================================ -Benchmark KryoPool vs "pool of 1" +Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -Java HotSpot(TM) 64-Bit Server VM 1.8.0_131-b11 on Mac OS X 10.14 -Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz -Benchmark KryoPool vs "pool of 1": Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -KryoPool:true 2682 / 3425 0.0 5364627.9 1.0X -KryoPool:false 8176 / 
9292 0.0 16351252.2 0.3X +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +KryoPool:true 6012 7586 NaN 0.0 12023020.2 1.0X +KryoPool:false 9289 11566 909 0.0 18578683.1 0.6X diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..db23cf5c12ea7 --- /dev/null +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt @@ -0,0 +1,66 @@ +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 170 178 9 1.2 849.7 1.0X +Deserialization 530 535 9 0.4 2651.1 0.3X + +Compressed Serialized MapStatus sizes: 411 bytes +Compressed Serialized Broadcast MapStatus sizes: 2 MB + + +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 157 165 7 1.3 785.4 1.0X +Deserialization 495 588 79 0.4 2476.7 0.3X + +Compressed Serialized MapStatus sizes: 2 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws 
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 344 351 4 0.6 1720.4 1.0X +Deserialization 527 579 99 0.4 2635.9 0.7X + +Compressed Serialized MapStatus sizes: 427 bytes +Compressed Serialized Broadcast MapStatus sizes: 13 MB + + +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 317 321 4 0.6 1583.8 1.0X +Deserialization 530 540 15 0.4 2648.3 0.6X + +Compressed Serialized MapStatus sizes: 13 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 1738 1849 156 0.1 8692.0 1.0X +Deserialization 946 977 33 0.2 4730.2 1.8X + +Compressed Serialized MapStatus sizes: 556 bytes +Compressed Serialized Broadcast MapStatus sizes: 121 MB + + +OpenJDK 64-Bit Server VM 11.0.4+11-post-Ubuntu-1ubuntu218.04.3 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 1379 
1432 76 0.1 6892.6 1.0X +Deserialization 929 941 19 0.2 4645.5 1.5X + +Compressed Serialized MapStatus sizes: 121 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt new file mode 100644 index 0000000000000..053f4bf771923 --- /dev/null +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt @@ -0,0 +1,66 @@ +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 178 187 15 1.1 887.5 1.0X +Deserialization 530 558 32 0.4 2647.5 0.3X + +Compressed Serialized MapStatus sizes: 411 bytes +Compressed Serialized Broadcast MapStatus sizes: 2 MB + + +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 167 175 7 1.2 835.7 1.0X +Deserialization 523 537 22 0.4 2616.2 0.3X + +Compressed Serialized MapStatus sizes: 2 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 351 416 147 0.6 1754.4 1.0X 
+Deserialization 546 551 8 0.4 2727.6 0.6X + +Compressed Serialized MapStatus sizes: 427 bytes +Compressed Serialized Broadcast MapStatus sizes: 13 MB + + +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 320 321 1 0.6 1598.0 1.0X +Deserialization 542 549 7 0.4 2709.0 0.6X + +Compressed Serialized MapStatus sizes: 13 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 1671 1877 290 0.1 8357.3 1.0X +Deserialization 943 970 32 0.2 4715.8 1.8X + +Compressed Serialized MapStatus sizes: 556 bytes +Compressed Serialized Broadcast MapStatus sizes: 121 MB + + +OpenJDK 64-Bit Server VM 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Serialization 1373 1436 89 0.1 6865.0 1.0X +Deserialization 940 970 37 0.2 4699.1 1.5X + +Compressed Serialized MapStatus sizes: 121 MB +Compressed Serialized Broadcast MapStatus sizes: 0 bytes + + diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt 
new file mode 100644 index 0000000000000..605b856d53382 --- /dev/null +++ b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt @@ -0,0 +1,40 @@ +================================================================================================ +Properties Cloning +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 0 0 0 0.1 11539.0 1.0X +Utils.cloneProperties 0 0 0 1.7 572.0 20.2X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 0 0 0 0.0 217514.0 1.0X +Utils.cloneProperties 0 0 0 0.2 5387.0 40.4X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 1 1 0 0.0 634574.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3082.0 205.9X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 
3 3 0 0.0 2576565.0 1.0X +Utils.cloneProperties 0 0 0 0.1 16071.0 160.3X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 5 5 0 0.0 5027248.0 1.0X +Utils.cloneProperties 0 0 0 0.0 31842.0 157.9X + + diff --git a/core/benchmarks/PropertiesCloneBenchmark-results.txt b/core/benchmarks/PropertiesCloneBenchmark-results.txt new file mode 100644 index 0000000000000..5d332a147c698 --- /dev/null +++ b/core/benchmarks/PropertiesCloneBenchmark-results.txt @@ -0,0 +1,40 @@ +================================================================================================ +Properties Cloning +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 0 0 0 0.1 13640.0 1.0X +Utils.cloneProperties 0 0 0 1.6 608.0 22.4X + +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 0 0 0 0.0 238968.0 1.0X +Utils.cloneProperties 0 0 0 0.4 2318.0 103.1X + +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Small Properties: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 1 1 0 0.0 725849.0 1.0X +Utils.cloneProperties 0 0 0 0.3 2900.0 250.3X + +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 3 3 0 0.0 2999676.0 1.0X +Utils.cloneProperties 0 0 0 0.1 11734.0 255.6X + +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +SerializationUtils.clone 6 6 1 0.0 5846410.0 1.0X +Utils.cloneProperties 0 0 0 0.0 22405.0 260.9X + + diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..9aa10e4835a2f --- /dev/null +++ b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt @@ -0,0 +1,44 @@ +================================================================================================ +Pseudo random +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 1362 1362 0 73.4 13.6 1.0X 
+XORShiftRandom 227 227 0 440.6 2.3 6.0X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 2725 2726 1 36.7 27.3 1.0X +XORShiftRandom 694 694 1 144.1 6.9 3.9X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 2727 2728 0 36.7 27.3 1.0X +XORShiftRandom 693 694 0 144.2 6.9 3.9X + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 7012 7016 4 14.3 70.1 1.0X +XORShiftRandom 6065 6067 1 16.5 60.7 1.2X + + +================================================================================================ +hash seed +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.4+11-LTS on Linux 3.10.0-862.3.2.el7.x86_64 +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +XORShiftRandom.hashSeed 36 37 1 276.5 3.6 1.0X + + diff --git a/core/benchmarks/XORShiftRandomBenchmark-results.txt b/core/benchmarks/XORShiftRandomBenchmark-results.txt index 
1140489e4a7f3..4b069878b2e9b 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -nextInt: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -java.util.Random 1362 / 1362 73.4 13.6 1.0X -XORShiftRandom 227 / 227 440.6 2.3 6.0X +nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 1362 1396 59 73.4 13.6 1.0X +XORShiftRandom 227 227 0 440.7 2.3 6.0X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -nextLong: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -java.util.Random 2732 / 2732 36.6 27.3 1.0X -XORShiftRandom 629 / 629 159.0 6.3 4.3X +nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 2732 2732 1 36.6 27.3 1.0X +XORShiftRandom 630 630 1 158.7 6.3 4.3X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -nextDouble: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------- -java.util.Random 2730 / 2730 36.6 27.3 1.0X -XORShiftRandom 629 / 629 159.0 6.3 4.3X +nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 2731 2732 1 36.6 27.3 1.0X +XORShiftRandom 630 630 0 158.8 6.3 4.3X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -nextGaussian: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -java.util.Random 10288 / 10288 9.7 102.9 1.0X -XORShiftRandom 6351 / 6351 15.7 63.5 1.6X +nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +java.util.Random 8895 8899 4 11.2 88.9 1.0X +XORShiftRandom 5049 5052 5 19.8 50.5 1.8X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_222-b10 on Linux 3.10.0-862.3.2.el7.x86_64 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Hash seed: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -XORShiftRandom.hashSeed 1193 / 1195 8.4 119.3 1.0X +Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +XORShiftRandom.hashSeed 67 68 1 148.8 6.7 1.0X diff --git a/core/pom.xml b/core/pom.xml index 42fc2c4b3a287..9d54d21b95ba3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -26,12 +26,14 @@ spark-core_2.12 - - core - jar Spark Project Core http://spark.apache.org/ + + + core + + com.thoughtworks.paranamer @@ -163,7 +165,6 @@ javax.servlet-api ${javaxservlet.version} - org.apache.commons commons-lang3 @@ -292,6 +293,16 @@ io.dropwizard.metrics metrics-graphite + + + com.rabbitmq + amqp-client + + + + + io.dropwizard.metrics + metrics-jmx com.fasterxml.jackson.core @@ -384,6 +395,11 @@ curator-test test + + org.apache.hadoop + hadoop-minikdc + test + net.razorvine pyrolite @@ -501,6 +517,24 @@ + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-source + + + + src/main/scala-${scala.binary.version} + + + + + @@ -551,6 +585,15 @@ + + scala-2.13 + + + org.scala-lang.modules + scala-parallel-collections_${scala.binary.version} + + + diff --git a/core/src/main/java/org/apache/spark/ExecutorPlugin.java b/core/src/main/java/org/apache/spark/ExecutorPlugin.java deleted file mode 100644 index f86520c81df33..0000000000000 --- a/core/src/main/java/org/apache/spark/ExecutorPlugin.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark; - -import org.apache.spark.annotation.DeveloperApi; - -/** - * A plugin which can be automatically instantiated within each Spark executor. Users can specify - * plugins which should be created with the "spark.executor.plugins" configuration. An instance - * of each plugin will be created for every executor, including those created by dynamic allocation, - * before the executor starts running any tasks. - * - * The specific api exposed to the end users still considered to be very unstable. We will - * hopefully be able to keep compatibility by providing default implementations for any methods - * added, but make no guarantees this will always be possible across all Spark releases. - * - * Spark does nothing to verify the plugin is doing legitimate things, or to manage the resources - * it uses. A plugin acquires the same privileges as the user running the task. A bad plugin - * could also interfere with task execution and make the executor fail in unexpected ways. - */ -@DeveloperApi -public interface ExecutorPlugin { - - /** - * Initialize the executor plugin. - * - *

Each executor will, during its initialization, invoke this method on each - * plugin provided in the spark.executor.plugins configuration.

- * - *

Plugins should create threads in their implementation of this method for - * any polling, blocking, or intensive computation.

- */ - default void init() {} - - /** - * Clean up and terminate this plugin. - * - *

This function is called during the executor shutdown phase. The executor - * will wait for the plugin to terminate before continuing its own shutdown.

- */ - default void shutdown() {} -} diff --git a/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java new file mode 100644 index 0000000000000..0c0d0df8ae682 --- /dev/null +++ b/core/src/main/java/org/apache/spark/api/plugin/DriverPlugin.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.plugin; + +import java.util.Collections; +import java.util.Map; + +import org.apache.spark.SparkContext; +import org.apache.spark.annotation.DeveloperApi; + +/** + * :: DeveloperApi :: + * Driver component of a {@link SparkPlugin}. + * + * @since 3.0.0 + */ +@DeveloperApi +public interface DriverPlugin { + + /** + * Initialize the plugin. + *

+ * This method is called early in the initialization of the Spark driver. Explicitly, it is + * called before the Spark driver's task scheduler is initialized. This means that a lot + * of other Spark subsystems may yet not have been initialized. This call also blocks driver + * initialization. + *

+ * It's recommended that plugins be careful about what operations are performed in this call, + * preferrably performing expensive operations in a separate thread, or postponing them until + * the application has fully started. + * + * @param sc The SparkContext loading the plugin. + * @param pluginContext Additional plugin-specific about the Spark application where the plugin + * is running. + * @return A map that will be provided to the {@link ExecutorPlugin#init(PluginContext,Map)} + * method. + */ + default Map init(SparkContext sc, PluginContext pluginContext) { + return Collections.emptyMap(); + } + + /** + * Register metrics published by the plugin with Spark's metrics system. + *

+ * This method is called later in the initialization of the Spark application, after most + * subsystems are up and the application ID is known. If there are metrics registered in + * the registry ({@link PluginContext#metricRegistry()}), then a metrics source with the + * plugin name will be created. + *

+ * Note that even though the metric registry is still accessible after this method is called, + * registering new metrics after this method is called may result in the metrics not being + * available. + * + * @param appId The application ID from the cluster manager. + * @param pluginContext Additional plugin-specific about the Spark application where the plugin + * is running. + */ + default void registerMetrics(String appId, PluginContext pluginContext) {} + + /** + * RPC message handler. + *

+ * Plugins can use Spark's RPC system to send messages from executors to the driver (but not + * the other way around, currently). Messages sent by the executor component of the plugin will + * be delivered to this method, and the returned value will be sent back to the executor as + * the reply, if the executor has requested one. + *

+ * Any exception thrown will be sent back to the executor as an error, in case it is expecting + * a reply. In case a reply is not expected, a log message will be written to the driver log. + *

+ * The implementation of this handler should be thread-safe. + *

+ * Note all plugins share RPC dispatch threads, and this method is called synchronously. So + * performing expensive operations in this handler may affect the operation of other active + * plugins. Internal Spark endpoints are not directly affected, though, since they use different + * threads. + *

+ * Spark guarantees that the driver component will be ready to receive messages through this + * handler when executors are started. + * + * @param message The incoming message. + * @return Value to be returned to the caller. Ignored if the caller does not expect a reply. + */ + default Object receive(Object message) throws Exception { + throw new UnsupportedOperationException(); + } + + /** + * Informs the plugin that the Spark application is shutting down. + *

+ * This method is called during the driver shutdown phase. It is recommended that plugins + * not use any Spark functions (e.g. send RPC messages) during this call. + */ + default void shutdown() {} + +} diff --git a/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java new file mode 100644 index 0000000000000..4961308035163 --- /dev/null +++ b/core/src/main/java/org/apache/spark/api/plugin/ExecutorPlugin.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.plugin; + +import java.util.Map; + +import org.apache.spark.annotation.DeveloperApi; + +/** + * :: DeveloperApi :: + * Executor component of a {@link SparkPlugin}. + * + * @since 3.0.0 + */ +@DeveloperApi +public interface ExecutorPlugin { + + /** + * Initialize the executor plugin. + *

+ * When a Spark plugin provides an executor plugin, this method will be called during the + * initialization of the executor process. It will block executor initialization until it + * returns. + *

+ * Executor plugins that publish metrics should register all metrics with the context's + * registry ({@link PluginContext#metricRegistry()}) when this method is called. Metrics + * registered afterwards are not guaranteed to show up. + * + * @param ctx Context information for the executor where the plugin is running. + * @param extraConf Extra configuration provided by the driver component during its + * initialization. + */ + default void init(PluginContext ctx, Map extraConf) {} + + /** + * Clean up and terminate this plugin. + *

+ * This method is called during the executor shutdown phase, and blocks executor shutdown. + */ + default void shutdown() {} + +} diff --git a/core/src/main/java/org/apache/spark/api/plugin/PluginContext.java b/core/src/main/java/org/apache/spark/api/plugin/PluginContext.java new file mode 100644 index 0000000000000..36d827598dfc5 --- /dev/null +++ b/core/src/main/java/org/apache/spark/api/plugin/PluginContext.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.plugin; + +import java.io.IOException; +import java.util.Map; + +import com.codahale.metrics.MetricRegistry; + +import org.apache.spark.SparkConf; +import org.apache.spark.annotation.DeveloperApi; +import org.apache.spark.resource.ResourceInformation; + +/** + * :: DeveloperApi :: + * Context information and operations for plugins loaded by Spark. + *

+ * An instance of this class is provided to plugins in their initialization method. It is safe + * for plugins to keep a reference to the instance for later use (for example, to send messages + * to the plugin's driver component). + *

+ * Context instances are plugin-specific, so metrics and messages are tied to each plugin. It is + * not possible for a plugin to directly interact with other plugins. + * + * @since 3.0.0 + */ +@DeveloperApi +public interface PluginContext { + + /** + * Registry where to register metrics published by the plugin associated with this context. + */ + MetricRegistry metricRegistry(); + + /** Configuration of the Spark application. */ + SparkConf conf(); + + /** Executor ID of the process. On the driver, this will identify the driver. */ + String executorID(); + + /** The host name which is being used by the Spark process for communication. */ + String hostname(); + + /** The custom resources (GPUs, FPGAs, etc) allocated to driver or executor. */ + Map resources(); + + /** + * Send a message to the plugin's driver-side component. + *

+ * This method sends a message to the driver-side component of the plugin, without expecting + * a reply. It returns as soon as the message is enqueued for sending. + *

+ * The message must be serializable. + * + * @param message Message to be sent. + */ + void send(Object message) throws IOException; + + /** + * Send an RPC to the plugin's driver-side component. + *

+ * This method sends a message to the driver-side component of the plugin, and blocks until a + * reply arrives, or the configured RPC ask timeout (spark.rpc.askTimeout) elapses. + *

+ * If the driver replies with an error, an exception with the corresponding error will be thrown. + *

+ * The message must be serializable. + * + * @param message Message to be sent. + * @return The reply from the driver-side component. + */ + Object ask(Object message) throws Exception; + +} diff --git a/core/src/main/java/org/apache/spark/api/plugin/SparkPlugin.java b/core/src/main/java/org/apache/spark/api/plugin/SparkPlugin.java new file mode 100644 index 0000000000000..21ddae37d8a0d --- /dev/null +++ b/core/src/main/java/org/apache/spark/api/plugin/SparkPlugin.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.api.plugin; + +import org.apache.spark.annotation.DeveloperApi; + +/** + * :: DeveloperApi :: + * A plugin that can be dynamically loaded into a Spark application. + *

+ * Plugins can be loaded by adding the plugin's class name to the appropriate Spark configuration. + * Check the Spark monitoring guide for details. + *

+ * Plugins have two optional components: a driver-side component, of which a single instance is + * created per application, inside the Spark driver. And an executor-side component, of which one + * instance is created in each executor that is started by Spark. Details of each component can be + * found in the documentation for {@link DriverPlugin} and {@link ExecutorPlugin}. + * + * @since 3.0.0 + */ +@DeveloperApi +public interface SparkPlugin { + + /** + * Return the plugin's driver-side component. + * + * @return The driver-side component, or null if one is not needed. + */ + DriverPlugin driverPlugin(); + + /** + * Return the plugin's executor-side component. + * + * @return The executor-side component, or null if one is not needed. + */ + ExecutorPlugin executorPlugin(); + +} diff --git a/core/src/main/java/org/apache/spark/api/resource/ResourceDiscoveryPlugin.java b/core/src/main/java/org/apache/spark/api/resource/ResourceDiscoveryPlugin.java new file mode 100644 index 0000000000000..ffd2f83552a63 --- /dev/null +++ b/core/src/main/java/org/apache/spark/api/resource/ResourceDiscoveryPlugin.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.api.resource; + +import java.util.Optional; + +import org.apache.spark.annotation.DeveloperApi; +import org.apache.spark.SparkConf; +import org.apache.spark.resource.ResourceInformation; +import org.apache.spark.resource.ResourceRequest; + +/** + * :: DeveloperApi :: + * A plugin that can be dynamically loaded into a Spark application to control how custom + * resources are discovered. Plugins can be chained to allow different plugins to handle + * different resource types. + *

+ * Plugins must implement the function discoverResource. + * + * @since 3.0.0 + */ +@DeveloperApi +public interface ResourceDiscoveryPlugin { + /** + * Discover the addresses of the requested resource. + *

+ * This method is called early in the initialization of the Spark Executor/Driver/Worker. + * This function is responsible for discovering the addresses of the resource which Spark will + * then use for scheduling and eventually providing to the user. + * Depending on the deployment mode and configuration of custom resources, this could be + * called by the Spark Driver, the Spark Executors, in standalone mode the Workers, or all of + * them. The ResourceRequest has a ResourceID component that can be used to distinguish which + * component it is called from and what resource it is being called for. + * This will get called once for each resource type requested and it is the responsibility of + * this function to return enough addresses of that resource based on the request. If + * the addresses do not meet the requested amount, Spark will fail. + * If this plugin doesn't handle a particular resource, it should return an empty Optional + * and Spark will try other plugins and then last fall back to the default discovery script + * plugin. + * + * @param request The ResourceRequest to be discovered. + * @param sparkConf SparkConf + * @return An {@link Optional} containing a {@link ResourceInformation} object containing + * the resource name and the addresses of the resource. If it returns {@link Optional#EMPTY} + * other plugins will be called. + */ + Optional discoverResource(ResourceRequest request, SparkConf sparkConf); +} diff --git a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java index 92bf0ecc1b5cb..7ca5ade7b9a74 100644 --- a/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java +++ b/core/src/main/java/org/apache/spark/io/NioBufferedFileInputStream.java @@ -51,7 +51,6 @@ public NioBufferedFileInputStream(File file) throws IOException { /** * Checks weather data is left to be read from the input stream. 
* @return true if data is left, false otherwise - * @throws IOException */ private boolean refill() throws IOException { if (!byteBuffer.hasRemaining()) { @@ -60,10 +59,10 @@ private boolean refill() throws IOException { while (nRead == 0) { nRead = fileChannel.read(byteBuffer); } + byteBuffer.flip(); if (nRead < 0) { return false; } - byteBuffer.flip(); } return true; } diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index 4bfd2d358f36f..9a9d0c7946549 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -54,7 +54,7 @@ public MemoryMode getMode() { /** * Returns the size of used memory in bytes. */ - protected long getUsed() { + public long getUsed() { return used; } @@ -78,7 +78,6 @@ public void spill() throws IOException { * @param size the amount of memory should be released * @param trigger the MemoryConsumer that trigger this spilling * @return the amount of released memory in bytes - * @throws IOException */ public abstract long spill(long size, MemoryConsumer trigger) throws IOException; diff --git a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java index e9e50ecc11e52..e4554bda8acab 100644 --- a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java +++ b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java @@ -46,4 +46,10 @@ public interface ShuffleDataIO { * are only invoked on the executors. */ ShuffleExecutorComponents executor(); + + /** + * Called once on driver process to bootstrap the shuffle metadata modules that + * are maintained by the driver. 
+ */ + ShuffleDriverComponents driver(); } diff --git a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDriverComponents.java b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDriverComponents.java new file mode 100644 index 0000000000000..b4cec17b85b32 --- /dev/null +++ b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleDriverComponents.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.shuffle.api; + +import java.util.Map; + +import org.apache.spark.annotation.Private; + +/** + * :: Private :: + * An interface for building shuffle support modules for the Driver. + */ +@Private +public interface ShuffleDriverComponents { + + /** + * Called once in the driver to bootstrap this module that is specific to this application. + * This method is called before submitting executor requests to the cluster manager. + * + * This method should prepare the module with its shuffle components i.e. registering against + * an external file servers or shuffle services, or creating tables in a shuffle + * storage data database. + * + * @return additional SparkConf settings necessary for initializing the executor components. 
+ * This would include configurations that cannot be statically set on the application, like + * the host:port of external services for shuffle storage. + */ + Map initializeApplication(); + + /** + * Called once at the end of the Spark application to clean up any existing shuffle state. + */ + void cleanupApplication(); + + /** + * Called once per shuffle id when the shuffle id is first generated for a shuffle stage. + * + * @param shuffleId The unique identifier for the shuffle stage. + */ + default void registerShuffle(int shuffleId) {} + + /** + * Removes shuffle data associated with the given shuffle. + * + * @param shuffleId The unique identifier for the shuffle stage. + * @param blocking Whether this call should block on the deletion of the data. + */ + default void removeShuffle(int shuffleId, boolean blocking) {} +} diff --git a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java index 70c112b78911d..30ca177545789 100644 --- a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java +++ b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java @@ -18,6 +18,8 @@ package org.apache.spark.shuffle.api; import java.io.IOException; +import java.util.Map; +import java.util.Optional; import org.apache.spark.annotation.Private; @@ -33,23 +35,42 @@ public interface ShuffleExecutorComponents { /** * Called once per executor to bootstrap this module with state that is specific to * that executor, specifically the application ID and executor ID. 
+ * + * @param appId The Spark application id + * @param execId The unique identifier of the executor being initialized + * @param extraConfigs Extra configs that were returned by + * {@link ShuffleDriverComponents#initializeApplication()} */ - void initializeExecutor(String appId, String execId); + void initializeExecutor(String appId, String execId, Map extraConfigs); /** * Called once per map task to create a writer that will be responsible for persisting all the * partitioned bytes written by that map task. - * @param shuffleId Unique identifier for the shuffle the map task is a part of - * @param mapId Within the shuffle, the identifier of the map task - * @param mapTaskAttemptId Identifier of the task attempt. Multiple attempts of the same map task - * with the same (shuffleId, mapId) pair can be distinguished by the - * different values of mapTaskAttemptId. + * + * @param shuffleId Unique identifier for the shuffle the map task is a part of + * @param mapTaskId An ID of the map task. The ID is unique within this Spark application. * @param numPartitions The number of partitions that will be written by the map task. Some of -* these partitions may be empty. + * these partitions may be empty. */ ShuffleMapOutputWriter createMapOutputWriter( int shuffleId, - int mapId, - long mapTaskAttemptId, + long mapTaskId, int numPartitions) throws IOException; + + /** + * An optional extension for creating a map output writer that can optimize the transfer of a + * single partition file, as the entire result of a map task, to the backing store. + *

+ * Most implementations should return the default {@link Optional#empty()} to indicate that + * they do not support this optimization. This primarily is for backwards-compatibility in + * preserving an optimization in the local disk shuffle storage implementation. + * + * @param shuffleId Unique identifier for the shuffle the map task is a part of + * @param mapId An ID of the map task. The ID is unique within this Spark application. + */ + default Optional createSingleFileMapOutputWriter( + int shuffleId, + long mapId) throws IOException { + return Optional.empty(); + } } diff --git a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleMapOutputWriter.java index 7fac00b7fbc3f..21abe9a57cd25 100644 --- a/core/src/main/java/org/apache/spark/shuffle/api/ShuffleMapOutputWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/api/ShuffleMapOutputWriter.java @@ -39,7 +39,7 @@ public interface ShuffleMapOutputWriter { * for the same partition within any given map task. The partition identifier will be in the * range of precisely 0 (inclusive) to numPartitions (exclusive), where numPartitions was * provided upon the creation of this map output writer via - * {@link ShuffleExecutorComponents#createMapOutputWriter(int, int, long, int)}. + * {@link ShuffleExecutorComponents#createMapOutputWriter(int, long, int)}. *

* Calls to this method will be invoked with monotonically increasing reducePartitionIds; each * call to this method will be called with a reducePartitionId that is strictly greater than diff --git a/core/src/main/java/org/apache/spark/shuffle/api/SingleSpillShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/api/SingleSpillShuffleMapOutputWriter.java new file mode 100644 index 0000000000000..cad8dcfda52bc --- /dev/null +++ b/core/src/main/java/org/apache/spark/shuffle/api/SingleSpillShuffleMapOutputWriter.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.shuffle.api; + +import java.io.File; +import java.io.IOException; + +import org.apache.spark.annotation.Private; + +/** + * Optional extension for partition writing that is optimized for transferring a single + * file to the backing store. + */ +@Private +public interface SingleSpillShuffleMapOutputWriter { + + /** + * Transfer a file that contains the bytes of all the partitions written by this map task. 
+ */ + void transferMapSpillFile(File mapOutputFile, long[] partitionLengths) throws IOException; +} diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index f75e932860f90..dc157eaa3b253 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -85,8 +85,7 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter { private final Partitioner partitioner; private final ShuffleWriteMetricsReporter writeMetrics; private final int shuffleId; - private final int mapId; - private final long mapTaskAttemptId; + private final long mapId; private final Serializer serializer; private final ShuffleExecutorComponents shuffleExecutorComponents; @@ -106,8 +105,7 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter { BypassMergeSortShuffleWriter( BlockManager blockManager, BypassMergeSortShuffleHandle handle, - int mapId, - long mapTaskAttemptId, + long mapId, SparkConf conf, ShuffleWriteMetricsReporter writeMetrics, ShuffleExecutorComponents shuffleExecutorComponents) { @@ -117,7 +115,6 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter { this.blockManager = blockManager; final ShuffleDependency dep = handle.dependency(); this.mapId = mapId; - this.mapTaskAttemptId = mapTaskAttemptId; this.shuffleId = dep.shuffleId(); this.partitioner = dep.partitioner(); this.numPartitions = partitioner.numPartitions(); @@ -130,11 +127,12 @@ final class BypassMergeSortShuffleWriter extends ShuffleWriter { public void write(Iterator> records) throws IOException { assert (partitionWriters == null); ShuffleMapOutputWriter mapOutputWriter = shuffleExecutorComponents - .createMapOutputWriter(shuffleId, mapId, mapTaskAttemptId, numPartitions); + .createMapOutputWriter(shuffleId, mapId, numPartitions); try { if 
(!records.hasNext()) { partitionLengths = mapOutputWriter.commitAllPartitions(); - mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); + mapStatus = MapStatus$.MODULE$.apply( + blockManager.shuffleServerId(), partitionLengths, mapId); return; } final SerializerInstance serInstance = serializer.newInstance(); @@ -167,7 +165,8 @@ public void write(Iterator> records) throws IOException { } partitionLengths = writePartitionedData(mapOutputWriter); - mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); + mapStatus = MapStatus$.MODULE$.apply( + blockManager.shuffleServerId(), partitionLengths, mapId); } catch (Exception e) { try { mapOutputWriter.abort(e); diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index 024756087bf7f..833744f4777ce 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -423,7 +423,6 @@ public void insertRecord(Object recordBase, long recordOffset, int length, int p * * @return metadata for the spill files written by this sorter. If no records were ever inserted * into this sorter, then this will return an empty array. 
- * @throws IOException */ public SpillInfo[] closeAndGetSpills() throws IOException { if (inMemSorter != null) { diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index 9d05f03613ce9..d09282e61a9c7 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -17,9 +17,12 @@ package org.apache.spark.shuffle.sort; +import java.nio.channels.Channels; +import java.util.Optional; import javax.annotation.Nullable; import java.io.*; import java.nio.channels.FileChannel; +import java.nio.channels.WritableByteChannel; import java.util.Iterator; import scala.Option; @@ -31,7 +34,6 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.io.ByteStreams; import com.google.common.io.Closeables; -import com.google.common.io.Files; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,8 +43,6 @@ import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; import org.apache.spark.io.NioBufferedFileInputStream; -import org.apache.commons.io.output.CloseShieldOutputStream; -import org.apache.commons.io.output.CountingOutputStream; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.network.util.LimitedInputStream; import org.apache.spark.scheduler.MapStatus; @@ -50,8 +50,12 @@ import org.apache.spark.shuffle.ShuffleWriteMetricsReporter; import org.apache.spark.serializer.SerializationStream; import org.apache.spark.serializer.SerializerInstance; -import org.apache.spark.shuffle.IndexShuffleBlockResolver; import org.apache.spark.shuffle.ShuffleWriter; +import org.apache.spark.shuffle.api.ShuffleExecutorComponents; +import org.apache.spark.shuffle.api.ShuffleMapOutputWriter; +import org.apache.spark.shuffle.api.ShufflePartitionWriter; +import 
org.apache.spark.shuffle.api.SingleSpillShuffleMapOutputWriter; +import org.apache.spark.shuffle.api.WritableByteChannelWrapper; import org.apache.spark.storage.BlockManager; import org.apache.spark.storage.TimeTrackingOutputStream; import org.apache.spark.unsafe.Platform; @@ -65,23 +69,21 @@ public class UnsafeShuffleWriter extends ShuffleWriter { private static final ClassTag OBJECT_CLASS_TAG = ClassTag$.MODULE$.Object(); @VisibleForTesting - static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096; static final int DEFAULT_INITIAL_SER_BUFFER_SIZE = 1024 * 1024; private final BlockManager blockManager; - private final IndexShuffleBlockResolver shuffleBlockResolver; private final TaskMemoryManager memoryManager; private final SerializerInstance serializer; private final Partitioner partitioner; private final ShuffleWriteMetricsReporter writeMetrics; + private final ShuffleExecutorComponents shuffleExecutorComponents; private final int shuffleId; - private final int mapId; + private final long mapId; private final TaskContext taskContext; private final SparkConf sparkConf; private final boolean transferToEnabled; private final int initialSortBufferSize; private final int inputBufferSizeInBytes; - private final int outputBufferSizeInBytes; @Nullable private MapStatus mapStatus; @Nullable private ShuffleExternalSorter sorter; @@ -103,27 +105,15 @@ private static final class MyByteArrayOutputStream extends ByteArrayOutputStream */ private boolean stopping = false; - private class CloseAndFlushShieldOutputStream extends CloseShieldOutputStream { - - CloseAndFlushShieldOutputStream(OutputStream outputStream) { - super(outputStream); - } - - @Override - public void flush() { - // do nothing - } - } - public UnsafeShuffleWriter( BlockManager blockManager, - IndexShuffleBlockResolver shuffleBlockResolver, TaskMemoryManager memoryManager, SerializedShuffleHandle handle, - int mapId, + long mapId, TaskContext taskContext, SparkConf sparkConf, - ShuffleWriteMetricsReporter 
writeMetrics) throws IOException { + ShuffleWriteMetricsReporter writeMetrics, + ShuffleExecutorComponents shuffleExecutorComponents) { final int numPartitions = handle.dependency().partitioner().numPartitions(); if (numPartitions > SortShuffleManager.MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE()) { throw new IllegalArgumentException( @@ -132,7 +122,6 @@ public UnsafeShuffleWriter( " reduce partitions"); } this.blockManager = blockManager; - this.shuffleBlockResolver = shuffleBlockResolver; this.memoryManager = memoryManager; this.mapId = mapId; final ShuffleDependency dep = handle.dependency(); @@ -140,6 +129,7 @@ public UnsafeShuffleWriter( this.serializer = dep.serializer().newInstance(); this.partitioner = dep.partitioner(); this.writeMetrics = writeMetrics; + this.shuffleExecutorComponents = shuffleExecutorComponents; this.taskContext = taskContext; this.sparkConf = sparkConf; this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true); @@ -147,8 +137,6 @@ public UnsafeShuffleWriter( (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_SORT_INIT_BUFFER_SIZE()); this.inputBufferSizeInBytes = (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_FILE_BUFFER_SIZE()) * 1024; - this.outputBufferSizeInBytes = - (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_UNSAFE_FILE_OUTPUT_BUFFER_SIZE()) * 1024; open(); } @@ -231,25 +219,17 @@ void closeAndWriteOutput() throws IOException { final SpillInfo[] spills = sorter.closeAndGetSpills(); sorter = null; final long[] partitionLengths; - final File output = shuffleBlockResolver.getDataFile(shuffleId, mapId); - final File tmp = Utils.tempFileWith(output); try { - try { - partitionLengths = mergeSpills(spills, tmp); - } finally { - for (SpillInfo spill : spills) { - if (spill.file.exists() && ! 
spill.file.delete()) { - logger.error("Error while deleting spill file {}", spill.file.getPath()); - } - } - } - shuffleBlockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, tmp); + partitionLengths = mergeSpills(spills); } finally { - if (tmp.exists() && !tmp.delete()) { - logger.error("Error while deleting temp file {}", tmp.getAbsolutePath()); + for (SpillInfo spill : spills) { + if (spill.file.exists() && !spill.file.delete()) { + logger.error("Error while deleting spill file {}", spill.file.getPath()); + } } } - mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths); + mapStatus = MapStatus$.MODULE$.apply( + blockManager.shuffleServerId(), partitionLengths, mapId); } @VisibleForTesting @@ -281,137 +261,153 @@ void forceSorterToSpill() throws IOException { * * @return the partition lengths in the merged file. */ - private long[] mergeSpills(SpillInfo[] spills, File outputFile) throws IOException { + private long[] mergeSpills(SpillInfo[] spills) throws IOException { + long[] partitionLengths; + if (spills.length == 0) { + final ShuffleMapOutputWriter mapWriter = shuffleExecutorComponents + .createMapOutputWriter(shuffleId, mapId, partitioner.numPartitions()); + return mapWriter.commitAllPartitions(); + } else if (spills.length == 1) { + Optional maybeSingleFileWriter = + shuffleExecutorComponents.createSingleFileMapOutputWriter(shuffleId, mapId); + if (maybeSingleFileWriter.isPresent()) { + // Here, we don't need to perform any metrics updates because the bytes written to this + // output file would have already been counted as shuffle bytes written. 
+ partitionLengths = spills[0].partitionLengths; + maybeSingleFileWriter.get().transferMapSpillFile(spills[0].file, partitionLengths); + } else { + partitionLengths = mergeSpillsUsingStandardWriter(spills); + } + } else { + partitionLengths = mergeSpillsUsingStandardWriter(spills); + } + return partitionLengths; + } + + private long[] mergeSpillsUsingStandardWriter(SpillInfo[] spills) throws IOException { + long[] partitionLengths; final boolean compressionEnabled = (boolean) sparkConf.get(package$.MODULE$.SHUFFLE_COMPRESS()); final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf); final boolean fastMergeEnabled = - (boolean) sparkConf.get(package$.MODULE$.SHUFFLE_UNDAFE_FAST_MERGE_ENABLE()); + (boolean) sparkConf.get(package$.MODULE$.SHUFFLE_UNSAFE_FAST_MERGE_ENABLE()); final boolean fastMergeIsSupported = !compressionEnabled || - CompressionCodec$.MODULE$.supportsConcatenationOfSerializedStreams(compressionCodec); + CompressionCodec$.MODULE$.supportsConcatenationOfSerializedStreams(compressionCodec); final boolean encryptionEnabled = blockManager.serializerManager().encryptionEnabled(); + final ShuffleMapOutputWriter mapWriter = shuffleExecutorComponents + .createMapOutputWriter(shuffleId, mapId, partitioner.numPartitions()); try { - if (spills.length == 0) { - new FileOutputStream(outputFile).close(); // Create an empty file - return new long[partitioner.numPartitions()]; - } else if (spills.length == 1) { - // Here, we don't need to perform any metrics updates because the bytes written to this - // output file would have already been counted as shuffle bytes written. - Files.move(spills[0].file, outputFile); - return spills[0].partitionLengths; - } else { - final long[] partitionLengths; - // There are multiple spills to merge, so none of these spill files' lengths were counted - // towards our shuffle write count or shuffle write time. 
If we use the slow merge path, - // then the final output file's size won't necessarily be equal to the sum of the spill - // files' sizes. To guard against this case, we look at the output file's actual size when - // computing shuffle bytes written. - // - // We allow the individual merge methods to report their own IO times since different merge - // strategies use different IO techniques. We count IO during merge towards the shuffle - // shuffle write time, which appears to be consistent with the "not bypassing merge-sort" - // branch in ExternalSorter. - if (fastMergeEnabled && fastMergeIsSupported) { - // Compression is disabled or we are using an IO compression codec that supports - // decompression of concatenated compressed streams, so we can perform a fast spill merge - // that doesn't need to interpret the spilled bytes. - if (transferToEnabled && !encryptionEnabled) { - logger.debug("Using transferTo-based fast merge"); - partitionLengths = mergeSpillsWithTransferTo(spills, outputFile); - } else { - logger.debug("Using fileStream-based fast merge"); - partitionLengths = mergeSpillsWithFileStream(spills, outputFile, null); - } + // There are multiple spills to merge, so none of these spill files' lengths were counted + // towards our shuffle write count or shuffle write time. If we use the slow merge path, + // then the final output file's size won't necessarily be equal to the sum of the spill + // files' sizes. To guard against this case, we look at the output file's actual size when + // computing shuffle bytes written. + // + // We allow the individual merge methods to report their own IO times since different merge + // strategies use different IO techniques. We count IO during merge towards the shuffle + // write time, which appears to be consistent with the "not bypassing merge-sort" branch in + // ExternalSorter. 
+ if (fastMergeEnabled && fastMergeIsSupported) { + // Compression is disabled or we are using an IO compression codec that supports + // decompression of concatenated compressed streams, so we can perform a fast spill merge + // that doesn't need to interpret the spilled bytes. + if (transferToEnabled && !encryptionEnabled) { + logger.debug("Using transferTo-based fast merge"); + mergeSpillsWithTransferTo(spills, mapWriter); } else { - logger.debug("Using slow merge"); - partitionLengths = mergeSpillsWithFileStream(spills, outputFile, compressionCodec); + logger.debug("Using fileStream-based fast merge"); + mergeSpillsWithFileStream(spills, mapWriter, null); } - // When closing an UnsafeShuffleExternalSorter that has already spilled once but also has - // in-memory records, we write out the in-memory records to a file but do not count that - // final write as bytes spilled (instead, it's accounted as shuffle write). The merge needs - // to be counted as shuffle write, but this will lead to double-counting of the final - // SpillInfo's bytes. - writeMetrics.decBytesWritten(spills[spills.length - 1].file.length()); - writeMetrics.incBytesWritten(outputFile.length()); - return partitionLengths; + } else { + logger.debug("Using slow merge"); + mergeSpillsWithFileStream(spills, mapWriter, compressionCodec); } - } catch (IOException e) { - if (outputFile.exists() && !outputFile.delete()) { - logger.error("Unable to delete output file {}", outputFile.getPath()); + // When closing an UnsafeShuffleExternalSorter that has already spilled once but also has + // in-memory records, we write out the in-memory records to a file but do not count that + // final write as bytes spilled (instead, it's accounted as shuffle write). The merge needs + // to be counted as shuffle write, but this will lead to double-counting of the final + // SpillInfo's bytes. 
+ writeMetrics.decBytesWritten(spills[spills.length - 1].file.length()); + partitionLengths = mapWriter.commitAllPartitions(); + } catch (Exception e) { + try { + mapWriter.abort(e); + } catch (Exception e2) { + logger.warn("Failed to abort writing the map output.", e2); + e.addSuppressed(e2); } throw e; } + return partitionLengths; } /** * Merges spill files using Java FileStreams. This code path is typically slower than * the NIO-based merge, {@link UnsafeShuffleWriter#mergeSpillsWithTransferTo(SpillInfo[], - * File)}, and it's mostly used in cases where the IO compression codec does not support - * concatenation of compressed data, when encryption is enabled, or when users have - * explicitly disabled use of {@code transferTo} in order to work around kernel bugs. + * ShuffleMapOutputWriter)}, and it's mostly used in cases where the IO compression codec + * does not support concatenation of compressed data, when encryption is enabled, or when + * users have explicitly disabled use of {@code transferTo} in order to work around kernel bugs. * This code path might also be faster in cases where individual partition size in a spill * is small and UnsafeShuffleWriter#mergeSpillsWithTransferTo method performs many small * disk ios which is inefficient. In those case, Using large buffers for input and output * files helps reducing the number of disk ios, making the file merging faster. * * @param spills the spills to merge. - * @param outputFile the file to write the merged data to. + * @param mapWriter the map output writer to use for output. * @param compressionCodec the IO compression codec, or null if shuffle compression is disabled. * @return the partition lengths in the merged file. 
*/ - private long[] mergeSpillsWithFileStream( + private void mergeSpillsWithFileStream( SpillInfo[] spills, - File outputFile, + ShuffleMapOutputWriter mapWriter, @Nullable CompressionCodec compressionCodec) throws IOException { - assert (spills.length >= 2); final int numPartitions = partitioner.numPartitions(); - final long[] partitionLengths = new long[numPartitions]; final InputStream[] spillInputStreams = new InputStream[spills.length]; - final OutputStream bos = new BufferedOutputStream( - new FileOutputStream(outputFile), - outputBufferSizeInBytes); - // Use a counting output stream to avoid having to close the underlying file and ask - // the file system for its size after each partition is written. - final CountingOutputStream mergedFileOutputStream = new CountingOutputStream(bos); - boolean threwException = true; try { for (int i = 0; i < spills.length; i++) { spillInputStreams[i] = new NioBufferedFileInputStream( - spills[i].file, - inputBufferSizeInBytes); + spills[i].file, + inputBufferSizeInBytes); } for (int partition = 0; partition < numPartitions; partition++) { - final long initialFileLength = mergedFileOutputStream.getByteCount(); - // Shield the underlying output stream from close() and flush() calls, so that we can close - // the higher level streams to make sure all data is really flushed and internal state is - // cleaned. 
- OutputStream partitionOutput = new CloseAndFlushShieldOutputStream( - new TimeTrackingOutputStream(writeMetrics, mergedFileOutputStream)); - partitionOutput = blockManager.serializerManager().wrapForEncryption(partitionOutput); - if (compressionCodec != null) { - partitionOutput = compressionCodec.compressedOutputStream(partitionOutput); - } - for (int i = 0; i < spills.length; i++) { - final long partitionLengthInSpill = spills[i].partitionLengths[partition]; - if (partitionLengthInSpill > 0) { - InputStream partitionInputStream = new LimitedInputStream(spillInputStreams[i], - partitionLengthInSpill, false); - try { - partitionInputStream = blockManager.serializerManager().wrapForEncryption( - partitionInputStream); - if (compressionCodec != null) { - partitionInputStream = compressionCodec.compressedInputStream(partitionInputStream); + boolean copyThrewException = true; + ShufflePartitionWriter writer = mapWriter.getPartitionWriter(partition); + OutputStream partitionOutput = writer.openStream(); + try { + partitionOutput = new TimeTrackingOutputStream(writeMetrics, partitionOutput); + partitionOutput = blockManager.serializerManager().wrapForEncryption(partitionOutput); + if (compressionCodec != null) { + partitionOutput = compressionCodec.compressedOutputStream(partitionOutput); + } + for (int i = 0; i < spills.length; i++) { + final long partitionLengthInSpill = spills[i].partitionLengths[partition]; + + if (partitionLengthInSpill > 0) { + InputStream partitionInputStream = null; + boolean copySpillThrewException = true; + try { + partitionInputStream = new LimitedInputStream(spillInputStreams[i], + partitionLengthInSpill, false); + partitionInputStream = blockManager.serializerManager().wrapForEncryption( + partitionInputStream); + if (compressionCodec != null) { + partitionInputStream = compressionCodec.compressedInputStream( + partitionInputStream); + } + ByteStreams.copy(partitionInputStream, partitionOutput); + copySpillThrewException = false; + } 
finally { + Closeables.close(partitionInputStream, copySpillThrewException); } - ByteStreams.copy(partitionInputStream, partitionOutput); - } finally { - partitionInputStream.close(); } } + copyThrewException = false; + } finally { + Closeables.close(partitionOutput, copyThrewException); } - partitionOutput.flush(); - partitionOutput.close(); - partitionLengths[partition] = (mergedFileOutputStream.getByteCount() - initialFileLength); + long numBytesWritten = writer.getNumBytesWritten(); + writeMetrics.incBytesWritten(numBytesWritten); } threwException = false; } finally { @@ -420,9 +416,7 @@ private long[] mergeSpillsWithFileStream( for (InputStream stream : spillInputStreams) { Closeables.close(stream, threwException); } - Closeables.close(mergedFileOutputStream, threwException); } - return partitionLengths; } /** @@ -430,54 +424,46 @@ private long[] mergeSpillsWithFileStream( * This is only safe when the IO compression codec and serializer support concatenation of * serialized streams. * + * @param spills the spills to merge. + * @param mapWriter the map output writer to use for output. * @return the partition lengths in the merged file. 
*/ - private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) throws IOException { - assert (spills.length >= 2); + private void mergeSpillsWithTransferTo( + SpillInfo[] spills, + ShuffleMapOutputWriter mapWriter) throws IOException { final int numPartitions = partitioner.numPartitions(); - final long[] partitionLengths = new long[numPartitions]; final FileChannel[] spillInputChannels = new FileChannel[spills.length]; final long[] spillInputChannelPositions = new long[spills.length]; - FileChannel mergedFileOutputChannel = null; boolean threwException = true; try { for (int i = 0; i < spills.length; i++) { spillInputChannels[i] = new FileInputStream(spills[i].file).getChannel(); } - // This file needs to opened in append mode in order to work around a Linux kernel bug that - // affects transferTo; see SPARK-3948 for more details. - mergedFileOutputChannel = new FileOutputStream(outputFile, true).getChannel(); - - long bytesWrittenToMergedFile = 0; for (int partition = 0; partition < numPartitions; partition++) { - for (int i = 0; i < spills.length; i++) { - final long partitionLengthInSpill = spills[i].partitionLengths[partition]; - final FileChannel spillInputChannel = spillInputChannels[i]; - final long writeStartTime = System.nanoTime(); - Utils.copyFileStreamNIO( - spillInputChannel, - mergedFileOutputChannel, - spillInputChannelPositions[i], - partitionLengthInSpill); - spillInputChannelPositions[i] += partitionLengthInSpill; - writeMetrics.incWriteTime(System.nanoTime() - writeStartTime); - bytesWrittenToMergedFile += partitionLengthInSpill; - partitionLengths[partition] += partitionLengthInSpill; + boolean copyThrewException = true; + ShufflePartitionWriter writer = mapWriter.getPartitionWriter(partition); + WritableByteChannelWrapper resolvedChannel = writer.openChannelWrapper() + .orElseGet(() -> new StreamFallbackChannelWrapper(openStreamUnchecked(writer))); + try { + for (int i = 0; i < spills.length; i++) { + long 
partitionLengthInSpill = spills[i].partitionLengths[partition]; + final FileChannel spillInputChannel = spillInputChannels[i]; + final long writeStartTime = System.nanoTime(); + Utils.copyFileStreamNIO( + spillInputChannel, + resolvedChannel.channel(), + spillInputChannelPositions[i], + partitionLengthInSpill); + copyThrewException = false; + spillInputChannelPositions[i] += partitionLengthInSpill; + writeMetrics.incWriteTime(System.nanoTime() - writeStartTime); + } + } finally { + Closeables.close(resolvedChannel, copyThrewException); } - } - // Check the position after transferTo loop to see if it is in the right position and raise an - // exception if it is incorrect. The position will not be increased to the expected length - // after calling transferTo in kernel version 2.6.32. This issue is described at - // https://bugs.openjdk.java.net/browse/JDK-7052359 and SPARK-3948. - if (mergedFileOutputChannel.position() != bytesWrittenToMergedFile) { - throw new IOException( - "Current position " + mergedFileOutputChannel.position() + " does not equal expected " + - "position " + bytesWrittenToMergedFile + " after transferTo. Please check your kernel" + - " version to see if it is 2.6.32, as there is a kernel bug which will lead to " + - "unexpected behavior when using transferTo. You can set spark.file.transferTo=false " + - "to disable this NIO feature." 
- ); + long numBytes = writer.getNumBytesWritten(); + writeMetrics.incBytesWritten(numBytes); } threwException = false; } finally { @@ -487,9 +473,7 @@ private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) th assert(spillInputChannelPositions[i] == spills[i].file.length()); Closeables.close(spillInputChannels[i], threwException); } - Closeables.close(mergedFileOutputChannel, threwException); } - return partitionLengths; } @Override @@ -518,4 +502,30 @@ public Option stop(boolean success) { } } } + + private static OutputStream openStreamUnchecked(ShufflePartitionWriter writer) { + try { + return writer.openStream(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static final class StreamFallbackChannelWrapper implements WritableByteChannelWrapper { + private final WritableByteChannel channel; + + StreamFallbackChannelWrapper(OutputStream fallbackStream) { + this.channel = Channels.newChannel(fallbackStream); + } + + @Override + public WritableByteChannel channel() { + return channel; + } + + @Override + public void close() throws IOException { + channel.close(); + } + } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDataIO.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDataIO.java index cabcb171ac23a..50eb2f1813714 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDataIO.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDataIO.java @@ -18,8 +18,9 @@ package org.apache.spark.shuffle.sort.io; import org.apache.spark.SparkConf; -import org.apache.spark.shuffle.api.ShuffleExecutorComponents; import org.apache.spark.shuffle.api.ShuffleDataIO; +import org.apache.spark.shuffle.api.ShuffleDriverComponents; +import org.apache.spark.shuffle.api.ShuffleExecutorComponents; /** * Implementation of the {@link ShuffleDataIO} plugin system that replicates the local shuffle @@ -37,4 +38,9 @@ public 
LocalDiskShuffleDataIO(SparkConf sparkConf) { public ShuffleExecutorComponents executor() { return new LocalDiskShuffleExecutorComponents(sparkConf); } + + @Override + public ShuffleDriverComponents driver() { + return new LocalDiskShuffleDriverComponents(); + } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDriverComponents.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDriverComponents.java new file mode 100644 index 0000000000000..92b4b318c552d --- /dev/null +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleDriverComponents.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.shuffle.sort.io; + +import java.util.Collections; +import java.util.Map; + +import org.apache.spark.SparkEnv; +import org.apache.spark.shuffle.api.ShuffleDriverComponents; +import org.apache.spark.storage.BlockManagerMaster; + +public class LocalDiskShuffleDriverComponents implements ShuffleDriverComponents { + + private BlockManagerMaster blockManagerMaster; + + @Override + public Map initializeApplication() { + blockManagerMaster = SparkEnv.get().blockManager().master(); + return Collections.emptyMap(); + } + + @Override + public void cleanupApplication() { + // nothing to clean up + } + + @Override + public void removeShuffle(int shuffleId, boolean blocking) { + if (blockManagerMaster == null) { + throw new IllegalStateException("Driver components must be initialized before using"); + } + blockManagerMaster.removeShuffle(shuffleId, blocking); + } +} diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java index 02eb710737285..eb4d9d9abc8e3 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleExecutorComponents.java @@ -17,6 +17,9 @@ package org.apache.spark.shuffle.sort.io; +import java.util.Map; +import java.util.Optional; + import com.google.common.annotations.VisibleForTesting; import org.apache.spark.SparkConf; @@ -24,6 +27,7 @@ import org.apache.spark.shuffle.api.ShuffleExecutorComponents; import org.apache.spark.shuffle.api.ShuffleMapOutputWriter; import org.apache.spark.shuffle.IndexShuffleBlockResolver; +import org.apache.spark.shuffle.api.SingleSpillShuffleMapOutputWriter; import org.apache.spark.storage.BlockManager; public class LocalDiskShuffleExecutorComponents implements ShuffleExecutorComponents { @@ -47,7 +51,7 @@ public 
LocalDiskShuffleExecutorComponents( } @Override - public void initializeExecutor(String appId, String execId) { + public void initializeExecutor(String appId, String execId, Map extraConfigs) { blockManager = SparkEnv.get().blockManager(); if (blockManager == null) { throw new IllegalStateException("No blockManager available from the SparkEnv."); @@ -58,14 +62,24 @@ public void initializeExecutor(String appId, String execId) { @Override public ShuffleMapOutputWriter createMapOutputWriter( int shuffleId, - int mapId, - long mapTaskAttemptId, + long mapTaskId, int numPartitions) { if (blockResolver == null) { throw new IllegalStateException( "Executor components must be initialized before getting writers."); } return new LocalDiskShuffleMapOutputWriter( - shuffleId, mapId, numPartitions, blockResolver, sparkConf); + shuffleId, mapTaskId, numPartitions, blockResolver, sparkConf); + } + + @Override + public Optional createSingleFileMapOutputWriter( + int shuffleId, + long mapId) { + if (blockResolver == null) { + throw new IllegalStateException( + "Executor components must be initialized before getting writers."); + } + return Optional.of(new LocalDiskSingleSpillMapOutputWriter(shuffleId, mapId, blockResolver)); } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java index 7fc19b1270a46..a6529fd76188a 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriter.java @@ -24,8 +24,8 @@ import java.io.OutputStream; import java.nio.channels.FileChannel; import java.nio.channels.WritableByteChannel; - import java.util.Optional; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -48,12 +48,13 @@ public class LocalDiskShuffleMapOutputWriter implements ShuffleMapOutputWriter { 
LoggerFactory.getLogger(LocalDiskShuffleMapOutputWriter.class); private final int shuffleId; - private final int mapId; + private final long mapId; private final IndexShuffleBlockResolver blockResolver; private final long[] partitionLengths; private final int bufferSize; private int lastPartitionId = -1; private long currChannelPosition; + private long bytesWrittenToMergedFile = 0L; private final File outputFile; private File outputTempFile; @@ -63,7 +64,7 @@ public class LocalDiskShuffleMapOutputWriter implements ShuffleMapOutputWriter { public LocalDiskShuffleMapOutputWriter( int shuffleId, - int mapId, + long mapId, int numPartitions, IndexShuffleBlockResolver blockResolver, SparkConf sparkConf) { @@ -97,6 +98,18 @@ public ShufflePartitionWriter getPartitionWriter(int reducePartitionId) throws I @Override public long[] commitAllPartitions() throws IOException { + // Check the position after transferTo loop to see if it is in the right position and raise a + // exception if it is incorrect. The position will not be increased to the expected length + // after calling transferTo in kernel version 2.6.32. This issue is described at + // https://bugs.openjdk.java.net/browse/JDK-7052359 and SPARK-3948. + if (outputFileChannel != null && outputFileChannel.position() != bytesWrittenToMergedFile) { + throw new IOException( + "Current position " + outputFileChannel.position() + " does not equal expected " + + "position " + bytesWrittenToMergedFile + " after transferTo. Please check your " + + " kernel version to see if it is 2.6.32, as there is a kernel bug which will lead " + + "to unexpected behavior when using transferTo. You can set " + + "spark.file.transferTo=false to disable this NIO feature."); + } cleanUp(); File resolvedTmp = outputTempFile != null && outputTempFile.isFile() ? 
outputTempFile : null; blockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, resolvedTmp); @@ -133,11 +146,10 @@ private void initStream() throws IOException { } private void initChannel() throws IOException { - if (outputFileStream == null) { - outputFileStream = new FileOutputStream(outputTempFile, true); - } + // This file needs to opened in append mode in order to work around a Linux kernel bug that + // affects transferTo; see SPARK-3948 for more details. if (outputFileChannel == null) { - outputFileChannel = outputFileStream.getChannel(); + outputFileChannel = new FileOutputStream(outputTempFile, true).getChannel(); } } @@ -227,6 +239,7 @@ public void write(byte[] buf, int pos, int length) throws IOException { public void close() { isClosed = true; partitionLengths[partitionId] = count; + bytesWrittenToMergedFile += count; } private void verifyNotClosed() { @@ -257,6 +270,7 @@ public WritableByteChannel channel() { @Override public void close() throws IOException { partitionLengths[partitionId] = getCount(); + bytesWrittenToMergedFile += partitionLengths[partitionId]; } } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskSingleSpillMapOutputWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskSingleSpillMapOutputWriter.java new file mode 100644 index 0000000000000..c8b41992a8919 --- /dev/null +++ b/core/src/main/java/org/apache/spark/shuffle/sort/io/LocalDiskSingleSpillMapOutputWriter.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.shuffle.sort.io; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; + +import org.apache.spark.shuffle.IndexShuffleBlockResolver; +import org.apache.spark.shuffle.api.SingleSpillShuffleMapOutputWriter; +import org.apache.spark.util.Utils; + +public class LocalDiskSingleSpillMapOutputWriter + implements SingleSpillShuffleMapOutputWriter { + + private final int shuffleId; + private final long mapId; + private final IndexShuffleBlockResolver blockResolver; + + public LocalDiskSingleSpillMapOutputWriter( + int shuffleId, + long mapId, + IndexShuffleBlockResolver blockResolver) { + this.shuffleId = shuffleId; + this.mapId = mapId; + this.blockResolver = blockResolver; + } + + @Override + public void transferMapSpillFile( + File mapSpillFile, + long[] partitionLengths) throws IOException { + // The map spill file already has the proper format, and it contains all of the partition data. + // So just transfer it directly to the destination without any merging. 
+ File outputFile = blockResolver.getDataFile(shuffleId, mapId); + File tempFile = Utils.tempFileWith(outputFile); + Files.move(mapSpillFile.toPath(), tempFile.toPath()); + blockResolver.writeIndexFileAndCommit(shuffleId, mapId, partitionLengths, tempFile); + } +} diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index d320ba3139541..7bdd89488d119 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -694,7 +694,10 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff assert (vlen % 8 == 0); assert (longArray != null); - if (numKeys == MAX_CAPACITY + // We should not increase number of keys to be MAX_CAPACITY. The usage pattern of this map is + // lookup + append. If we append key until the number of keys to be MAX_CAPACITY, next time + // the call of lookup will hang forever because it cannot find an empty slot. + if (numKeys == MAX_CAPACITY - 1 // The map could be reused from last spill (because of no enough memory to grow), // then we don't try to grow again if hit the `growthThreshold`. || !canGrowArray && numKeys >= growthThreshold) { @@ -741,7 +744,9 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff longArray.set(pos * 2 + 1, keyHashcode); isDefined = true; - if (numKeys >= growthThreshold && longArray.size() < MAX_CAPACITY) { + // We use two array entries per key, so the array size is twice the capacity. + // We should compare the current capacity of the array, instead of its size. 
+ if (numKeys >= growthThreshold && longArray.size() / 2 < MAX_CAPACITY) { try { growAndRehash(); } catch (SparkOutOfMemoryError oom) { @@ -886,6 +891,7 @@ public void reset() { numKeys = 0; numValues = 0; freeArray(longArray); + longArray = null; while (dataPages.size() > 0) { MemoryBlock dataPage = dataPages.removeLast(); freePage(dataPage); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 1b206c11d9a8e..55e4e609c3c7b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -447,8 +447,6 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen, /** * Merges another UnsafeExternalSorters into this one, the other one will be emptied. - * - * @throws IOException */ public void merge(UnsafeExternalSorter other) throws IOException { other.spill(); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 1a9453a8b3e80..e14964d68119b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -205,6 +205,10 @@ public long getSortTimeNanos() { } public long getMemoryUsage() { + if (array == null) { + return 0L; + } + return array.size() * 8; } diff --git a/core/src/main/resources/META-INF/services/org.apache.spark.deploy.history.EventFilterBuilder b/core/src/main/resources/META-INF/services/org.apache.spark.deploy.history.EventFilterBuilder new file mode 100644 index 0000000000000..784e58270ab42 --- /dev/null +++ 
b/core/src/main/resources/META-INF/services/org.apache.spark.deploy.history.EventFilterBuilder @@ -0,0 +1 @@ +org.apache.spark.deploy.history.BasicEventFilterBuilder \ No newline at end of file diff --git a/core/src/main/resources/org/apache/spark/log4j-defaults.properties b/core/src/main/resources/org/apache/spark/log4j-defaults.properties index eb12848900b58..71652d0e9f5e8 100644 --- a/core/src/main/resources/org/apache/spark/log4j-defaults.properties +++ b/core/src/main/resources/org/apache/spark/log4j-defaults.properties @@ -33,7 +33,8 @@ log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs +# in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html index 31ef04552b880..e91449013e371 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage-template.html @@ -37,8 +37,8 @@

Summary

Disk Used Cores - Active Tasks - Failed Tasks + Active Tasks + Failed Tasks Complete Tasks Total Tasks Summary title="Total shuffle bytes and records read (includes both data read locally and data read from remote executors)."> Shuffle Read - Shuffle Write - Blacklisted @@ -71,13 +71,10 @@

Executors

- - - - + + + + - - - - - - - + + + + + + + - - + + diff --git a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js index 11d7c77d0c667..090bc72dc9274 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/executorspage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/executorspage.js @@ -462,7 +462,8 @@ $(document).ready(function () { {"visible": false, "targets": 5}, {"visible": false, "targets": 6}, {"visible": false, "targets": 9} - ] + ], + "deferRender": true }; execDataTable = $(selector).DataTable(conf); diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index 5f291620e0e95..4df5f07f077d7 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -177,6 +177,7 @@ $(document).ready(function() { {name: 'eventLog'}, ], "autoWidth": false, + "deferRender": true }; if (hasMultipleAttempts) { diff --git a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js index 9960d5c34d1fc..ecd580e5c64aa 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/sorttable.js +++ b/core/src/main/resources/org/apache/spark/ui/static/sorttable.js @@ -97,9 +97,14 @@ sorttable = { sorttable.reverse(this.sorttable_tbody); this.className = this.className.replace('sorttable_sorted', 'sorttable_sorted_reverse'); - this.removeChild(document.getElementById('sorttable_sortfwdind')); + rowlists = this.parentNode.getElementsByTagName("span"); + for (var j=0; j < rowlists.length; j++) { + if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/)) { + rowlists[j].parentNode.removeChild(rowlists[j]); + } + } sortrevind = document.createElement('span'); - sortrevind.id = "sorttable_sortrevind"; + sortrevind.class = 
"sorttable_sortrevind"; sortrevind.innerHTML = stIsIE ? ' 5' : ' ▾'; this.appendChild(sortrevind); return; @@ -110,9 +115,14 @@ sorttable = { sorttable.reverse(this.sorttable_tbody); this.className = this.className.replace('sorttable_sorted_reverse', 'sorttable_sorted'); - this.removeChild(document.getElementById('sorttable_sortrevind')); + rowlists = this.parentNode.getElementsByTagName("span"); + for (var j=0; j < rowlists.length; j++) { + if (rowlists[j].className.search(/\sorttable_sortrevind\b/)) { + rowlists[j].parentNode.removeChild(rowlists[j]); + } + } sortfwdind = document.createElement('span'); - sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.class = "sorttable_sortfwdind"; sortfwdind.innerHTML = stIsIE ? ' 6' : ' ▴'; this.appendChild(sortfwdind); return; @@ -126,14 +136,17 @@ sorttable = { cell.className = cell.className.replace('sorttable_sorted',''); } }); - sortfwdind = document.getElementById('sorttable_sortfwdind'); - if (sortfwdind) { sortfwdind.parentNode.removeChild(sortfwdind); } - sortrevind = document.getElementById('sorttable_sortrevind'); - if (sortrevind) { sortrevind.parentNode.removeChild(sortrevind); } + rowlists = this.parentNode.getElementsByTagName("span"); + for (var j=0; j < rowlists.length; j++) { + if (rowlists[j].className.search(/\bsorttable_sortfwdind\b/) + || rowlists[j].className.search(/\sorttable_sortrevind\b/) ) { + rowlists[j].parentNode.removeChild(rowlists[j]); + } + } this.className += ' sorttable_sorted'; sortfwdind = document.createElement('span'); - sortfwdind.id = "sorttable_sortfwdind"; + sortfwdind.class = "sorttable_sortfwdind"; sortfwdind.innerHTML = stIsIE ? 
' 6' : ' ▴'; this.appendChild(sortfwdind); diff --git a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js index 035d72f8956ff..25dec9d3788ba 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js +++ b/core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js @@ -513,7 +513,7 @@ function addTooltipsForRDDs(svgContainer) { if (tooltipText) { node.select("circle") .attr("data-toggle", "tooltip") - .attr("data-placement", "bottom") + .attr("data-placement", "top") .attr("data-html", "true") // to interpret line break, tooltipText is showing title .attr("title", tooltipText); } diff --git a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js index 3ef1a76fd7202..ee2b7b353d62e 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/stagepage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/stagepage.js @@ -286,7 +286,7 @@ $(document).ready(function () { " Show Additional Metrics" + "" + ""); $('#scheduler_delay').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Scheduler delay includes time to ship the task from the scheduler to the executor, and time to send " + "the task result from the executor to the scheduler. 
If scheduler delay is large, consider decreasing the size of tasks or decreasing the size of task results."); $('#task_deserialization_time').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Time spent deserializing the task closure on the executor, including the time to read the broadcasted task."); $('#shuffle_read_blocked_time').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Time that the task spent blocked waiting for shuffle data to be read from remote machines."); $('#shuffle_remote_reads').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Total shuffle bytes read from remote executors. This is a subset of the shuffle read bytes; the remaining shuffle data is read locally. "); $('#result_serialization_time').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Time spent serializing the task result on the executor before sending it back to the driver."); $('#getting_result_time').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Time that the driver spends fetching task results from workers. If this is large, consider decreasing the amount of data returned from each task."); $('#peak_execution_memory').attr("data-toggle", "tooltip") - .attr("data-placement", "right") + .attr("data-placement", "top") .attr("title", "Execution memory refers to the memory used by internal data structures created during " + "shuffles, aggregations and joins when Tungsten is enabled. 
The value of this accumulator " + "should be approximately the sum of the peak sizes across all such data structures created " + @@ -325,6 +325,25 @@ $(document).ready(function () { $('[data-toggle="tooltip"]').tooltip(); var tasksSummary = $("#parent-container"); getStandAloneAppId(function (appId) { + // rendering the UI page + $.get(createTemplateURI(appId, "stagespage"), function(template) { + tasksSummary.append(Mustache.render($(template).filter("#stages-summary-template").html())); + + $("#additionalMetrics").click(function(){ + $("#arrowtoggle1").toggleClass("arrow-open arrow-closed"); + $("#toggle-metrics").toggle(); + if (window.localStorage) { + window.localStorage.setItem("arrowtoggle1class", $("#arrowtoggle1").attr('class')); + } + }); + + $("#aggregatedMetrics").click(function(){ + $("#arrowtoggle2").toggleClass("arrow-open arrow-closed"); + $("#toggle-aggregatedMetrics").toggle(); + if (window.localStorage) { + window.localStorage.setItem("arrowtoggle2class", $("#arrowtoggle2").attr('class')); + } + }); var endPoint = stageEndPoint(appId); var stageAttemptId = getStageAttemptId(); @@ -473,27 +492,6 @@ $(document).ready(function () { var accumulatorTable = responseBody.accumulatorUpdates.filter(accumUpdate => !(accumUpdate.name).toString().includes("internal.")); - // rendering the UI page - var data = {"executors": response}; - $.get(createTemplateURI(appId, "stagespage"), function(template) { - tasksSummary.append(Mustache.render($(template).filter("#stages-summary-template").html(), data)); - - $("#additionalMetrics").click(function(){ - $("#arrowtoggle1").toggleClass("arrow-open arrow-closed"); - $("#toggle-metrics").toggle(); - if (window.localStorage) { - window.localStorage.setItem("arrowtoggle1class", $("#arrowtoggle1").attr('class')); - } - }); - - $("#aggregatedMetrics").click(function(){ - $("#arrowtoggle2").toggleClass("arrow-open arrow-closed"); - $("#toggle-aggregatedMetrics").toggle(); - if (window.localStorage) { - 
window.localStorage.setItem("arrowtoggle2class", $("#arrowtoggle2").attr('class')); - } - }); - var quantiles = "0,0.25,0.5,0.75,1.0"; $.getJSON(endPoint + "/" + stageAttemptId + "/taskSummary?quantiles=" + quantiles, function(taskMetricsResponse, status, jqXHR) { @@ -758,8 +756,11 @@ $(document).ready(function () { { data : function (row, type) { if (accumulatorTable.length > 0 && row.accumulatorUpdates.length > 0) { - var accIndex = row.accumulatorUpdates.length - 1; - return row.accumulatorUpdates[accIndex].name + ' : ' + row.accumulatorUpdates[accIndex].update; + var allAccums = ""; + row.accumulatorUpdates.forEach(function(accumulator) { + allAccums += accumulator.name + ': ' + accumulator.update + "
"; + }) + return allAccums; } else { return ""; } @@ -877,7 +878,8 @@ $(document).ready(function () { { "visible": false, "targets": 16 }, { "visible": false, "targets": 17 }, { "visible": false, "targets": 18 } - ] + ], + "deferRender": true }; taskTableSelector = $(taskTable).DataTable(taskConf); $('#active-tasks-table_filter input').unbind(); @@ -961,4 +963,4 @@ $(document).ready(function () { }); }); }); -}); +}); \ No newline at end of file diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css b/core/src/main/resources/org/apache/spark/ui/static/streaming-page.css similarity index 100% rename from streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.css rename to core/src/main/resources/org/apache/spark/ui/static/streaming-page.css diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js b/core/src/main/resources/org/apache/spark/ui/static/streaming-page.js similarity index 100% rename from streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js rename to core/src/main/resources/org/apache/spark/ui/static/streaming-page.js diff --git a/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js b/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js new file mode 100644 index 0000000000000..70250fdbd2d0c --- /dev/null +++ b/core/src/main/resources/org/apache/spark/ui/static/structured-streaming-page.js @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// pre-define some colors for legends. +var colorPool = ["#F8C471", "#F39C12", "#B9770E", "#73C6B6", "#16A085", "#117A65", "#B2BABB", "#7F8C8D", "#616A6B"]; + +function drawAreaStack(id, labels, values, minX, maxX, minY, maxY) { + d3.select(d3.select(id).node().parentNode) + .style("padding", "8px 0 8px 8px") + .style("border-right", "0px solid white"); + + // Setup svg using Bostock's margin convention + var margin = {top: 20, right: 40, bottom: 30, left: maxMarginLeftForTimeline}; + var width = 850 - margin.left - margin.right; + var height = 300 - margin.top - margin.bottom; + + var svg = d3.select(id) + .append("svg") + .attr("width", width + margin.left + margin.right) + .attr("height", height + margin.top + margin.bottom) + .append("g") + .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + + var data = values; + + var parse = d3.time.format("%H:%M:%S.%L").parse; + + // Transpose the data into layers + var dataset = d3.layout.stack()(labels.map(function(fruit) { + return data.map(function(d) { + return {_x: d.x, x: parse(d.x), y: +d[fruit]}; + }); + })); + + + // Set x, y and colors + var x = d3.scale.ordinal() + .domain(dataset[0].map(function(d) { return d.x; })) + .rangeRoundBands([10, width-10], 0.02); + + var y = d3.scale.linear() + .domain([0, d3.max(dataset, function(d) { return d3.max(d, function(d) { return d.y0 + d.y; }); })]) + .range([height, 0]); + + var colors = colorPool.slice(0, labels.length) + + // Define and draw axes + var yAxis = d3.svg.axis() + .scale(y) + .orient("left") + .ticks(7) + 
.tickFormat( function(d) { return d } ); + + var xAxis = d3.svg.axis() + .scale(x) + .orient("bottom") + .tickFormat(d3.time.format("%H:%M:%S.%L")); + + // Only show the first and last time in the graph + var xline = [] + xline.push(x.domain()[0]) + xline.push(x.domain()[x.domain().length - 1]) + xAxis.tickValues(xline); + + svg.append("g") + .attr("class", "y axis") + .call(yAxis) + .append("text") + .attr("transform", "translate(0," + unitLabelYOffset + ")") + .text("ms"); + + svg.append("g") + .attr("class", "x axis") + .attr("transform", "translate(0," + height + ")") + .call(xAxis); + + // Create groups for each series, rects for each segment + var groups = svg.selectAll("g.cost") + .data(dataset) + .enter().append("g") + .attr("class", "cost") + .style("fill", function(d, i) { return colors[i]; }); + + var rect = groups.selectAll("rect") + .data(function(d) { return d; }) + .enter() + .append("rect") + .attr("x", function(d) { return x(d.x); }) + .attr("y", function(d) { return y(d.y0 + d.y); }) + .attr("height", function(d) { return y(d.y0) - y(d.y0 + d.y); }) + .attr("width", x.rangeBand()) + .on('mouseover', function(d) { + var tip = ''; + var idx = 0; + var _values = timeToValues[d._x] + _values.forEach(function (k) { + tip += labels[idx] + ': ' + k + ' '; + idx += 1; + }); + tip += " at " + d._x + showBootstrapTooltip(d3.select(this).node(), tip); + }) + .on('mouseout', function() { + hideBootstrapTooltip(d3.select(this).node()); + }) + .on("mousemove", function(d) { + var xPosition = d3.mouse(this)[0] - 15; + var yPosition = d3.mouse(this)[1] - 25; + tooltip.attr("transform", "translate(" + xPosition + "," + yPosition + ")"); + tooltip.select("text").text(d.y); + }); + + + // Draw legend + var legend = svg.selectAll(".legend") + .data(colors) + .enter().append("g") + .attr("class", "legend") + .attr("transform", function(d, i) { return "translate(30," + i * 19 + ")"; }); + + legend.append("rect") + .attr("x", width - 20) + .attr("width", 18) + 
.attr("height", 18) + .style("fill", function(d, i) {return colors.slice().reverse()[i];}) + .on('mouseover', function(d, i) { + var len = labels.length + showBootstrapTooltip(d3.select(this).node(), labels[len - 1 - i]); + }) + .on('mouseout', function() { + hideBootstrapTooltip(d3.select(this).node()); + }) + .on("mousemove", function(d) { + var xPosition = d3.mouse(this)[0] - 15; + var yPosition = d3.mouse(this)[1] - 25; + tooltip.attr("transform", "translate(" + xPosition + "," + yPosition + ")"); + tooltip.select("text").text(d.y); + }); + + // Prep the tooltip bits, initial display is hidden + var tooltip = svg.append("g") + .attr("class", "tooltip") + .style("display", "none"); + + tooltip.append("rect") + .attr("width", 30) + .attr("height", 20) + .attr("fill", "white") + .style("opacity", 0.5); + + tooltip.append("text") + .attr("x", 15) + .attr("dy", "1.2em") + .style("text-anchor", "middle") + .attr("font-size", "12px") + .attr("font-weight", "bold"); +} diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css index 10bceae2fbdda..3f31403eaeef3 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css +++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.css @@ -207,6 +207,12 @@ rect.getting-result-time-proportion { border-color: #3EC0FF; } +.vis-timeline .vis-item.executor.added.vis-selected { + background-color: #00AAFF; + border-color: #184C66; + z-index: 2; +} + .legend-area rect.executor-added-legend { fill: #A0DFFF; stroke: #3EC0FF; @@ -217,17 +223,17 @@ rect.getting-result-time-proportion { border-color: #FF4D6D; } +.vis-timeline .vis-item.executor.removed.vis-selected { + background-color: #FF6680; + border-color: #661F2C; + z-index: 2; +} + .legend-area rect.executor-removed-legend { fill: #FFA1B0; stroke: #FF4D6D; } -.vis-timeline .vis-item.executor.vis-selected { - background-color: #A2FCC0; - border-color: 
#36F572; - z-index: 2; -} - tr.corresponding-item-hover > td, tr.corresponding-item-hover > th { background-color: #D6FFE4 !important; } diff --git a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js index 705a08f0293d3..b2cd616791734 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js +++ b/core/src/main/resources/org/apache/spark/ui/static/timeline-view.js @@ -83,8 +83,9 @@ function drawApplicationTimeline(groupArray, eventObjArray, startTime, offset) { }); } -$(function (){ - if (window.localStorage.getItem("expand-application-timeline") == "true") { +$(function () { + if ($("span.expand-application-timeline").length && + window.localStorage.getItem("expand-application-timeline") == "true") { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-application-timeline", "false"); $("span.expand-application-timeline").trigger('click'); @@ -159,8 +160,9 @@ function drawJobTimeline(groupArray, eventObjArray, startTime, offset) { }); } -$(function (){ - if (window.localStorage.getItem("expand-job-timeline") == "true") { +$(function () { + if ($("span.expand-job-timeline").length && + window.localStorage.getItem("expand-job-timeline") == "true") { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-job-timeline", "false"); $("span.expand-job-timeline").trigger('click'); @@ -226,8 +228,9 @@ function drawTaskAssignmentTimeline(groupArray, eventObjArray, minLaunchTime, ma }); } -$(function (){ - if (window.localStorage.getItem("expand-task-assignment-timeline") == "true") { +$(function () { + if ($("span.expand-task-assignment-timeline").length && + window.localStorage.getItem("expand-task-assignment-timeline") == "true") { // Set it to false so that the click function can revert it window.localStorage.setItem("expand-task-assignment-timeline", "false"); 
$("span.expand-task-assignment-timeline").trigger('click'); diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.css b/core/src/main/resources/org/apache/spark/ui/static/webui.css old mode 100644 new mode 100755 index 3e28816ba61b6..801c449fd626f --- a/core/src/main/resources/org/apache/spark/ui/static/webui.css +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.css @@ -245,9 +245,9 @@ a.expandbutton { max-width: 600px; } -.paginate_button.active > a { - color: #999999; - text-decoration: underline; +.paginate_button.active { + border: 1px solid #979797 !important; + background: white linear-gradient(to bottom, #fff 0%, #dcdcdc 100%); } .title-table { @@ -263,32 +263,36 @@ a.expandbutton { width: 200px; } +.select-all-div-checkbox-div { + width: 90px; +} + .scheduler-delay-checkbox-div { - width: 120px; + width: 130px; } .task-deserialization-time-checkbox-div { - width: 175px; + width: 190px; } .shuffle-read-blocked-time-checkbox-div { - width: 187px; + width: 200px; } .shuffle-remote-reads-checkbox-div { - width: 157px; + width: 170px; } .result-serialization-time-checkbox-div { - width: 171px; + width: 185px; } .getting-result-time-checkbox-div { - width: 141px; + width: 155px; } .peak-execution-memory-checkbox-div { - width: 170px; + width: 180px; } #active-tasks-table th { diff --git a/core/src/main/resources/org/apache/spark/ui/static/webui.js b/core/src/main/resources/org/apache/spark/ui/static/webui.js index 89622106ff1f0..0ba461f02317f 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/webui.js +++ b/core/src/main/resources/org/apache/spark/ui/static/webui.js @@ -87,4 +87,16 @@ $(function() { collapseTablePageLoad('collapse-aggregated-runningExecutions','aggregated-runningExecutions'); collapseTablePageLoad('collapse-aggregated-completedExecutions','aggregated-completedExecutions'); collapseTablePageLoad('collapse-aggregated-failedExecutions','aggregated-failedExecutions'); -}); \ No newline at end of file + 
collapseTablePageLoad('collapse-aggregated-sessionstat','aggregated-sessionstat'); + collapseTablePageLoad('collapse-aggregated-sqlstat','aggregated-sqlstat'); + collapseTablePageLoad('collapse-aggregated-sqlsessionstat','aggregated-sqlsessionstat'); + collapseTablePageLoad('collapse-aggregated-activeQueries','aggregated-activeQueries'); + collapseTablePageLoad('collapse-aggregated-completedQueries','aggregated-completedQueries'); +}); + +$(function() { + // Show/hide full job description on click event. + $(".description-input").click(function() { + $(this).toggleClass("description-input-full"); + }); +}); diff --git a/core/src/main/scala/org/apache/spark/util/BoundedPriorityQueue.scala b/core/src/main/scala-2.12/org/apache/spark/util/BoundedPriorityQueue.scala similarity index 93% rename from core/src/main/scala/org/apache/spark/util/BoundedPriorityQueue.scala rename to core/src/main/scala-2.12/org/apache/spark/util/BoundedPriorityQueue.scala index eff0aa4453f08..a241023723444 100644 --- a/core/src/main/scala/org/apache/spark/util/BoundedPriorityQueue.scala +++ b/core/src/main/scala-2.12/org/apache/spark/util/BoundedPriorityQueue.scala @@ -31,6 +31,8 @@ import scala.collection.generic.Growable private[spark] class BoundedPriorityQueue[A](maxSize: Int)(implicit ord: Ordering[A]) extends Iterable[A] with Growable[A] with Serializable { + // Note: this class supports Scala 2.12. A parallel source tree has a 2.13 implementation. 
+ private val underlying = new JPriorityQueue[A](maxSize, ord) override def iterator: Iterator[A] = underlying.iterator.asScala @@ -59,7 +61,7 @@ private[spark] class BoundedPriorityQueue[A](maxSize: Int)(implicit ord: Orderin this += elem1 += elem2 ++= elems } - override def clear() { underlying.clear() } + override def clear(): Unit = { underlying.clear() } private def maybeReplaceLowest(a: A): Boolean = { val head = underlying.peek() diff --git a/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala b/core/src/main/scala-2.12/org/apache/spark/util/TimeStampedHashMap.scala similarity index 92% rename from core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala rename to core/src/main/scala-2.12/org/apache/spark/util/TimeStampedHashMap.scala index 32af0127bbf38..da12582a5083a 100644 --- a/core/src/main/scala/org/apache/spark/util/TimeStampedHashMap.scala +++ b/core/src/main/scala-2.12/org/apache/spark/util/TimeStampedHashMap.scala @@ -40,6 +40,8 @@ private[spark] case class TimeStampedValue[V](value: V, timestamp: Long) private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = false) extends mutable.Map[A, B]() with Logging { + // Note: this class supports Scala 2.12. A parallel source tree has a 2.13 implementation. 
+ private val internalMap = new ConcurrentHashMap[A, TimeStampedValue[B]]() def get(key: A): Option[B] = { @@ -81,7 +83,7 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa this } - override def update(key: A, value: B) { + override def update(key: A, value: B): Unit = { this += ((key, value)) } @@ -97,7 +99,7 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa override def size: Int = internalMap.size - override def foreach[U](f: ((A, B)) => U) { + override def foreach[U](f: ((A, B)) => U): Unit = { val it = getEntrySet.iterator while(it.hasNext) { val entry = it.next() @@ -111,13 +113,13 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa Option(prev).map(_.value) } - def putAll(map: Map[A, B]) { + def putAll(map: Map[A, B]): Unit = { map.foreach { case (k, v) => update(k, v) } } def toMap: Map[A, B] = iterator.toMap - def clearOldValues(threshTime: Long, f: (A, B) => Unit) { + def clearOldValues(threshTime: Long, f: (A, B) => Unit): Unit = { val it = getEntrySet.iterator while (it.hasNext) { val entry = it.next() @@ -130,7 +132,7 @@ private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = fa } /** Removes old key-value pairs that have timestamp earlier than `threshTime`. */ - def clearOldValues(threshTime: Long) { + def clearOldValues(threshTime: Long): Unit = { clearOldValues(threshTime, (_, _) => ()) } diff --git a/core/src/main/scala-2.13/org/apache/spark/util/BoundedPriorityQueue.scala b/core/src/main/scala-2.13/org/apache/spark/util/BoundedPriorityQueue.scala new file mode 100644 index 0000000000000..bc55a44fc3c2e --- /dev/null +++ b/core/src/main/scala-2.13/org/apache/spark/util/BoundedPriorityQueue.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.util + +import java.io.Serializable +import java.util.{PriorityQueue => JPriorityQueue} + +import scala.collection.JavaConverters._ +import scala.collection.mutable.Growable + +/** + * Bounded priority queue. This class wraps the original PriorityQueue + * class and modifies it such that only the top K elements are retained. + * The top K elements are defined by an implicit Ordering[A]. + */ +private[spark] class BoundedPriorityQueue[A](maxSize: Int)(implicit ord: Ordering[A]) + extends Iterable[A] with Growable[A] with Serializable { + + // Note: this class supports Scala 2.13. A parallel source tree has a 2.12 implementation. 
+ + private val underlying = new JPriorityQueue[A](maxSize, ord) + + override def iterator: Iterator[A] = underlying.iterator.asScala + + override def size: Int = underlying.size + + override def knownSize: Int = size + + override def addAll(xs: IterableOnce[A]): this.type = { + xs.foreach { this += _ } + this + } + + override def addOne(elem: A): this.type = { + if (size < maxSize) { + underlying.offer(elem) + } else { + maybeReplaceLowest(elem) + } + this + } + + def poll(): A = { + underlying.poll() + } + + override def clear(): Unit = { underlying.clear() } + + private def maybeReplaceLowest(a: A): Boolean = { + val head = underlying.peek() + if (head != null && ord.gt(a, head)) { + underlying.poll() + underlying.offer(a) + } else { + false + } + } +} diff --git a/core/src/main/scala-2.13/org/apache/spark/util/TimeStampedHashMap.scala b/core/src/main/scala-2.13/org/apache/spark/util/TimeStampedHashMap.scala new file mode 100644 index 0000000000000..9c860061b5862 --- /dev/null +++ b/core/src/main/scala-2.13/org/apache/spark/util/TimeStampedHashMap.scala @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.util + +import java.util.Map.Entry +import java.util.Set +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.internal.Logging + +private[spark] case class TimeStampedValue[V](value: V, timestamp: Long) + +/** + * This is a custom implementation of scala.collection.mutable.Map which stores the insertion + * timestamp along with each key-value pair. If specified, the timestamp of each pair can be + * updated every time it is accessed. Key-value pairs whose timestamp are older than a particular + * threshold time can then be removed using the clearOldValues method. This is intended to + * be a drop-in replacement of scala.collection.mutable.HashMap. + * + * @param updateTimeStampOnGet Whether timestamp of a pair will be updated when it is accessed + */ +private[spark] class TimeStampedHashMap[A, B](updateTimeStampOnGet: Boolean = false) + extends mutable.Map[A, B]() with Logging { + + // Note: this class supports Scala 2.13. A parallel source tree has a 2.12 implementation. 
+ + private val internalMap = new ConcurrentHashMap[A, TimeStampedValue[B]]() + + def get(key: A): Option[B] = { + val value = internalMap.get(key) + if (value != null && updateTimeStampOnGet) { + internalMap.replace(key, value, TimeStampedValue(value.value, currentTime)) + } + Option(value).map(_.value) + } + + def iterator: Iterator[(A, B)] = { + getEntrySet.iterator.asScala.map(kv => (kv.getKey, kv.getValue.value)) + } + + def getEntrySet: Set[Entry[A, TimeStampedValue[B]]] = internalMap.entrySet + + override def + [B1 >: B](kv: (A, B1)): mutable.Map[A, B1] = { + val newMap = new TimeStampedHashMap[A, B1] + val oldInternalMap = this.internalMap.asInstanceOf[ConcurrentHashMap[A, TimeStampedValue[B1]]] + newMap.internalMap.putAll(oldInternalMap) + kv match { case (a, b) => newMap.internalMap.put(a, TimeStampedValue(b, currentTime)) } + newMap + } + + override def addOne(kv: (A, B)): this.type = { + kv match { case (a, b) => internalMap.put(a, TimeStampedValue(b, currentTime)) } + this + } + + override def subtractOne(key: A): this.type = { + internalMap.remove(key) + this + } + + override def update(key: A, value: B): Unit = { + this += ((key, value)) + } + + override def apply(key: A): B = { + get(key).getOrElse { throw new NoSuchElementException() } + } + + override def filter(p: ((A, B)) => Boolean): mutable.Map[A, B] = { + internalMap.asScala.map { case (k, TimeStampedValue(v, t)) => (k, v) }.filter(p) + } + + override def empty: mutable.Map[A, B] = new TimeStampedHashMap[A, B]() + + override def size: Int = internalMap.size + + override def foreach[U](f: ((A, B)) => U): Unit = { + val it = getEntrySet.iterator + while(it.hasNext) { + val entry = it.next() + val kv = (entry.getKey, entry.getValue.value) + f(kv) + } + } + + def putIfAbsent(key: A, value: B): Option[B] = { + val prev = internalMap.putIfAbsent(key, TimeStampedValue(value, currentTime)) + Option(prev).map(_.value) + } + + def putAll(map: Map[A, B]): Unit = { + map.foreach { case (k, v) => 
update(k, v) } + } + + def toMap: Map[A, B] = iterator.toMap + + def clearOldValues(threshTime: Long, f: (A, B) => Unit): Unit = { + val it = getEntrySet.iterator + while (it.hasNext) { + val entry = it.next() + if (entry.getValue.timestamp < threshTime) { + f(entry.getKey, entry.getValue.value) + logDebug("Removing key " + entry.getKey) + it.remove() + } + } + } + + /** Removes old key-value pairs that have timestamp earlier than `threshTime`. */ + def clearOldValues(threshTime: Long): Unit = { + clearOldValues(threshTime, (_, _) => ()) + } + + private def currentTime: Long = System.currentTimeMillis + + // For testing + + def getTimeStampedValue(key: A): Option[TimeStampedValue[B]] = { + Option(internalMap.get(key)) + } + + def getTimestamp(key: A): Option[Long] = { + getTimeStampedValue(key).map(_.timestamp) + } +} diff --git a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala index 9f59295059d30..4e417679ca663 100644 --- a/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala +++ b/core/src/main/scala/org/apache/spark/BarrierCoordinator.scala @@ -107,9 +107,9 @@ private[spark] class BarrierCoordinator( private var timerTask: TimerTask = null // Init a TimerTask for a barrier() call. - private def initTimerTask(): Unit = { + private def initTimerTask(state: ContextBarrierState): Unit = { timerTask = new TimerTask { - override def run(): Unit = synchronized { + override def run(): Unit = state.synchronized { // Timeout current barrier() call, fail all the sync requests. requesters.foreach(_.sendFailure(new SparkException("The coordinator didn't get all " + s"barrier sync requests for barrier epoch $barrierEpoch from $barrierId within " + @@ -148,7 +148,7 @@ private[spark] class BarrierCoordinator( // If this is the first sync message received for a barrier() call, start timer to ensure // we may timeout for the sync. 
if (requesters.isEmpty) { - initTimerTask() + initTimerTask(this) timer.schedule(timerTask, timeoutInSecs * 1000) } // Add the requester to array of RPCCallContexts pending for reply. diff --git a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala index 5afd8a5d866b2..3d369802f3023 100644 --- a/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala +++ b/core/src/main/scala/org/apache/spark/BarrierTaskContext.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.util.{Properties, Timer, TimerTask} +import scala.collection.JavaConverters._ import scala.concurrent.TimeoutException import scala.concurrent.duration._ @@ -211,6 +212,10 @@ class BarrierTaskContext private[spark] ( override def resources(): Map[String, ResourceInformation] = taskContext.resources() + override def resourcesJMap(): java.util.Map[String, ResourceInformation] = { + resources().asJava + } + override private[spark] def killTaskIfInterrupted(): Unit = taskContext.killTaskIfInterrupted() override private[spark] def getKillReason(): Option[String] = taskContext.getKillReason() diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 24c83993b1b60..9506c36bf9c8c 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -27,6 +27,7 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData} +import org.apache.spark.shuffle.api.ShuffleDriverComponents import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, ThreadUtils, Utils} /** @@ -58,7 +59,9 @@ private class CleanupTaskWeakReference( * to be processed when the associated object goes out of scope of the application. 
Actual * cleanup is performed in a separate daemon thread. */ -private[spark] class ContextCleaner(sc: SparkContext) extends Logging { +private[spark] class ContextCleaner( + sc: SparkContext, + shuffleDriverComponents: ShuffleDriverComponents) extends Logging { /** * A buffer to ensure that `CleanupTaskWeakReference`s are not garbage collected as long as they @@ -71,7 +74,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { private val listeners = new ConcurrentLinkedQueue[CleanerListener]() - private val cleaningThread = new Thread() { override def run() { keepCleaning() }} + private val cleaningThread = new Thread() { override def run(): Unit = keepCleaning() } private val periodicGCService: ScheduledExecutorService = ThreadUtils.newDaemonSingleThreadScheduledExecutor("context-cleaner-periodic-gc") @@ -221,7 +224,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { try { logDebug("Cleaning shuffle " + shuffleId) mapOutputTrackerMaster.unregisterShuffle(shuffleId) - blockManagerMaster.removeShuffle(shuffleId, blocking) + shuffleDriverComponents.removeShuffle(shuffleId, blocking) listeners.asScala.foreach(_.shuffleCleaned(shuffleId)) logDebug("Cleaned shuffle " + shuffleId) } catch { @@ -269,7 +272,6 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { } } - private def blockManagerMaster = sc.env.blockManager.master private def broadcastManager = sc.env.broadcastManager private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] } diff --git a/core/src/main/scala/org/apache/spark/Dependency.scala b/core/src/main/scala/org/apache/spark/Dependency.scala index fb051a8c0db8e..ba8e4d69ba755 100644 --- a/core/src/main/scala/org/apache/spark/Dependency.scala +++ b/core/src/main/scala/org/apache/spark/Dependency.scala @@ -93,9 +93,10 @@ class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag]( val shuffleId: Int = _rdd.context.newShuffleId() val 
shuffleHandle: ShuffleHandle = _rdd.context.env.shuffleManager.registerShuffle( - shuffleId, _rdd.partitions.length, this) + shuffleId, this) _rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this)) + _rdd.sparkContext.shuffleDriverComponents.registerShuffle(shuffleId) } diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala index cb965cb180207..00bd0063c9e3a 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala @@ -37,24 +37,29 @@ private[spark] trait ExecutorAllocationClient { /** * Update the cluster manager on our scheduling needs. Three bits of information are included * to help it make decisions. - * @param numExecutors The total number of executors we'd like to have. The cluster manager - * shouldn't kill any running executor to reach this number, but, - * if all existing executors were to die, this is the number of executors - * we'd want to be allocated. - * @param localityAwareTasks The number of tasks in all active stages that have a locality - * preferences. This includes running, pending, and completed tasks. - * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages - * that would like to like to run on that host. - * This includes running, pending, and completed tasks. + * + * @param resourceProfileIdToNumExecutors The total number of executors we'd like to have per + * ResourceProfile id. The cluster manager shouldn't kill + * any running executor to reach this number, but, if all + * existing executors were to die, this is the number + * of executors we'd want to be allocated. + * @param numLocalityAwareTasksPerResourceProfileId The number of tasks in all active stages that + * have a locality preferences per + * ResourceProfile id. This includes running, + * pending, and completed tasks. 
+ * @param hostToLocalTaskCount A map of ResourceProfile id to a map of hosts to the number of + * tasks from all active stages that would like to like to run on + * that host. This includes running, pending, and completed tasks. * @return whether the request is acknowledged by the cluster manager. */ private[spark] def requestTotalExecutors( - numExecutors: Int, - localityAwareTasks: Int, - hostToLocalTaskCount: Map[String, Int]): Boolean + resourceProfileIdToNumExecutors: Map[Int, Int], + numLocalityAwareTasksPerResourceProfileId: Map[Int, Int], + hostToLocalTaskCount: Map[Int, Map[String, Int]]): Boolean /** - * Request an additional number of executors from the cluster manager. + * Request an additional number of executors from the cluster manager for the default + * ResourceProfile. * @return whether the request is acknowledged by the cluster manager. */ def requestExecutors(numAdditionalExecutors: Int): Boolean diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 5114cf70e3f26..5cb3160711a90 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -29,6 +29,8 @@ import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL import org.apache.spark.metrics.source.Source +import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID +import org.apache.spark.resource.ResourceProfileManager import org.apache.spark.scheduler._ import org.apache.spark.scheduler.dynalloc.ExecutorMonitor import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} @@ -36,9 +38,9 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} /** * An agent that dynamically allocates and removes executors based on the workload. 
* - * The ExecutorAllocationManager maintains a moving target number of executors which is periodically - * synced to the cluster manager. The target starts at a configured initial value and changes with - * the number of pending and running tasks. + * The ExecutorAllocationManager maintains a moving target number of executors, for each + * ResourceProfile, which is periodically synced to the cluster manager. The target starts + * at a configured initial value and changes with the number of pending and running tasks. * * Decreasing the target number of executors happens when the current target is more than needed to * handle the current load. The target number of executors is always truncated to the number of @@ -57,14 +59,18 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} * quickly over time in case the maximum number of executors is very high. Otherwise, it will take * a long time to ramp up under heavy workloads. * - * The remove policy is simpler: If an executor has been idle for K seconds, meaning it has not - * been scheduled to run any tasks, then it is removed. Note that an executor caching any data + * The remove policy is simpler and is applied on each ResourceProfile separately. If an executor + * for that ResourceProfile has been idle for K seconds and the number of executors is more + * then what is needed for that ResourceProfile, meaning there are not enough tasks that could use + * the executor, then it is removed. Note that an executor caching any data * blocks will be removed if it has been idle for more than L seconds. * * There is no retry logic in either case because we make the assumption that the cluster manager * will eventually fulfill all requests it receives asynchronously. * - * The relevant Spark properties include the following: + * The relevant Spark properties are below. Each of these properties applies separately to + * every ResourceProfile. 
So if you set a minimum number of executors, that is a minimum + * for each ResourceProfile. * * spark.dynamicAllocation.enabled - Whether this feature is enabled * spark.dynamicAllocation.minExecutors - Lower bound on the number of executors @@ -95,7 +101,8 @@ private[spark] class ExecutorAllocationManager( listenerBus: LiveListenerBus, conf: SparkConf, cleaner: Option[ContextCleaner] = None, - clock: Clock = new SystemClock()) + clock: Clock = new SystemClock(), + resourceProfileManager: ResourceProfileManager) extends Logging { allocationManager => @@ -117,23 +124,23 @@ private[spark] class ExecutorAllocationManager( // During testing, the methods to actually kill and add executors are mocked out private val testing = conf.get(DYN_ALLOCATION_TESTING) - // TODO: The default value of 1 for spark.executor.cores works right now because dynamic - // allocation is only supported for YARN and the default number of cores per executor in YARN is - // 1, but it might need to be attained differently for different cluster managers - private val tasksPerExecutorForFullParallelism = - conf.get(EXECUTOR_CORES) / conf.get(CPUS_PER_TASK) - private val executorAllocationRatio = conf.get(DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO) + private val defaultProfileId = resourceProfileManager.defaultResourceProfile.id + validateSettings() - // Number of executors to add in the next round - private var numExecutorsToAdd = 1 + // Number of executors to add for each ResourceProfile in the next round + private val numExecutorsToAddPerResourceProfileId = new mutable.HashMap[Int, Int] + numExecutorsToAddPerResourceProfileId(defaultProfileId) = 1 // The desired number of executors at this moment in time. If all our executors were to die, this // is the number of executors we would immediately want from the cluster manager. 
- private var numExecutorsTarget = initialNumExecutors + // Note every profile will be allowed to have initial number, + // we may want to make this configurable per Profile in the future + private val numExecutorsTargetPerResourceProfileId = new mutable.HashMap[Int, Int] + numExecutorsTargetPerResourceProfileId(defaultProfileId) = initialNumExecutors // A timestamp of when an addition should be triggered, or NOT_SET if it is not set // This is set when pending tasks are added but not scheduled yet @@ -165,11 +172,12 @@ private[spark] class ExecutorAllocationManager( // (2) an executor idle timeout has elapsed. @volatile private var initializing: Boolean = true - // Number of locality aware tasks, used for executor placement. - private var localityAwareTasks = 0 + // Number of locality aware tasks for each ResourceProfile, used for executor placement. + private var numLocalityAwareTasksPerResourceProfileId = new mutable.HashMap[Int, Int] + numLocalityAwareTasksPerResourceProfileId(defaultProfileId) = 0 - // Host to possible task running on it, used for executor placement. - private var hostToLocalTaskCount: Map[String, Int] = Map.empty + // ResourceProfile id to Host to possible task running on it, used for executor placement. + private var rpIdToHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty /** * Verify that the settings specified through the config are valid. 
@@ -233,7 +241,14 @@ private[spark] class ExecutorAllocationManager( } executor.scheduleWithFixedDelay(scheduleTask, 0, intervalMillis, TimeUnit.MILLISECONDS) - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) + // copy the maps inside synchonize to ensure not being modified + val (numExecutorsTarget, numLocalityAware) = synchronized { + val numTarget = numExecutorsTargetPerResourceProfileId.toMap + val numLocality = numLocalityAwareTasksPerResourceProfileId.toMap + (numTarget, numLocality) + } + + client.requestTotalExecutors(numExecutorsTarget, numLocalityAware, rpIdToHostToLocalTaskCount) } /** @@ -253,23 +268,38 @@ private[spark] class ExecutorAllocationManager( */ def reset(): Unit = synchronized { addTime = 0L - numExecutorsTarget = initialNumExecutors + numExecutorsTargetPerResourceProfileId.keys.foreach { rpId => + numExecutorsTargetPerResourceProfileId(rpId) = initialNumExecutors + } executorMonitor.reset() } /** - * The maximum number of executors we would need under the current load to satisfy all running - * and pending tasks, rounded up. + * The maximum number of executors, for the ResourceProfile id passed in, that we would need + * under the current load to satisfy all running and pending tasks, rounded up. 
*/ - private def maxNumExecutorsNeeded(): Int = { - val numRunningOrPendingTasks = listener.totalPendingTasks + listener.totalRunningTasks - math.ceil(numRunningOrPendingTasks * executorAllocationRatio / - tasksPerExecutorForFullParallelism) - .toInt + private def maxNumExecutorsNeededPerResourceProfile(rpId: Int): Int = { + val pending = listener.totalPendingTasksPerResourceProfile(rpId) + val pendingSpeculative = listener.pendingSpeculativeTasksPerResourceProfile(rpId) + val running = listener.totalRunningTasksPerResourceProfile(rpId) + val numRunningOrPendingTasks = pending + running + val rp = resourceProfileManager.resourceProfileFromId(rpId) + val tasksPerExecutor = rp.maxTasksPerExecutor(conf) + logDebug(s"max needed for rpId: $rpId numpending: $numRunningOrPendingTasks," + + s" tasksperexecutor: $tasksPerExecutor") + val maxNeeded = math.ceil(numRunningOrPendingTasks * executorAllocationRatio / + tasksPerExecutor).toInt + if (tasksPerExecutor > 1 && maxNeeded == 1 && pendingSpeculative > 0) { + // If we have pending speculative tasks and only need a single executor, allocate one more + // to satisfy the locality requirements of speculation + maxNeeded + 1 + } else { + maxNeeded + } } - private def totalRunningTasks(): Int = synchronized { - listener.totalRunningTasks + private def totalRunningTasksPerResourceProfile(id: Int): Int = synchronized { + listener.totalRunningTasksPerResourceProfile(id) } /** @@ -288,14 +318,15 @@ private[spark] class ExecutorAllocationManager( } // Update executor target number only after initializing flag is unset - updateAndSyncNumExecutorsTarget(clock.getTimeMillis()) + updateAndSyncNumExecutorsTarget(clock.nanoTime()) if (executorIdsToBeRemoved.nonEmpty) { removeExecutors(executorIdsToBeRemoved) } } /** - * Updates our target number of executors and syncs the result with the cluster manager. + * Updates our target number of executors for each ResourceProfile and then syncs the result + * with the cluster manager. 
* * Check to see whether our existing allocation and the requests we've made previously exceed our * current needs. If so, truncate our target and let the cluster manager know so that it can @@ -307,136 +338,205 @@ private[spark] class ExecutorAllocationManager( * @return the delta in the target number of executors. */ private def updateAndSyncNumExecutorsTarget(now: Long): Int = synchronized { - val maxNeeded = maxNumExecutorsNeeded - if (initializing) { // Do not change our target while we are still initializing, // Otherwise the first job may have to ramp up unnecessarily 0 - } else if (maxNeeded < numExecutorsTarget) { - // The target number exceeds the number we actually need, so stop adding new - // executors and inform the cluster manager to cancel the extra pending requests - val oldNumExecutorsTarget = numExecutorsTarget - numExecutorsTarget = math.max(maxNeeded, minNumExecutors) - numExecutorsToAdd = 1 - - // If the new target has not changed, avoid sending a message to the cluster manager - if (numExecutorsTarget < oldNumExecutorsTarget) { - // We lower the target number of executors but don't actively kill any yet. Killing is - // controlled separately by an idle timeout. It's still helpful to reduce the target number - // in case an executor just happens to get lost (eg., bad hardware, or the cluster manager - // preempts it) -- in that case, there is no point in trying to immediately get a new - // executor, since we wouldn't even use it yet. 
- client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) - logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " + - s"$oldNumExecutorsTarget) because not all requested executors are actually needed") + } else { + val updatesNeeded = new mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates] + + // Update targets for all ResourceProfiles then do a single request to the cluster manager + numExecutorsTargetPerResourceProfileId.foreach { case (rpId, targetExecs) => + val maxNeeded = maxNumExecutorsNeededPerResourceProfile(rpId) + if (maxNeeded < targetExecs) { + // The target number exceeds the number we actually need, so stop adding new + // executors and inform the cluster manager to cancel the extra pending requests + + // We lower the target number of executors but don't actively kill any yet. Killing is + // controlled separately by an idle timeout. It's still helpful to reduce + // the target number in case an executor just happens to get lost (eg., bad hardware, + // or the cluster manager preempts it) -- in that case, there is no point in trying + // to immediately get a new executor, since we wouldn't even use it yet. 
+ decrementExecutorsFromTarget(maxNeeded, rpId, updatesNeeded) + } else if (addTime != NOT_SET && now >= addTime) { + addExecutorsToTarget(maxNeeded, rpId, updatesNeeded) + } + } + doUpdateRequest(updatesNeeded.toMap, now) + } + } + + private def addExecutorsToTarget( + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + updateTargetExecs(addExecutors, maxNeeded, rpId, updatesNeeded) + } + + private def decrementExecutorsFromTarget( + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + updateTargetExecs(decrementExecutors, maxNeeded, rpId, updatesNeeded) + } + + private def updateTargetExecs( + updateTargetFn: (Int, Int) => Int, + maxNeeded: Int, + rpId: Int, + updatesNeeded: mutable.HashMap[Int, ExecutorAllocationManager.TargetNumUpdates]): Int = { + val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) + // update the target number (add or remove) + val delta = updateTargetFn(maxNeeded, rpId) + if (delta != 0) { + updatesNeeded(rpId) = ExecutorAllocationManager.TargetNumUpdates(delta, oldNumExecutorsTarget) + } + delta + } + + private def doUpdateRequest( + updates: Map[Int, ExecutorAllocationManager.TargetNumUpdates], + now: Long): Int = { + // Only call cluster manager if target has changed. + if (updates.size > 0) { + val requestAcknowledged = try { + logDebug("requesting updates: " + updates) + testing || + client.requestTotalExecutors( + numExecutorsTargetPerResourceProfileId.toMap, + numLocalityAwareTasksPerResourceProfileId.toMap, + rpIdToHostToLocalTaskCount) + } catch { + case NonFatal(e) => + // Use INFO level so the error it doesn't show up by default in shells. + // Errors here are more commonly caused by YARN AM restarts, which is a recoverable + // issue, and generate a lot of noisy output. 
+ logInfo("Error reaching cluster manager.", e) + false + } + if (requestAcknowledged) { + // have to go through all resource profiles that changed + var totalDelta = 0 + updates.foreach { case (rpId, targetNum) => + val delta = targetNum.delta + totalDelta += delta + if (delta > 0) { + val executorsString = "executor" + { if (delta > 1) "s" else "" } + logInfo(s"Requesting $delta new $executorsString because tasks are backlogged " + + s"(new desired total will be ${numExecutorsTargetPerResourceProfileId(rpId)} " + + s"for resource profile id: ${rpId})") + numExecutorsToAddPerResourceProfileId(rpId) = + if (delta == numExecutorsToAddPerResourceProfileId(rpId)) { + numExecutorsToAddPerResourceProfileId(rpId) * 2 + } else { + 1 + } + logDebug(s"Starting timer to add more executors (to " + + s"expire in $sustainedSchedulerBacklogTimeoutS seconds)") + addTime = now + TimeUnit.SECONDS.toNanos(sustainedSchedulerBacklogTimeoutS) + } else { + logDebug(s"Lowering target number of executors to" + + s" ${numExecutorsTargetPerResourceProfileId(rpId)} (previously " + + s"$targetNum.oldNumExecutorsTarget for resource profile id: ${rpId}) " + + "because not all requested executors " + + "are actually needed") + } + } + totalDelta + } else { + // request was for all profiles so we have to go through all to reset to old num + updates.foreach { case (rpId, targetNum) => + logWarning("Unable to reach the cluster manager to request more executors!") + numExecutorsTargetPerResourceProfileId(rpId) = targetNum.oldNumExecutorsTarget + } + 0 } - numExecutorsTarget - oldNumExecutorsTarget - } else if (addTime != NOT_SET && now >= addTime) { - val delta = addExecutors(maxNeeded) - logDebug(s"Starting timer to add more executors (to " + - s"expire in $sustainedSchedulerBacklogTimeoutS seconds)") - addTime = now + (sustainedSchedulerBacklogTimeoutS * 1000) - delta } else { + logDebug("No change in number of executors") 0 } } + private def decrementExecutors(maxNeeded: Int, rpId: Int): Int = { 
+ val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) + numExecutorsTargetPerResourceProfileId(rpId) = math.max(maxNeeded, minNumExecutors) + numExecutorsToAddPerResourceProfileId(rpId) = 1 + numExecutorsTargetPerResourceProfileId(rpId) - oldNumExecutorsTarget + } + /** - * Request a number of executors from the cluster manager. + * Update the target number of executors and figure out how many to add. * If the cap on the number of executors is reached, give up and reset the * number of executors to add next round instead of continuing to double it. * * @param maxNumExecutorsNeeded the maximum number of executors all currently running or pending * tasks could fill + * @param rpId the ResourceProfile id of the executors * @return the number of additional executors actually requested. */ - private def addExecutors(maxNumExecutorsNeeded: Int): Int = { + private def addExecutors(maxNumExecutorsNeeded: Int, rpId: Int): Int = { + val oldNumExecutorsTarget = numExecutorsTargetPerResourceProfileId(rpId) // Do not request more executors if it would put our target over the upper bound - if (numExecutorsTarget >= maxNumExecutors) { - logDebug(s"Not adding executors because our current target total " + - s"is already $numExecutorsTarget (limit $maxNumExecutors)") - numExecutorsToAdd = 1 + // this is doing a max check per ResourceProfile + if (oldNumExecutorsTarget >= maxNumExecutors) { + logDebug("Not adding executors because our current target total " + + s"is already ${oldNumExecutorsTarget} (limit $maxNumExecutors)") + numExecutorsToAddPerResourceProfileId(rpId) = 1 return 0 } - - val oldNumExecutorsTarget = numExecutorsTarget // There's no point in wasting time ramping up to the number of executors we already have, so // make sure our target is at least as much as our current allocation: - numExecutorsTarget = math.max(numExecutorsTarget, executorMonitor.executorCount) + var numExecutorsTarget = math.max(numExecutorsTargetPerResourceProfileId(rpId), + 
executorMonitor.executorCountWithResourceProfile(rpId)) // Boost our target with the number to add for this round: - numExecutorsTarget += numExecutorsToAdd + numExecutorsTarget += numExecutorsToAddPerResourceProfileId(rpId) // Ensure that our target doesn't exceed what we need at the present moment: numExecutorsTarget = math.min(numExecutorsTarget, maxNumExecutorsNeeded) // Ensure that our target fits within configured bounds: numExecutorsTarget = math.max(math.min(numExecutorsTarget, maxNumExecutors), minNumExecutors) - val delta = numExecutorsTarget - oldNumExecutorsTarget + numExecutorsTargetPerResourceProfileId(rpId) = numExecutorsTarget // If our target has not changed, do not send a message // to the cluster manager and reset our exponential growth if (delta == 0) { - // Check if there is any speculative jobs pending - if (listener.pendingTasks == 0 && listener.pendingSpeculativeTasks > 0) { - numExecutorsTarget = - math.max(math.min(maxNumExecutorsNeeded + 1, maxNumExecutors), minNumExecutors) - } else { - numExecutorsToAdd = 1 - return 0 - } - } - - val addRequestAcknowledged = try { - testing || - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) - } catch { - case NonFatal(e) => - // Use INFO level so the error it doesn't show up by default in shells. Errors here are more - // commonly caused by YARN AM restarts, which is a recoverable issue, and generate a lot of - // noisy output. 
- logInfo("Error reaching cluster manager.", e) - false - } - if (addRequestAcknowledged) { - val executorsString = "executor" + { if (delta > 1) "s" else "" } - logInfo(s"Requesting $delta new $executorsString because tasks are backlogged" + - s" (new desired total will be $numExecutorsTarget)") - numExecutorsToAdd = if (delta == numExecutorsToAdd) { - numExecutorsToAdd * 2 - } else { - 1 - } - delta - } else { - logWarning( - s"Unable to reach the cluster manager to request $numExecutorsTarget total executors!") - numExecutorsTarget = oldNumExecutorsTarget - 0 + numExecutorsToAddPerResourceProfileId(rpId) = 1 } + delta } /** * Request the cluster manager to remove the given executors. * Returns the list of executors which are removed. */ - private def removeExecutors(executors: Seq[String]): Seq[String] = synchronized { + private def removeExecutors(executors: Seq[(String, Int)]): Seq[String] = synchronized { val executorIdsToBeRemoved = new ArrayBuffer[String] - - logInfo("Request to remove executorIds: " + executors.mkString(", ")) - val numExistingExecutors = executorMonitor.executorCount - executorMonitor.pendingRemovalCount - - var newExecutorTotal = numExistingExecutors - executors.foreach { executorIdToBeRemoved => - if (newExecutorTotal - 1 < minNumExecutors) { - logDebug(s"Not removing idle executor $executorIdToBeRemoved because there are only " + - s"$newExecutorTotal executor(s) left (minimum number of executor limit $minNumExecutors)") - } else if (newExecutorTotal - 1 < numExecutorsTarget) { - logDebug(s"Not removing idle executor $executorIdToBeRemoved because there are only " + - s"$newExecutorTotal executor(s) left (number of executor target $numExecutorsTarget)") + logDebug(s"Request to remove executorIds: ${executors.mkString(", ")}") + val numExecutorsTotalPerRpId = mutable.Map[Int, Int]() + executors.foreach { case (executorIdToBeRemoved, rpId) => + if (rpId == UNKNOWN_RESOURCE_PROFILE_ID) { + if (testing) { + throw new 
SparkException("ResourceProfile Id was UNKNOWN, this is not expected") + } + logWarning(s"Not removing executor $executorIdsToBeRemoved because the " + + "ResourceProfile was UNKNOWN!") } else { - executorIdsToBeRemoved += executorIdToBeRemoved - newExecutorTotal -= 1 + // get the running total as we remove or initialize it to the count - pendingRemoval + val newExecutorTotal = numExecutorsTotalPerRpId.getOrElseUpdate(rpId, + (executorMonitor.executorCountWithResourceProfile(rpId) - + executorMonitor.pendingRemovalCountPerResourceProfileId(rpId))) + if (newExecutorTotal - 1 < minNumExecutors) { + logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " + + s"are only $newExecutorTotal executor(s) left (minimum number of executor limit " + + s"$minNumExecutors)") + } else if (newExecutorTotal - 1 < numExecutorsTargetPerResourceProfileId(rpId)) { + logDebug(s"Not removing idle executor $executorIdToBeRemoved because there " + + s"are only $newExecutorTotal executor(s) left (number of executor " + + s"target ${numExecutorsTargetPerResourceProfileId(rpId)})") + } else { + executorIdsToBeRemoved += executorIdToBeRemoved + numExecutorsTotalPerRpId(rpId) -= 1 + } } } @@ -456,14 +556,15 @@ private[spark] class ExecutorAllocationManager( // [SPARK-21834] killExecutors api reduces the target number of executors. // So we need to update the target with desired value. - client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount) + client.requestTotalExecutors( + numExecutorsTargetPerResourceProfileId.toMap, + numLocalityAwareTasksPerResourceProfileId.toMap, + rpIdToHostToLocalTaskCount) + // reset the newExecutorTotal to the existing number of executors - newExecutorTotal = numExistingExecutors if (testing || executorsRemoved.nonEmpty) { - newExecutorTotal -= executorsRemoved.size executorMonitor.executorsKilled(executorsRemoved) - logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout." 
+ - s"(new desired total will be $newExecutorTotal)") + logInfo(s"Executors ${executorsRemoved.mkString(",")} removed due to idle timeout.") executorsRemoved } else { logWarning(s"Unable to reach the cluster manager to kill executor/s " + @@ -481,7 +582,7 @@ private[spark] class ExecutorAllocationManager( if (addTime == NOT_SET) { logDebug(s"Starting timer to add executors because pending tasks " + s"are building up (to expire in $schedulerBacklogTimeoutS seconds)") - addTime = clock.getTimeMillis + schedulerBacklogTimeoutS * 1000 + addTime = clock.nanoTime() + TimeUnit.SECONDS.toNanos(schedulerBacklogTimeoutS) } } @@ -492,7 +593,7 @@ private[spark] class ExecutorAllocationManager( private def onSchedulerQueueEmpty(): Unit = synchronized { logDebug("Clearing timer to add executors because there are no more pending tasks") addTime = NOT_SET - numExecutorsToAdd = 1 + numExecutorsToAddPerResourceProfileId.transform { case (_, _) => 1 } } private case class StageAttempt(stageId: Int, stageAttemptId: Int) { @@ -512,18 +613,22 @@ private[spark] class ExecutorAllocationManager( // Should be 0 when no stages are active. 
private val stageAttemptToNumRunningTask = new mutable.HashMap[StageAttempt, Int] private val stageAttemptToTaskIndices = new mutable.HashMap[StageAttempt, mutable.HashSet[Int]] - // Number of speculative tasks to be scheduled in each stageAttempt + // Number of speculative tasks pending/running in each stageAttempt private val stageAttemptToNumSpeculativeTasks = new mutable.HashMap[StageAttempt, Int] // The speculative tasks started in each stageAttempt private val stageAttemptToSpeculativeTaskIndices = new mutable.HashMap[StageAttempt, mutable.HashSet[Int]] + private val resourceProfileIdToStageAttempt = + new mutable.HashMap[Int, mutable.Set[StageAttempt]] + // stageAttempt to tuple (the number of task with locality preferences, a map where each pair - // is a node and the number of tasks that would like to be scheduled on that node) map, + // is a node and the number of tasks that would like to be scheduled on that node, and + // the resource profile id) map, // maintain the executor placement hints for each stageAttempt used by resource framework // to better place the executors. 
private val stageAttemptToExecutorPlacementHints = - new mutable.HashMap[StageAttempt, (Int, Map[String, Int])] + new mutable.HashMap[StageAttempt, (Int, Map[String, Int], Int)] override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { initializing = false @@ -534,6 +639,13 @@ private[spark] class ExecutorAllocationManager( allocationManager.synchronized { stageAttemptToNumTasks(stageAttempt) = numTasks allocationManager.onSchedulerBacklogged() + // need to keep stage task requirements to ask for the right containers + val profId = stageSubmitted.stageInfo.resourceProfileId + logDebug(s"Stage resource profile id is: $profId with numTasks: $numTasks") + resourceProfileIdToStageAttempt.getOrElseUpdate( + profId, new mutable.HashSet[StageAttempt]) += stageAttempt + numExecutorsToAddPerResourceProfileId.getOrElseUpdate(profId, 1) + numExecutorsTargetPerResourceProfileId.getOrElseUpdate(profId, initialNumExecutors) // Compute the number of tasks requested by the stage on each host var numTasksPending = 0 @@ -548,7 +660,7 @@ private[spark] class ExecutorAllocationManager( } } stageAttemptToExecutorPlacementHints.put(stageAttempt, - (numTasksPending, hostToLocalTaskCountPerStage.toMap)) + (numTasksPending, hostToLocalTaskCountPerStage.toMap, profId)) // Update the executor placement hints updateExecutorPlacementHints() @@ -560,7 +672,7 @@ private[spark] class ExecutorAllocationManager( val stageAttemptId = stageCompleted.stageInfo.attemptNumber() val stageAttempt = StageAttempt(stageId, stageAttemptId) allocationManager.synchronized { - // do NOT remove stageAttempt from stageAttemptToNumRunningTasks, + // do NOT remove stageAttempt from stageAttemptToNumRunningTask // because the attempt may still have running tasks, // even after another attempt for the stage is submitted. 
stageAttemptToNumTasks -= stageAttempt @@ -596,7 +708,7 @@ private[spark] class ExecutorAllocationManager( stageAttemptToTaskIndices.getOrElseUpdate(stageAttempt, new mutable.HashSet[Int]) += taskIndex } - if (totalPendingTasks() == 0) { + if (!hasPendingTasks) { allocationManager.onSchedulerQueueEmpty() } } @@ -612,20 +724,45 @@ private[spark] class ExecutorAllocationManager( stageAttemptToNumRunningTask(stageAttempt) -= 1 if (stageAttemptToNumRunningTask(stageAttempt) == 0) { stageAttemptToNumRunningTask -= stageAttempt + if (!stageAttemptToNumTasks.contains(stageAttempt)) { + val rpForStage = resourceProfileIdToStageAttempt.filter { case (k, v) => + v.contains(stageAttempt) + }.keys + if (rpForStage.size == 1) { + // be careful about the removal from here due to late tasks, make sure stage is + // really complete and no tasks left + resourceProfileIdToStageAttempt(rpForStage.head) -= stageAttempt + } else { + logWarning(s"Should have exactly one resource profile for stage $stageAttempt," + + s" but have $rpForStage") + } + } + } } - // If the task failed, we expect it to be resubmitted later. To ensure we have - // enough resources to run the resubmitted task, we need to mark the scheduler - // as backlogged again if it's not already marked as such (SPARK-8366) - if (taskEnd.reason != Success) { - if (totalPendingTasks() == 0) { - allocationManager.onSchedulerBacklogged() - } - if (taskEnd.taskInfo.speculative) { - stageAttemptToSpeculativeTaskIndices.get(stageAttempt).foreach {_.remove(taskIndex)} - } else { - stageAttemptToTaskIndices.get(stageAttempt).foreach {_.remove(taskIndex)} - } + if (taskEnd.taskInfo.speculative) { + stageAttemptToSpeculativeTaskIndices.get(stageAttempt).foreach {_.remove{taskIndex}} + stageAttemptToNumSpeculativeTasks(stageAttempt) -= 1 + } + + taskEnd.reason match { + case Success | _: TaskKilled => + case _ => + if (!hasPendingTasks) { + // If the task failed (not intentionally killed), we expect it to be resubmitted + // later. 
To ensure we have enough resources to run the resubmitted task, we need to + // mark the scheduler as backlogged again if it's not already marked as such + // (SPARK-8366) + allocationManager.onSchedulerBacklogged() + } + if (!taskEnd.taskInfo.speculative) { + // If a non-speculative task is intentionally killed, it means the speculative task + // has succeeded, and no further task of this task index will be resubmitted. In this + // case, the task index is completed and we shouldn't remove it from + // stageAttemptToTaskIndices. Otherwise, we will have a pending non-speculative task + // for the task index (SPARK-30511) + stageAttemptToTaskIndices.get(stageAttempt).foreach {_.remove(taskIndex)} + } } } } @@ -648,20 +785,46 @@ private[spark] class ExecutorAllocationManager( * * Note: This is not thread-safe without the caller owning the `allocationManager` lock. */ - def pendingTasks(): Int = { - stageAttemptToNumTasks.map { case (stageAttempt, numTasks) => - numTasks - stageAttemptToTaskIndices.get(stageAttempt).map(_.size).getOrElse(0) - }.sum + def pendingTasksPerResourceProfile(rpId: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rpId, Set.empty).toSeq + attempts.map(attempt => getPendingTaskSum(attempt)).sum } - def pendingSpeculativeTasks(): Int = { - stageAttemptToNumSpeculativeTasks.map { case (stageAttempt, numTasks) => - numTasks - stageAttemptToSpeculativeTaskIndices.get(stageAttempt).map(_.size).getOrElse(0) - }.sum + def hasPendingRegularTasks: Boolean = { + val attemptSets = resourceProfileIdToStageAttempt.values + attemptSets.exists(attempts => attempts.exists(getPendingTaskSum(_) > 0)) } - def totalPendingTasks(): Int = { - pendingTasks + pendingSpeculativeTasks + private def getPendingTaskSum(attempt: StageAttempt): Int = { + val numTotalTasks = stageAttemptToNumTasks.getOrElse(attempt, 0) + val numRunning = stageAttemptToTaskIndices.get(attempt).map(_.size).getOrElse(0) + numTotalTasks - numRunning + } + + def 
pendingSpeculativeTasksPerResourceProfile(rp: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rp, Set.empty).toSeq + attempts.map(attempt => getPendingSpeculativeTaskSum(attempt)).sum + } + + def hasPendingSpeculativeTasks: Boolean = { + val attemptSets = resourceProfileIdToStageAttempt.values + attemptSets.exists { attempts => + attempts.exists(getPendingSpeculativeTaskSum(_) > 0) + } + } + + private def getPendingSpeculativeTaskSum(attempt: StageAttempt): Int = { + val numTotalTasks = stageAttemptToNumSpeculativeTasks.getOrElse(attempt, 0) + val numRunning = stageAttemptToSpeculativeTaskIndices.get(attempt).map(_.size).getOrElse(0) + numTotalTasks - numRunning + } + + def hasPendingTasks: Boolean = { + hasPendingSpeculativeTasks || hasPendingRegularTasks + } + + def totalPendingTasksPerResourceProfile(rp: Int): Int = { + pendingTasksPerResourceProfile(rp) + pendingSpeculativeTasksPerResourceProfile(rp) } /** @@ -672,6 +835,14 @@ private[spark] class ExecutorAllocationManager( stageAttemptToNumRunningTask.values.sum } + def totalRunningTasksPerResourceProfile(rp: Int): Int = { + val attempts = resourceProfileIdToStageAttempt.getOrElse(rp, Set.empty).toSeq + // attempts is a Set, change to Seq so we keep all values + attempts.map { attempt => + stageAttemptToNumRunningTask.getOrElseUpdate(attempt, 0) + }.sum + } + /** * Update the Executor placement hints (the number of tasks with locality preferences, * a map where each pair is a node and the number of tasks that would like to be scheduled @@ -681,18 +852,27 @@ private[spark] class ExecutorAllocationManager( * granularity within stages. 
*/ def updateExecutorPlacementHints(): Unit = { - var localityAwareTasks = 0 - val localityToCount = new mutable.HashMap[String, Int]() - stageAttemptToExecutorPlacementHints.values.foreach { case (numTasksPending, localities) => - localityAwareTasks += numTasksPending - localities.foreach { case (hostname, count) => - val updatedCount = localityToCount.getOrElse(hostname, 0) + count - localityToCount(hostname) = updatedCount - } + val localityAwareTasksPerResourceProfileId = new mutable.HashMap[Int, Int] + + // ResourceProfile id => map[host, count] + val rplocalityToCount = new mutable.HashMap[Int, mutable.HashMap[String, Int]]() + stageAttemptToExecutorPlacementHints.values.foreach { + case (numTasksPending, localities, rpId) => + val rpNumPending = + localityAwareTasksPerResourceProfileId.getOrElse(rpId, 0) + localityAwareTasksPerResourceProfileId(rpId) = rpNumPending + numTasksPending + localities.foreach { case (hostname, count) => + val rpBasedHostToCount = + rplocalityToCount.getOrElseUpdate(rpId, new mutable.HashMap[String, Int]) + val newUpdated = rpBasedHostToCount.getOrElse(hostname, 0) + count + rpBasedHostToCount(hostname) = newUpdated + } } - allocationManager.localityAwareTasks = localityAwareTasks - allocationManager.hostToLocalTaskCount = localityToCount.toMap + allocationManager.numLocalityAwareTasksPerResourceProfileId = + localityAwareTasksPerResourceProfileId + allocationManager.rpIdToHostToLocalTaskCount = + rplocalityToCount.map { case (k, v) => (k, v.toMap)}.toMap } } @@ -713,14 +893,22 @@ private[spark] class ExecutorAllocationManager( }) } - registerGauge("numberExecutorsToAdd", numExecutorsToAdd, 0) + // The metrics are going to return the sum for all the different ResourceProfiles. 
+ registerGauge("numberExecutorsToAdd", + numExecutorsToAddPerResourceProfileId.values.sum, 0) registerGauge("numberExecutorsPendingToRemove", executorMonitor.pendingRemovalCount, 0) registerGauge("numberAllExecutors", executorMonitor.executorCount, 0) - registerGauge("numberTargetExecutors", numExecutorsTarget, 0) - registerGauge("numberMaxNeededExecutors", maxNumExecutorsNeeded(), 0) + registerGauge("numberTargetExecutors", + numExecutorsTargetPerResourceProfileId.values.sum, 0) + registerGauge("numberMaxNeededExecutors", numExecutorsTargetPerResourceProfileId.keys + .map(maxNumExecutorsNeededPerResourceProfile(_)).sum, 0) } } private object ExecutorAllocationManager { val NOT_SET = Long.MaxValue + + // helper case class for requesting executors, here to be visible for testing + private[spark] case class TargetNumUpdates(delta: Int, oldNumExecutorsTarget: Int) + } diff --git a/core/src/main/scala/org/apache/spark/FutureAction.scala b/core/src/main/scala/org/apache/spark/FutureAction.scala index 8230533f9d245..4bdcafce0d75a 100644 --- a/core/src/main/scala/org/apache/spark/FutureAction.scala +++ b/core/src/main/scala/org/apache/spark/FutureAction.scala @@ -115,7 +115,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc: @volatile private var _cancelled: Boolean = false - override def cancel() { + override def cancel(): Unit = { _cancelled = true jobWaiter.cancel() } @@ -132,7 +132,7 @@ class SimpleFutureAction[T] private[spark](jobWaiter: JobWaiter[_], resultFunc: value.get.get } - override def onComplete[U](func: (Try[T]) => U)(implicit executor: ExecutionContext) { + override def onComplete[U](func: (Try[T]) => U)(implicit executor: ExecutionContext): Unit = { jobWaiter.completionFuture onComplete {_ => func(value.get)} } diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala index 20224eb721c09..2ac72e66d6f32 100644 --- 
a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala +++ b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala @@ -27,6 +27,9 @@ import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Network import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor +import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend +import org.apache.spark.scheduler.local.LocalSchedulerBackend import org.apache.spark.storage.BlockManagerId import org.apache.spark.util._ @@ -199,14 +202,30 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock) if (now - lastSeenMs > executorTimeoutMs) { logWarning(s"Removing executor $executorId with no recent heartbeats: " + s"${now - lastSeenMs} ms exceeds timeout $executorTimeoutMs ms") - scheduler.executorLost(executorId, SlaveLost("Executor heartbeat " + - s"timed out after ${now - lastSeenMs} ms")) - // Asynchronously kill the executor to avoid blocking the current thread + // Asynchronously kill the executor to avoid blocking the current thread killExecutorThread.submit(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { // Note: we want to get an executor back after expiring this one, // so do not simply call `sc.killExecutor` here (SPARK-8119) sc.killAndReplaceExecutor(executorId) + // SPARK-27348: in case of the executors which are not gracefully shut down, + // we should remove lost executors from CoarseGrainedSchedulerBackend manually + // here to guarantee two things: + // 1) explicitly remove executor information from CoarseGrainedSchedulerBackend for + // a lost executor instead of waiting for disconnect message + // 2) call scheduler.executorLost() underlying to fail any tasks assigned to + // those executors to avoid app hang + sc.schedulerBackend match { + case backend: 
CoarseGrainedSchedulerBackend => + backend.driverEndpoint.send(RemoveExecutor(executorId, + SlaveLost(s"Executor heartbeat timed out after ${now - lastSeenMs} ms"))) + + // LocalSchedulerBackend is used locally and only has one single executor + case _: LocalSchedulerBackend => + + case other => throw new UnsupportedOperationException( + s"Unknown scheduler backend: ${other.getClass}") + } } }) executorLastSeen.remove(executorId) diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index d878fc527791a..f229061a6d0f6 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -17,22 +17,25 @@ package org.apache.spark -import java.io._ +import java.io.{ByteArrayInputStream, ObjectInputStream, ObjectOutputStream} import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit} -import java.util.zip.{GZIPInputStream, GZIPOutputStream} +import java.util.concurrent.locks.ReentrantReadWriteLock import scala.collection.JavaConverters._ -import scala.collection.mutable.{HashMap, HashSet, ListBuffer, Map} +import scala.collection.mutable.{HashMap, ListBuffer, Map} import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import scala.util.control.NonFatal +import org.apache.commons.io.output.{ByteArrayOutputStream => ApacheByteArrayOutputStream} + import org.apache.spark.broadcast.{Broadcast, BroadcastManager} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ +import org.apache.spark.io.CompressionCodec import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} -import org.apache.spark.scheduler.MapStatus +import org.apache.spark.scheduler.{ExecutorCacheTaskLocation, MapStatus} import org.apache.spark.shuffle.MetadataFetchFailedException import 
org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockId} import org.apache.spark.util._ @@ -41,14 +44,36 @@ import org.apache.spark.util._ * Helper class used by the [[MapOutputTrackerMaster]] to perform bookkeeping for a single * ShuffleMapStage. * - * This class maintains a mapping from mapIds to `MapStatus`. It also maintains a cache of + * This class maintains a mapping from map index to `MapStatus`. It also maintains a cache of * serialized map statuses in order to speed up tasks' requests for map output statuses. * * All public methods of this class are thread-safe. */ private class ShuffleStatus(numPartitions: Int) { - // All accesses to the following state must be guarded with `this.synchronized`. + private val (readLock, writeLock) = { + val lock = new ReentrantReadWriteLock() + (lock.readLock(), lock.writeLock()) + } + + // All accesses to the following state must be guarded with `withReadLock` or `withWriteLock`. + private def withReadLock[B](fn: => B): B = { + readLock.lock() + try { + fn + } finally { + readLock.unlock() + } + } + + private def withWriteLock[B](fn: => B): B = { + writeLock.lock() + try { + fn + } finally { + writeLock.unlock() + } + } /** * MapStatus for each partition. The index of the array is the map partition id. @@ -88,12 +113,12 @@ private class ShuffleStatus(numPartitions: Int) { * Register a map output. If there is already a registered location for the map output then it * will be replaced by the new location. 
*/ - def addMapOutput(mapId: Int, status: MapStatus): Unit = synchronized { - if (mapStatuses(mapId) == null) { + def addMapOutput(mapIndex: Int, status: MapStatus): Unit = withWriteLock { + if (mapStatuses(mapIndex) == null) { _numAvailableOutputs += 1 invalidateSerializedMapOutputStatusCache() } - mapStatuses(mapId) = status + mapStatuses(mapIndex) = status } /** @@ -101,10 +126,10 @@ private class ShuffleStatus(numPartitions: Int) { * This is a no-op if there is no registered map output or if the registered output is from a * different block manager. */ - def removeMapOutput(mapId: Int, bmAddress: BlockManagerId): Unit = synchronized { - if (mapStatuses(mapId) != null && mapStatuses(mapId).location == bmAddress) { + def removeMapOutput(mapIndex: Int, bmAddress: BlockManagerId): Unit = withWriteLock { + if (mapStatuses(mapIndex) != null && mapStatuses(mapIndex).location == bmAddress) { _numAvailableOutputs -= 1 - mapStatuses(mapId) = null + mapStatuses(mapIndex) = null invalidateSerializedMapOutputStatusCache() } } @@ -113,7 +138,7 @@ private class ShuffleStatus(numPartitions: Int) { * Removes all shuffle outputs associated with this host. Note that this will also remove * outputs which are served by an external shuffle server (if one exists). */ - def removeOutputsOnHost(host: String): Unit = { + def removeOutputsOnHost(host: String): Unit = withWriteLock { removeOutputsByFilter(x => x.host == host) } @@ -122,7 +147,7 @@ private class ShuffleStatus(numPartitions: Int) { * remove outputs which are served by an external shuffle server (if one exists), as they are * still registered with that execId. */ - def removeOutputsOnExecutor(execId: String): Unit = synchronized { + def removeOutputsOnExecutor(execId: String): Unit = withWriteLock { removeOutputsByFilter(x => x.executorId == execId) } @@ -130,11 +155,11 @@ private class ShuffleStatus(numPartitions: Int) { * Removes all shuffle outputs which satisfies the filter. 
Note that this will also * remove outputs which are served by an external shuffle server (if one exists). */ - def removeOutputsByFilter(f: (BlockManagerId) => Boolean): Unit = synchronized { - for (mapId <- 0 until mapStatuses.length) { - if (mapStatuses(mapId) != null && f(mapStatuses(mapId).location)) { + def removeOutputsByFilter(f: BlockManagerId => Boolean): Unit = withWriteLock { + for (mapIndex <- mapStatuses.indices) { + if (mapStatuses(mapIndex) != null && f(mapStatuses(mapIndex).location)) { _numAvailableOutputs -= 1 - mapStatuses(mapId) = null + mapStatuses(mapIndex) = null invalidateSerializedMapOutputStatusCache() } } @@ -143,14 +168,14 @@ private class ShuffleStatus(numPartitions: Int) { /** * Number of partitions that have shuffle outputs. */ - def numAvailableOutputs: Int = synchronized { + def numAvailableOutputs: Int = withReadLock { _numAvailableOutputs } /** * Returns the sequence of partition ids that are missing (i.e. needs to be computed). */ - def findMissingPartitions(): Seq[Int] = synchronized { + def findMissingPartitions(): Seq[Int] = withReadLock { val missing = (0 until numPartitions).filter(id => mapStatuses(id) == null) assert(missing.size == numPartitions - _numAvailableOutputs, s"${missing.size} missing, expected ${numPartitions - _numAvailableOutputs}") @@ -169,18 +194,32 @@ private class ShuffleStatus(numPartitions: Int) { def serializedMapStatus( broadcastManager: BroadcastManager, isLocal: Boolean, - minBroadcastSize: Int): Array[Byte] = synchronized { - if (cachedSerializedMapStatus eq null) { - val serResult = MapOutputTracker.serializeMapStatuses( - mapStatuses, broadcastManager, isLocal, minBroadcastSize) - cachedSerializedMapStatus = serResult._1 - cachedSerializedBroadcast = serResult._2 + minBroadcastSize: Int, + conf: SparkConf): Array[Byte] = { + var result: Array[Byte] = null + + withReadLock { + if (cachedSerializedMapStatus != null) { + result = cachedSerializedMapStatus + } } - cachedSerializedMapStatus + + if 
(result == null) withWriteLock { + if (cachedSerializedMapStatus == null) { + val serResult = MapOutputTracker.serializeMapStatuses( + mapStatuses, broadcastManager, isLocal, minBroadcastSize, conf) + cachedSerializedMapStatus = serResult._1 + cachedSerializedBroadcast = serResult._2 + } + // The following line has to be outside if statement since it's possible that another thread + // initializes cachedSerializedMapStatus in-between `withReadLock` and `withWriteLock`. + result = cachedSerializedMapStatus + } + result } // Used in testing. - def hasCachedSerializedBroadcast: Boolean = synchronized { + def hasCachedSerializedBroadcast: Boolean = withReadLock { cachedSerializedBroadcast != null } @@ -188,14 +227,14 @@ private class ShuffleStatus(numPartitions: Int) { * Helper function which provides thread-safe access to the mapStatuses array. * The function should NOT mutate the array. */ - def withMapStatuses[T](f: Array[MapStatus] => T): T = synchronized { + def withMapStatuses[T](f: Array[MapStatus] => T): T = withReadLock { f(mapStatuses) } /** * Clears the cached serialized map output statuses. */ - def invalidateSerializedMapOutputStatusCache(): Unit = synchronized { + def invalidateSerializedMapOutputStatusCache(): Unit = withWriteLock { if (cachedSerializedBroadcast != null) { // Prevent errors during broadcast cleanup from crashing the DAGScheduler (see SPARK-21444) Utils.tryLogNonFatalError { @@ -272,7 +311,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging } /** Send a one-way message to the trackerEndpoint, to which we expect it to reply with true. 
*/ - protected def sendTracker(message: Any) { + protected def sendTracker(message: Any): Unit = { val response = askTracker[Boolean](message) if (response != true) { throw new SparkException( @@ -282,7 +321,7 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging // For testing def getMapSizesByExecutorId(shuffleId: Int, reduceId: Int) - : Iterator[(BlockManagerId, Seq[(BlockId, Long)])] = { + : Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { getMapSizesByExecutorId(shuffleId, reduceId, reduceId + 1) } @@ -292,18 +331,39 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging * endPartition is excluded from the range). * * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, - * and the second item is a sequence of (shuffle block id, shuffle block size) tuples - * describing the shuffle blocks that are stored at that block manager. + * and the second item is a sequence of (shuffle block id, shuffle block size, map index) + * tuples describing the shuffle blocks that are stored at that block manager. */ - def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) - : Iterator[(BlockManagerId, Seq[(BlockId, Long)])] + def getMapSizesByExecutorId( + shuffleId: Int, + startPartition: Int, + endPartition: Int) + : Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] + + /** + * Called from executors to get the server URIs and output sizes for each shuffle block that + * needs to be read from a given range of map output partitions (startPartition is included but + * endPartition is excluded from the range) and is produced by + * a range of mappers (startMapIndex, endMapIndex, startMapIndex is included and + * the endMapIndex is excluded). 
+ * + * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, + * and the second item is a sequence of (shuffle block id, shuffle block size, map index) + * tuples describing the shuffle blocks that are stored at that block manager. + */ + def getMapSizesByRange( + shuffleId: Int, + startMapIndex: Int, + endMapIndex: Int, + startPartition: Int, + endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] /** * Deletes map output status information for the specified shuffle stage. */ def unregisterShuffle(shuffleId: Int): Unit - def stop() {} + def stop(): Unit = {} } /** @@ -317,8 +377,8 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging */ private[spark] class MapOutputTrackerMaster( conf: SparkConf, - broadcastManager: BroadcastManager, - isLocal: Boolean) + private[spark] val broadcastManager: BroadcastManager, + private[spark] val isLocal: Boolean) extends MapOutputTracker(conf) { // The size at which we use Broadcast to send the map output statuses to the executors @@ -393,7 +453,8 @@ private[spark] class MapOutputTrackerMaster( " to " + hostPort) val shuffleStatus = shuffleStatuses.get(shuffleId).head context.reply( - shuffleStatus.serializedMapStatus(broadcastManager, isLocal, minSizeForBroadcast)) + shuffleStatus.serializedMapStatus(broadcastManager, isLocal, minSizeForBroadcast, + conf)) } catch { case NonFatal(e) => logError(e.getMessage, e) } @@ -412,21 +473,21 @@ private[spark] class MapOutputTrackerMaster( shuffleStatuses.valuesIterator.count(_.hasCachedSerializedBroadcast) } - def registerShuffle(shuffleId: Int, numMaps: Int) { + def registerShuffle(shuffleId: Int, numMaps: Int): Unit = { if (shuffleStatuses.put(shuffleId, new ShuffleStatus(numMaps)).isDefined) { throw new IllegalArgumentException("Shuffle ID " + shuffleId + " registered twice") } } - def registerMapOutput(shuffleId: Int, mapId: Int, status: MapStatus) { - 
shuffleStatuses(shuffleId).addMapOutput(mapId, status) + def registerMapOutput(shuffleId: Int, mapIndex: Int, status: MapStatus): Unit = { + shuffleStatuses(shuffleId).addMapOutput(mapIndex, status) } /** Unregister map output information of the given shuffle, mapper and block manager */ - def unregisterMapOutput(shuffleId: Int, mapId: Int, bmAddress: BlockManagerId) { + def unregisterMapOutput(shuffleId: Int, mapIndex: Int, bmAddress: BlockManagerId): Unit = { shuffleStatuses.get(shuffleId) match { case Some(shuffleStatus) => - shuffleStatus.removeMapOutput(mapId, bmAddress) + shuffleStatus.removeMapOutput(mapIndex, bmAddress) incrementEpoch() case None => throw new SparkException("unregisterMapOutput called for nonexistent shuffle ID") @@ -434,7 +495,7 @@ private[spark] class MapOutputTrackerMaster( } /** Unregister all map output information of the given shuffle. */ - def unregisterAllMapOutput(shuffleId: Int) { + def unregisterAllMapOutput(shuffleId: Int): Unit = { shuffleStatuses.get(shuffleId) match { case Some(shuffleStatus) => shuffleStatus.removeOutputsByFilter(x => true) @@ -446,7 +507,7 @@ private[spark] class MapOutputTrackerMaster( } /** Unregister shuffle data */ - def unregisterShuffle(shuffleId: Int) { + def unregisterShuffle(shuffleId: Int): Unit = { shuffleStatuses.remove(shuffleId).foreach { shuffleStatus => shuffleStatus.invalidateSerializedMapOutputStatusCache() } @@ -629,7 +690,36 @@ private[spark] class MapOutputTrackerMaster( None } - def incrementEpoch() { + /** + * Return the locations where the Mappers ran. The locations each includes both a host and an + * executor id on that host. + * + * @param dep shuffle dependency object + * @param startMapIndex the start map index + * @param endMapIndex the end map index + * @return a sequence of locations where task runs. 
+ */ + def getMapLocation( + dep: ShuffleDependency[_, _, _], + startMapIndex: Int, + endMapIndex: Int): Seq[String] = + { + val shuffleStatus = shuffleStatuses.get(dep.shuffleId).orNull + if (shuffleStatus != null) { + shuffleStatus.withMapStatuses { statuses => + if (startMapIndex < endMapIndex && (startMapIndex >= 0 && endMapIndex < statuses.length)) { + val statusesPicked = statuses.slice(startMapIndex, endMapIndex).filter(_ != null) + statusesPicked.map(_.location.host).toSeq + } else { + Nil + } + } + } else { + Nil + } + } + + def incrementEpoch(): Unit = { epochLock.synchronized { epoch += 1 logDebug("Increasing epoch to " + epoch) @@ -645,20 +735,43 @@ private[spark] class MapOutputTrackerMaster( // Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result. // This method is only called in local-mode. - def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) - : Iterator[(BlockManagerId, Seq[(BlockId, Long)])] = { + def getMapSizesByExecutorId( + shuffleId: Int, + startPartition: Int, + endPartition: Int) + : Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition") shuffleStatuses.get(shuffleId) match { case Some (shuffleStatus) => shuffleStatus.withMapStatuses { statuses => - MapOutputTracker.convertMapStatuses(shuffleId, startPartition, endPartition, statuses) + MapOutputTracker.convertMapStatuses( + shuffleId, startPartition, endPartition, statuses, 0, shuffleStatus.mapStatuses.length) + } + case None => + Iterator.empty + } + } + + override def getMapSizesByRange( + shuffleId: Int, + startMapIndex: Int, + endMapIndex: Int, + startPartition: Int, + endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { + logDebug(s"Fetching outputs for shuffle $shuffleId, mappers $startMapIndex-$endMapIndex" + + s"partitions $startPartition-$endPartition") + shuffleStatuses.get(shuffleId) match { 
+ case Some(shuffleStatus) => + shuffleStatus.withMapStatuses { statuses => + MapOutputTracker.convertMapStatuses( + shuffleId, startPartition, endPartition, statuses, startMapIndex, endMapIndex) } case None => Iterator.empty } } - override def stop() { + override def stop(): Unit = { mapOutputRequests.offer(PoisonPill) threadpool.shutdown() sendTracker(StopMapOutputTracker) @@ -685,12 +798,36 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr private val fetchingLock = new KeyLock[Int] // Get blocks sizes by executor Id. Note that zero-sized blocks are excluded in the result. - override def getMapSizesByExecutorId(shuffleId: Int, startPartition: Int, endPartition: Int) - : Iterator[(BlockManagerId, Seq[(BlockId, Long)])] = { + override def getMapSizesByExecutorId( + shuffleId: Int, + startPartition: Int, + endPartition: Int) + : Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { logDebug(s"Fetching outputs for shuffle $shuffleId, partitions $startPartition-$endPartition") - val statuses = getStatuses(shuffleId) + val statuses = getStatuses(shuffleId, conf) + try { + MapOutputTracker.convertMapStatuses( + shuffleId, startPartition, endPartition, statuses, 0, statuses.length) + } catch { + case e: MetadataFetchFailedException => + // We experienced a fetch failure so our mapStatuses cache is outdated; clear it: + mapStatuses.clear() + throw e + } + } + + override def getMapSizesByRange( + shuffleId: Int, + startMapIndex: Int, + endMapIndex: Int, + startPartition: Int, + endPartition: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { + logDebug(s"Fetching outputs for shuffle $shuffleId, mappers $startMapIndex-$endMapIndex" + + s"partitions $startPartition-$endPartition") + val statuses = getStatuses(shuffleId, conf) try { - MapOutputTracker.convertMapStatuses(shuffleId, startPartition, endPartition, statuses) + MapOutputTracker.convertMapStatuses( + shuffleId, startPartition, endPartition, statuses, startMapIndex, 
endMapIndex) } catch { case e: MetadataFetchFailedException => // We experienced a fetch failure so our mapStatuses cache is outdated; clear it: @@ -705,7 +842,7 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr * * (It would be nice to remove this restriction in the future.) */ - private def getStatuses(shuffleId: Int): Array[MapStatus] = { + private def getStatuses(shuffleId: Int, conf: SparkConf): Array[MapStatus] = { val statuses = mapStatuses.get(shuffleId).orNull if (statuses == null) { logInfo("Don't have map outputs for shuffle " + shuffleId + ", fetching them") @@ -715,7 +852,7 @@ private[spark] class MapOutputTrackerWorker(conf: SparkConf) extends MapOutputTr if (fetchedStatuses == null) { logInfo("Doing the fetch; tracker endpoint = " + trackerEndpoint) val fetchedBytes = askTracker[Array[Byte]](GetMapOutputStatuses(shuffleId)) - fetchedStatuses = MapOutputTracker.deserializeMapStatuses(fetchedBytes) + fetchedStatuses = MapOutputTracker.deserializeMapStatuses(fetchedBytes, conf) logInfo("Got the output locations") mapStatuses.put(shuffleId, fetchedStatuses) } @@ -757,13 +894,22 @@ private[spark] object MapOutputTracker extends Logging { private val BROADCAST = 1 // Serialize an array of map output locations into an efficient byte format so that we can send - // it to reduce tasks. We do this by compressing the serialized bytes using GZIP. They will + // it to reduce tasks. We do this by compressing the serialized bytes using Zstd. They will // generally be pretty compressible because many map outputs will be on the same hostname. 
- def serializeMapStatuses(statuses: Array[MapStatus], broadcastManager: BroadcastManager, - isLocal: Boolean, minBroadcastSize: Int): (Array[Byte], Broadcast[Array[Byte]]) = { - val out = new ByteArrayOutputStream + def serializeMapStatuses( + statuses: Array[MapStatus], + broadcastManager: BroadcastManager, + isLocal: Boolean, + minBroadcastSize: Int, + conf: SparkConf): (Array[Byte], Broadcast[Array[Byte]]) = { + // Using `org.apache.commons.io.output.ByteArrayOutputStream` instead of the standard one + // This implementation doesn't reallocate the whole memory block but allocates + // additional buffers. This way no buffers need to be garbage collected and + // the contents don't have to be copied to the new buffer. + val out = new ApacheByteArrayOutputStream() out.write(DIRECT) - val objOut = new ObjectOutputStream(new GZIPOutputStream(out)) + val codec = CompressionCodec.createCodec(conf, conf.get(MAP_STATUS_COMPRESSION_CODEC)) + val objOut = new ObjectOutputStream(codec.compressedOutputStream(out)) Utils.tryWithSafeFinally { // Since statuses can be modified in parallel, sync on it statuses.synchronized { @@ -780,9 +926,12 @@ private[spark] object MapOutputTracker extends Logging { // toByteArray creates copy, so we can reuse out out.reset() out.write(BROADCAST) - val oos = new ObjectOutputStream(new GZIPOutputStream(out)) - oos.writeObject(bcast) - oos.close() + val oos = new ObjectOutputStream(codec.compressedOutputStream(out)) + Utils.tryWithSafeFinally { + oos.writeObject(bcast) + } { + oos.close() + } val outArr = out.toByteArray logInfo("Broadcast mapstatuses size = " + outArr.length + ", actual size = " + arr.length) (outArr, bcast) @@ -792,11 +941,15 @@ private[spark] object MapOutputTracker extends Logging { } // Opposite of serializeMapStatuses. 
- def deserializeMapStatuses(bytes: Array[Byte]): Array[MapStatus] = { + def deserializeMapStatuses(bytes: Array[Byte], conf: SparkConf): Array[MapStatus] = { assert (bytes.length > 0) def deserializeObject(arr: Array[Byte], off: Int, len: Int): AnyRef = { - val objIn = new ObjectInputStream(new GZIPInputStream( + val codec = CompressionCodec.createCodec(conf, conf.get(MAP_STATUS_COMPRESSION_CODEC)) + // The ZStd codec is wrapped in a `BufferedInputStream`, which avoids the excessive + // overhead of JNI calls when decompressing a small amount of data for each element + // of `MapStatuses` + val objIn = new ObjectInputStream(codec.compressedInputStream( new ByteArrayInputStream(arr, off, len))) Utils.tryWithSafeFinally { objIn.readObject() @@ -832,19 +985,24 @@ private[spark] object MapOutputTracker extends Logging { * @param shuffleId Identifier for the shuffle * @param startPartition Start of map output partition ID range (included in range) * @param endPartition End of map output partition ID range (excluded from range) - * @param statuses List of map statuses, indexed by map ID. + * @param statuses List of map statuses, indexed by map partition index. + * @param startMapIndex Start map index. + * @param endMapIndex End map index. * @return A sequence of 2-item tuples, where the first item in the tuple is a BlockManagerId, - * and the second item is a sequence of (shuffle block ID, shuffle block size) tuples - * describing the shuffle blocks that are stored at that block manager. + * and the second item is a sequence of (shuffle block id, shuffle block size, map index) + * tuples describing the shuffle blocks that are stored at that block manager.
*/ def convertMapStatuses( shuffleId: Int, startPartition: Int, endPartition: Int, - statuses: Array[MapStatus]): Iterator[(BlockManagerId, Seq[(BlockId, Long)])] = { + statuses: Array[MapStatus], + startMapIndex : Int, + endMapIndex: Int): Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])] = { assert (statuses != null) - val splitsByAddress = new HashMap[BlockManagerId, ListBuffer[(BlockId, Long)]] - for ((status, mapId) <- statuses.iterator.zipWithIndex) { + val splitsByAddress = new HashMap[BlockManagerId, ListBuffer[(BlockId, Long, Int)]] + val iter = statuses.iterator.zipWithIndex + for ((status, mapIndex) <- iter.slice(startMapIndex, endMapIndex)) { if (status == null) { val errorMessage = s"Missing an output location for shuffle $shuffleId" logError(errorMessage) @@ -854,11 +1012,12 @@ private[spark] object MapOutputTracker extends Logging { val size = status.getSizeForBlock(part) if (size != 0) { splitsByAddress.getOrElseUpdate(status.location, ListBuffer()) += - ((ShuffleBlockId(shuffleId, mapId, part), size)) + ((ShuffleBlockId(shuffleId, status.mapId, part), size, mapIndex)) } } } } + splitsByAddress.iterator } } diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 77db0f5d0eaa7..d061627bea69c 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -108,12 +108,12 @@ private[spark] class SecurityManager( * Admin acls should be set before the view or modify acls. If you modify the admin * acls you should also set the view and modify acls again to pick up the changes. 
*/ - def setViewAcls(defaultUsers: Set[String], allowedUsers: Seq[String]) { + def setViewAcls(defaultUsers: Set[String], allowedUsers: Seq[String]): Unit = { viewAcls = adminAcls ++ defaultUsers ++ allowedUsers logInfo("Changing view acls to: " + viewAcls.mkString(",")) } - def setViewAcls(defaultUser: String, allowedUsers: Seq[String]) { + def setViewAcls(defaultUser: String, allowedUsers: Seq[String]): Unit = { setViewAcls(Set[String](defaultUser), allowedUsers) } @@ -121,7 +121,7 @@ private[spark] class SecurityManager( * Admin acls groups should be set before the view or modify acls groups. If you modify the admin * acls groups you should also set the view and modify acls groups again to pick up the changes. */ - def setViewAclsGroups(allowedUserGroups: Seq[String]) { + def setViewAclsGroups(allowedUserGroups: Seq[String]): Unit = { viewAclsGroups = adminAclsGroups ++ allowedUserGroups logInfo("Changing view acls groups to: " + viewAclsGroups.mkString(",")) } @@ -149,7 +149,7 @@ private[spark] class SecurityManager( * Admin acls should be set before the view or modify acls. If you modify the admin * acls you should also set the view and modify acls again to pick up the changes. */ - def setModifyAcls(defaultUsers: Set[String], allowedUsers: Seq[String]) { + def setModifyAcls(defaultUsers: Set[String], allowedUsers: Seq[String]): Unit = { modifyAcls = adminAcls ++ defaultUsers ++ allowedUsers logInfo("Changing modify acls to: " + modifyAcls.mkString(",")) } @@ -158,7 +158,7 @@ private[spark] class SecurityManager( * Admin acls groups should be set before the view or modify acls groups. If you modify the admin * acls groups you should also set the view and modify acls groups again to pick up the changes. 
*/ - def setModifyAclsGroups(allowedUserGroups: Seq[String]) { + def setModifyAclsGroups(allowedUserGroups: Seq[String]): Unit = { modifyAclsGroups = adminAclsGroups ++ allowedUserGroups logInfo("Changing modify acls groups to: " + modifyAclsGroups.mkString(",")) } @@ -186,7 +186,7 @@ private[spark] class SecurityManager( * Admin acls should be set before the view or modify acls. If you modify the admin * acls you should also set the view and modify acls again to pick up the changes. */ - def setAdminAcls(adminUsers: Seq[String]) { + def setAdminAcls(adminUsers: Seq[String]): Unit = { adminAcls = adminUsers.toSet logInfo("Changing admin acls to: " + adminAcls.mkString(",")) } @@ -195,12 +195,12 @@ private[spark] class SecurityManager( * Admin acls groups should be set before the view or modify acls groups. If you modify the admin * acls groups you should also set the view and modify acls groups again to pick up the changes. */ - def setAdminAclsGroups(adminUserGroups: Seq[String]) { + def setAdminAclsGroups(adminUserGroups: Seq[String]): Unit = { adminAclsGroups = adminUserGroups.toSet logInfo("Changing admin acls groups to: " + adminAclsGroups.mkString(",")) } - def setAcls(aclSetting: Boolean) { + def setAcls(aclSetting: Boolean): Unit = { aclsOn = aclSetting logInfo("Changing acls enabled to: " + aclsOn) } diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 24be54ec91828..40915e3904f7e 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -504,7 +504,7 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria * Checks for illegal or deprecated config settings. Throws an exception for the former. Not * idempotent - may mutate this conf object to convert deprecated settings to supported ones. 
*/ - private[spark] def validateSettings() { + private[spark] def validateSettings(): Unit = { if (contains("spark.local.dir")) { val msg = "Note that spark.local.dir will be overridden by the value set by " + "the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS" + @@ -548,23 +548,6 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria } } - if (contains("spark.master") && get("spark.master").startsWith("yarn-")) { - val warning = s"spark.master ${get("spark.master")} is deprecated in Spark 2.0+, please " + - "instead use \"yarn\" with specified deploy mode." - - get("spark.master") match { - case "yarn-cluster" => - logWarning(warning) - set("spark.master", "yarn") - set(SUBMIT_DEPLOY_MODE, "cluster") - case "yarn-client" => - logWarning(warning) - set("spark.master", "yarn") - set(SUBMIT_DEPLOY_MODE, "client") - case _ => // Any other unexpected master will be checked when creating scheduler backend. - } - } - if (contains(SUBMIT_DEPLOY_MODE)) { get(SUBMIT_DEPLOY_MODE) match { case "cluster" | "client" => @@ -636,7 +619,9 @@ private[spark] object SparkConf extends Logging { "Not used anymore. Please use spark.shuffle.service.index.cache.size"), DeprecatedConfig("spark.yarn.credentials.file.retention.count", "2.4.0", "Not used anymore."), DeprecatedConfig("spark.yarn.credentials.file.retention.days", "2.4.0", "Not used anymore."), - DeprecatedConfig("spark.yarn.services", "3.0.0", "Feature no longer available.") + DeprecatedConfig("spark.yarn.services", "3.0.0", "Feature no longer available."), + DeprecatedConfig("spark.executor.plugins", "3.0.0", + "Feature replaced with new plugin API. 
See Monitoring documentation.") ) Map(configs.map { cfg => (cfg.key -> cfg) } : _*) @@ -699,7 +684,8 @@ private[spark] object SparkConf extends Logging { "spark.yarn.jars" -> Seq( AlternateConfig("spark.yarn.jar", "2.0")), MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM.key -> Seq( - AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3")), + AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3"), + AlternateConfig("spark.maxRemoteBlockSizeFetchToMem", "3.0")), LISTENER_BUS_EVENT_QUEUE_CAPACITY.key -> Seq( AlternateConfig("spark.scheduler.listenerbus.eventqueue.size", "2.3")), DRIVER_MEMORY_OVERHEAD.key -> Seq( diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 396d712bd739c..a47136ea36736 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -25,13 +25,13 @@ import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicReferenc import scala.collection.JavaConverters._ import scala.collection.Map +import scala.collection.immutable import scala.collection.mutable.HashMap import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} import scala.util.control.NonFatal import com.google.common.collect.MapMaker -import org.apache.commons.lang3.SerializationUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{ArrayWritable, BooleanWritable, BytesWritable, DoubleWritable, FloatWritable, IntWritable, LongWritable, NullWritable, Text, Writable} @@ -43,22 +43,25 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil} import org.apache.spark.deploy.StandaloneResourceUtils._ -import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.executor.{ExecutorMetrics, ExecutorMetricsSource} import 
org.apache.spark.input.{FixedLengthBinaryInputFormat, PortableDataStream, StreamInputFormat, WholeTextFileInputFormat} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Tests._ import org.apache.spark.internal.config.UI._ +import org.apache.spark.internal.plugin.PluginContainer import org.apache.spark.io.CompressionCodec import org.apache.spark.metrics.source.JVMCPUSource import org.apache.spark.partial.{ApproximateEvaluator, PartialResult} import org.apache.spark.rdd._ -import org.apache.spark.resource.{ResourceID, ResourceInformation} +import org.apache.spark.resource._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend import org.apache.spark.scheduler.local.LocalSchedulerBackend +import org.apache.spark.shuffle.ShuffleDataIOUtils +import org.apache.spark.shuffle.api.ShuffleDriverComponents import org.apache.spark.status.{AppStatusSource, AppStatusStore} import org.apache.spark.status.api.v1.ThreadStackTrace import org.apache.spark.storage._ @@ -217,7 +220,10 @@ class SparkContext(config: SparkConf) extends Logging { private var _shutdownHookRef: AnyRef = _ private var _statusStore: AppStatusStore = _ private var _heartbeater: Heartbeater = _ - private var _resources: scala.collection.immutable.Map[String, ResourceInformation] = _ + private var _resources: immutable.Map[String, ResourceInformation] = _ + private var _shuffleDriverComponents: ShuffleDriverComponents = _ + private var _plugins: Option[PluginContainer] = None + private var _resourceProfileManager: ResourceProfileManager = _ /* ------------------------------------------------------------------------------------- * | Accessors and public fields. 
These provide access to the internal state of the | @@ -320,6 +326,8 @@ class SparkContext(config: SparkConf) extends Logging { _dagScheduler = ds } + private[spark] def shuffleDriverComponents: ShuffleDriverComponents = _shuffleDriverComponents + /** * A unique identifier for the Spark application. * Its format depends on the scheduler implementation. @@ -337,6 +345,8 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] def executorAllocationManager: Option[ExecutorAllocationManager] = _executorAllocationManager + private[spark] def resourceProfileManager: ResourceProfileManager = _resourceProfileManager + private[spark] def cleaner: Option[ContextCleaner] = _cleaner private[spark] var checkpointDir: Option[String] = None @@ -346,7 +356,7 @@ class SparkContext(config: SparkConf) extends Logging { override protected def childValue(parent: Properties): Properties = { // Note: make a clone such that changes in the parent properties aren't reflected in // the those of the children threads, which has confusing semantics (SPARK-10563). - SerializationUtils.clone(parent) + Utils.cloneProperties(parent) } override protected def initialValue(): Properties = new Properties() } @@ -367,7 +377,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param logLevel The desired log level as a string. 
* Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN */ - def setLogLevel(logLevel: String) { + def setLogLevel(logLevel: String): Unit = { // let's allow lowercase or mixed case too val upperCased = logLevel.toUpperCase(Locale.ROOT) require(SparkContext.VALID_LOG_LEVELS.contains(upperCased), @@ -438,13 +448,14 @@ class SparkContext(config: SparkConf) extends Logging { _eventLogCodec = { val compress = _conf.get(EVENT_LOG_COMPRESS) if (compress && isEventLogEnabled) { - Some(CompressionCodec.getCodecName(_conf)).map(CompressionCodec.getShortName) + Some(_conf.get(EVENT_LOG_COMPRESSION_CODEC)).map(CompressionCodec.getShortName) } else { None } } _listenerBus = new LiveListenerBus(_conf) + _resourceProfileManager = new ResourceProfileManager(_conf) // Initialize the app status store and listener before SparkEnv is created so that it gets // all events. @@ -525,11 +536,19 @@ class SparkContext(config: SparkConf) extends Logging { executorEnvs ++= _conf.getExecutorEnv executorEnvs("SPARK_USER") = sparkUser + _shuffleDriverComponents = ShuffleDataIOUtils.loadShuffleDataIO(config).driver() + _shuffleDriverComponents.initializeApplication().asScala.foreach { case (k, v) => + _conf.set(ShuffleDataIOUtils.SHUFFLE_SPARK_CONF_PREFIX + k, v) + } + // We need to register "HeartbeatReceiver" before "createTaskScheduler" because Executor will // retrieve "HeartbeatReceiver" in the constructor. (SPARK-6640) _heartbeatReceiver = env.rpcEnv.setupEndpoint( HeartbeatReceiver.ENDPOINT_NAME, new HeartbeatReceiver(this)) + // Initialize any plugins before the task scheduler is initialized. 
+ _plugins = PluginContainer(this, _resources.asJava) + // Create and start the scheduler val (sched, ts) = SparkContext.createTaskScheduler(this, master, deployMode) _schedulerBackend = sched @@ -537,9 +556,16 @@ class SparkContext(config: SparkConf) extends Logging { _dagScheduler = new DAGScheduler(this) _heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet) + val _executorMetricsSource = + if (_conf.get(METRICS_EXECUTORMETRICS_SOURCE_ENABLED)) { + Some(new ExecutorMetricsSource) + } else { + None + } + // create and start the heartbeater for collecting memory metrics _heartbeater = new Heartbeater( - () => SparkContext.this.reportHeartBeat(), + () => SparkContext.this.reportHeartBeat(_executorMetricsSource), "driver-heartbeater", conf.get(EXECUTOR_HEARTBEAT_INTERVAL)) _heartbeater.start() @@ -559,7 +585,7 @@ class SparkContext(config: SparkConf) extends Logging { // The metrics system for Driver need to be set spark.app.id to app ID. // So it should start after we get app ID from the task scheduler and set spark.app.id. - _env.metricsSystem.start() + _env.metricsSystem.start(_conf.get(METRICS_STATIC_SOURCES_ENABLED)) // Attach the driver metrics servlet handler to the web ui after the metrics system is started. 
_env.metricsSystem.getServletHandlers.foreach(handler => ui.foreach(_.attachHandler(handler))) @@ -577,7 +603,7 @@ class SparkContext(config: SparkConf) extends Logging { _cleaner = if (_conf.get(CLEANER_REFERENCE_TRACKING)) { - Some(new ContextCleaner(this)) + Some(new ContextCleaner(this, _shuffleDriverComponents)) } else { None } @@ -590,7 +616,7 @@ class SparkContext(config: SparkConf) extends Logging { case b: ExecutorAllocationClient => Some(new ExecutorAllocationManager( schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf, - cleaner = cleaner)) + cleaner = cleaner, resourceProfileManager = resourceProfileManager)) case _ => None } @@ -608,10 +634,12 @@ class SparkContext(config: SparkConf) extends Logging { _env.metricsSystem.registerSource(_dagScheduler.metricsSource) _env.metricsSystem.registerSource(new BlockManagerSource(_env.blockManager)) _env.metricsSystem.registerSource(new JVMCPUSource()) + _executorMetricsSource.foreach(_.register(_env.metricsSystem)) _executorAllocationManager.foreach { e => _env.metricsSystem.registerSource(e.executorAllocationManagerSource) } appStatusSource.foreach(_env.metricsSystem.registerSource(_)) + _plugins.foreach(_.registerMetrics(applicationId)) // Make sure the context is stopped if the user forgets about it. This avoids leaving // unfinished event logs around after the JVM exits cleanly. It doesn't help if the JVM // is killed, though. @@ -662,7 +690,7 @@ class SparkContext(config: SparkConf) extends Logging { private[spark] def getLocalProperties: Properties = localProperties.get() - private[spark] def setLocalProperties(props: Properties) { + private[spark] def setLocalProperties(props: Properties): Unit = { localProperties.set(props) } @@ -677,7 +705,7 @@ class SparkContext(config: SparkConf) extends Logging { * implementation of thread pools have worker threads spawn other worker threads. * As a result, local properties may propagate unpredictably. 
*/ - def setLocalProperty(key: String, value: String) { + def setLocalProperty(key: String, value: String): Unit = { if (value == null) { localProperties.get.remove(key) } else { @@ -693,7 +721,7 @@ class SparkContext(config: SparkConf) extends Logging { Option(localProperties.get).map(_.getProperty(key)).orNull /** Set a human readable description of the current job. */ - def setJobDescription(value: String) { + def setJobDescription(value: String): Unit = { setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, value) } @@ -721,7 +749,8 @@ class SparkContext(config: SparkConf) extends Logging { * are actually stopped in a timely manner, but is off by default due to HDFS-1208, where HDFS * may respond to Thread.interrupt() by marking nodes as dead. */ - def setJobGroup(groupId: String, description: String, interruptOnCancel: Boolean = false) { + def setJobGroup(groupId: String, + description: String, interruptOnCancel: Boolean = false): Unit = { setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, description) setLocalProperty(SparkContext.SPARK_JOB_GROUP_ID, groupId) // Note: Specifying interruptOnCancel in setJobGroup (rather than cancelJobGroup) avoids @@ -732,7 +761,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** Clear the current thread's job group ID and its description. 
*/ - def clearJobGroup() { + def clearJobGroup(): Unit = { setLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION, null) setLocalProperty(SparkContext.SPARK_JOB_GROUP_ID, null) setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, null) @@ -1509,17 +1538,17 @@ class SparkContext(config: SparkConf) extends Logging { */ def addFile(path: String, recursive: Boolean): Unit = { val uri = new Path(path).toUri - val schemeCorrectedPath = uri.getScheme match { - case null => new File(path).getCanonicalFile.toURI.toString + val schemeCorrectedURI = uri.getScheme match { + case null => new File(path).getCanonicalFile.toURI case "local" => logWarning("File with 'local' scheme is not supported to add to file server, since " + "it is already available on every node.") return - case _ => path + case _ => uri } - val hadoopPath = new Path(schemeCorrectedPath) - val scheme = new URI(schemeCorrectedPath).getScheme + val hadoopPath = new Path(schemeCorrectedURI) + val scheme = schemeCorrectedURI.getScheme if (!Array("http", "https", "ftp").contains(scheme)) { val fs = hadoopPath.getFileSystem(hadoopConfiguration) val isDir = fs.getFileStatus(hadoopPath).isDirectory @@ -1539,7 +1568,11 @@ class SparkContext(config: SparkConf) extends Logging { val key = if (!isLocal && scheme == "file") { env.rpcEnv.fileServer.addFile(new File(uri.getPath)) } else { - schemeCorrectedPath + if (uri.getScheme == null) { + schemeCorrectedURI.toString + } else { + path + } } val timestamp = System.currentTimeMillis if (addedFiles.putIfAbsent(key, timestamp).isEmpty) { @@ -1560,7 +1593,7 @@ class SparkContext(config: SparkConf) extends Logging { * Register a listener to receive up-calls from events that happen during execution. 
*/ @DeveloperApi - def addSparkListener(listener: SparkListenerInterface) { + def addSparkListener(listener: SparkListenerInterface): Unit = { listenerBus.addToSharedQueue(listener) } @@ -1594,7 +1627,7 @@ class SparkContext(config: SparkConf) extends Logging { /** * Update the cluster manager on our scheduling needs. Three bits of information are included - * to help it make decisions. + * to help it make decisions. This applies to the default ResourceProfile. * @param numExecutors The total number of executors we'd like to have. The cluster manager * shouldn't kill any running executor to reach this number, but, * if all existing executors were to die, this is the number of executors @@ -1610,11 +1643,16 @@ class SparkContext(config: SparkConf) extends Logging { def requestTotalExecutors( numExecutors: Int, localityAwareTasks: Int, - hostToLocalTaskCount: scala.collection.immutable.Map[String, Int] + hostToLocalTaskCount: immutable.Map[String, Int] ): Boolean = { schedulerBackend match { case b: ExecutorAllocationClient => - b.requestTotalExecutors(numExecutors, localityAwareTasks, hostToLocalTaskCount) + // this is being applied to the default resource profile, would need to add api to support + // others + val defaultProfId = resourceProfileManager.defaultResourceProfile.id + b.requestTotalExecutors(immutable.Map(defaultProfId-> numExecutors), + immutable.Map(localityAwareTasks -> defaultProfId), + immutable.Map(defaultProfId -> hostToLocalTaskCount)) case _ => logWarning("Requesting executors is not supported by current scheduler.") false @@ -1789,14 +1827,14 @@ class SparkContext(config: SparkConf) extends Logging { /** * Register an RDD to be persisted in memory and/or disk storage */ - private[spark] def persistRDD(rdd: RDD[_]) { + private[spark] def persistRDD(rdd: RDD[_]): Unit = { persistentRdds(rdd.id) = rdd } /** * Unpersist an RDD from memory and/or disk storage */ - private[spark] def unpersistRDD(rddId: Int, blocking: Boolean) { + private[spark] def 
unpersistRDD(rddId: Int, blocking: Boolean): Unit = { env.blockManager.master.removeRdd(rddId, blocking) persistentRdds.remove(rddId) listenerBus.post(SparkListenerUnpersistRDD(rddId)) @@ -1812,7 +1850,7 @@ class SparkContext(config: SparkConf) extends Logging { * * @note A path can be added only once. Subsequent additions of the same path are ignored. */ - def addJar(path: String) { + def addJar(path: String): Unit = { def addLocalJarFile(file: File): String = { try { if (!file.exists()) { @@ -1832,7 +1870,7 @@ class SparkContext(config: SparkConf) extends Logging { def checkRemoteJarFile(path: String): String = { val hadoopPath = new Path(path) - val scheme = new URI(path).getScheme + val scheme = hadoopPath.toUri.getScheme if (!Array("http", "https", "ftp").contains(scheme)) { try { val fs = hadoopPath.getFileSystem(hadoopConfiguration) @@ -1854,21 +1892,21 @@ class SparkContext(config: SparkConf) extends Logging { } } - if (path == null) { - logWarning("null specified as parameter to addJar") + if (path == null || path.isEmpty) { + logWarning("null or empty path specified as parameter to addJar") } else { val key = if (path.contains("\\")) { // For local paths with backslashes on Windows, URI throws an exception addLocalJarFile(new File(path)) } else { - val uri = new URI(path) + val uri = new Path(path).toUri // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies Utils.validateURL(uri) uri.getScheme match { // A JAR file which exists only on the driver node case null => // SPARK-22585 path without schema is not url encoded - addLocalJarFile(new File(uri.getRawPath)) + addLocalJarFile(new File(uri.getPath)) // A JAR file which exists only on the driver node case "file" => addLocalJarFile(new File(uri.getPath)) // A JAR file which exists locally on every worker node @@ -1966,6 +2004,9 @@ class SparkContext(config: SparkConf) extends Logging { _listenerBusStarted = false } } + Utils.tryLogNonFatalError { + 
_plugins.foreach(_.shutdown()) + } Utils.tryLogNonFatalError { _eventLogger.foreach(_.stop()) } @@ -1975,6 +2016,11 @@ class SparkContext(config: SparkConf) extends Logging { } _heartbeater = null } + if (_shuffleDriverComponents != null) { + Utils.tryLogNonFatalError { + _shuffleDriverComponents.cleanupApplication() + } + } if (env != null && _heartbeatReceiver != null) { Utils.tryLogNonFatalError { env.rpcEnv.stop(_heartbeatReceiver) @@ -2000,6 +2046,7 @@ class SparkContext(config: SparkConf) extends Logging { // Clear this `InheritableThreadLocal`, or it will still be inherited in child threads even this // `SparkContext` is stopped. localProperties.remove() + ResourceProfile.clearDefaultProfile() // Unset YARN mode system env variable, to allow switching between cluster types. SparkContext.clearActiveContext() logInfo("Successfully stopped SparkContext") @@ -2019,7 +2066,7 @@ class SparkContext(config: SparkConf) extends Logging { * Set the thread-local property for overriding the call sites * of actions and RDDs. */ - def setCallSite(shortCallSite: String) { + def setCallSite(shortCallSite: String): Unit = { setLocalProperty(CallSite.SHORT_FORM, shortCallSite) } @@ -2027,7 +2074,7 @@ class SparkContext(config: SparkConf) extends Logging { * Set the thread-local property for overriding the call sites * of actions and RDDs. */ - private[spark] def setCallSite(callSite: CallSite) { + private[spark] def setCallSite(callSite: CallSite): Unit = { setLocalProperty(CallSite.SHORT_FORM, callSite.shortForm) setLocalProperty(CallSite.LONG_FORM, callSite.longForm) } @@ -2036,7 +2083,7 @@ class SparkContext(config: SparkConf) extends Logging { * Clear the thread-local property for overriding the call sites * of actions and RDDs. 
*/ - def clearCallSite() { + def clearCallSite(): Unit = { setLocalProperty(CallSite.SHORT_FORM, null) setLocalProperty(CallSite.LONG_FORM, null) } @@ -2156,8 +2203,7 @@ class SparkContext(config: SparkConf) extends Logging { def runJob[T, U: ClassTag]( rdd: RDD[T], processPartition: (TaskContext, Iterator[T]) => U, - resultHandler: (Int, U) => Unit) - { + resultHandler: (Int, U) => Unit): Unit = { runJob[T, U](rdd, processPartition, 0 until rdd.partitions.length, resultHandler) } @@ -2171,8 +2217,7 @@ class SparkContext(config: SparkConf) extends Logging { def runJob[T, U: ClassTag]( rdd: RDD[T], processPartition: Iterator[T] => U, - resultHandler: (Int, U) => Unit) - { + resultHandler: (Int, U) => Unit): Unit = { val processFunc = (context: TaskContext, iter: Iterator[T]) => processPartition(iter) runJob[T, U](rdd, processFunc, 0 until rdd.partitions.length, resultHandler) } @@ -2257,13 +2302,13 @@ class SparkContext(config: SparkConf) extends Logging { * Cancel active jobs for the specified group. See `org.apache.spark.SparkContext.setJobGroup` * for more information. */ - def cancelJobGroup(groupId: String) { + def cancelJobGroup(groupId: String): Unit = { assertNotStopped() dagScheduler.cancelJobGroup(groupId) } /** Cancel all jobs that have been scheduled or are running. */ - def cancelAllJobs() { + def cancelAllJobs(): Unit = { assertNotStopped() dagScheduler.cancelAllJobs() } @@ -2351,7 +2396,7 @@ class SparkContext(config: SparkConf) extends Logging { * @param directory path to the directory where checkpoint files will be stored * (must be HDFS path if running in cluster) */ - def setCheckpointDir(directory: String) { + def setCheckpointDir(directory: String): Unit = { // If we are running on a cluster, log a warning if the directory is local. 
// Otherwise, the driver may attempt to reconstruct the checkpointed RDD from @@ -2423,7 +2468,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** Post the application start event */ - private def postApplicationStart() { + private def postApplicationStart(): Unit = { // Note: this code assumes that the task scheduler has been initialized and has contacted // the cluster manager to get an application ID (in case the cluster manager provides one). listenerBus.post(SparkListenerApplicationStart(appName, Some(applicationId), @@ -2433,12 +2478,12 @@ class SparkContext(config: SparkConf) extends Logging { } /** Post the application end event */ - private def postApplicationEnd() { + private def postApplicationEnd(): Unit = { listenerBus.post(SparkListenerApplicationEnd(System.currentTimeMillis)) } /** Post the environment update event once the task scheduler is ready */ - private def postEnvironmentUpdate() { + private def postEnvironmentUpdate(): Unit = { if (taskScheduler != null) { val schedulingMode = getSchedulingMode.toString val addedJarPaths = addedJars.keys.toSeq @@ -2451,8 +2496,10 @@ class SparkContext(config: SparkConf) extends Logging { } /** Reports heartbeat metrics for the driver. */ - private def reportHeartBeat(): Unit = { + private def reportHeartBeat(executorMetricsSource: Option[ExecutorMetricsSource]): Unit = { val currentMetrics = ExecutorMetrics.getCurrentMetrics(env.memoryManager) + executorMetricsSource.foreach(_.updateMetricsSnapshot(currentMetrics)) + val driverUpdates = new HashMap[(Int, Int), ExecutorMetrics] // In the driver, we do not track per-stage metrics, so use a dummy stage for the key driverUpdates.put(EventLoggingListener.DRIVER_STAGE_KEY, new ExecutorMetrics(currentMetrics)) @@ -2735,75 +2782,34 @@ object SparkContext extends Logging { // When running locally, don't try to re-execute tasks on failure. val MAX_LOCAL_TASK_FAILURES = 1 - // Ensure that executor's resources satisfies one or more tasks requirement. 
- def checkResourcesPerTask(clusterMode: Boolean, executorCores: Option[Int]): Unit = { + // Ensure that default executor's resources satisfies one or more tasks requirement. + // This function is for cluster managers that don't set the executor cores config, for + // others its checked in ResourceProfile. + def checkResourcesPerTask(executorCores: Int): Unit = { val taskCores = sc.conf.get(CPUS_PER_TASK) - val execCores = if (clusterMode) { - executorCores.getOrElse(sc.conf.get(EXECUTOR_CORES)) - } else { - executorCores.get - } - - // Number of cores per executor must meet at least one task requirement. - if (execCores < taskCores) { - throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " + - s"the task config: ${CPUS_PER_TASK.key} = $taskCores when run on $master.") - } - - // Calculate the max slots each executor can provide based on resources available on each - // executor and resources required by each task. - val taskResourceRequirements = parseResourceRequirements(sc.conf, SPARK_TASK_PREFIX) - val executorResourcesAndAmounts = - parseAllResourceRequests(sc.conf, SPARK_EXECUTOR_PREFIX) - .map(request => (request.id.resourceName, request.amount)).toMap - var numSlots = execCores / taskCores - var limitingResourceName = "CPU" - - taskResourceRequirements.foreach { taskReq => - // Make sure the executor resources were specified through config. - val execAmount = executorResourcesAndAmounts.getOrElse(taskReq.resourceName, - throw new SparkException("The executor resource config: " + - ResourceID(SPARK_EXECUTOR_PREFIX, taskReq.resourceName).amountConf + - " needs to be specified since a task requirement config: " + - ResourceID(SPARK_TASK_PREFIX, taskReq.resourceName).amountConf + - " was specified") - ) - // Make sure the executor resources are large enough to launch at least one task. 
- if (execAmount < taskReq.amount) { - throw new SparkException("The executor resource config: " + - ResourceID(SPARK_EXECUTOR_PREFIX, taskReq.resourceName).amountConf + - s" = $execAmount has to be >= the requested amount in task resource config: " + - ResourceID(SPARK_TASK_PREFIX, taskReq.resourceName).amountConf + - s" = ${taskReq.amount}") - } - // Compare and update the max slots each executor can provide. - val resourceNumSlots = execAmount / taskReq.amount - if (resourceNumSlots < numSlots) { - numSlots = resourceNumSlots - limitingResourceName = taskReq.resourceName - } - } - // There have been checks above to make sure the executor resources were specified and are - // large enough if any task resources were specified. - taskResourceRequirements.foreach { taskReq => - val execAmount = executorResourcesAndAmounts(taskReq.resourceName) - if (taskReq.amount * numSlots < execAmount) { - val message = s"The configuration of resource: ${taskReq.resourceName} " + - s"(exec = ${execAmount}, task = ${taskReq.amount}) will result in wasted " + - s"resources due to resource ${limitingResourceName} limiting the number of " + - s"runnable tasks per executor to: ${numSlots}. Please adjust your configuration." 
- if (Utils.isTesting) { - throw new SparkException(message) - } else { - logWarning(message) - } - } + validateTaskCpusLargeEnough(executorCores, taskCores) + val defaultProf = sc.resourceProfileManager.defaultResourceProfile + // TODO - this is temporary until all of stage level scheduling feature is integrated, + // fail if any other resource limiting due to dynamic allocation and scheduler using + // slots based on cores + val cpuSlots = executorCores/taskCores + val limitingResource = defaultProf.limitingResource(sc.conf) + if (limitingResource.nonEmpty && !limitingResource.equals(ResourceProfile.CPUS) && + defaultProf.maxTasksPerExecutor(sc.conf) < cpuSlots) { + throw new IllegalArgumentException("The number of slots on an executor has to be " + + "limited by the number of cores, otherwise you waste resources and " + + "dynamic allocation doesn't work properly. Your configuration has " + + s"core/task cpu slots = ${cpuSlots} and " + + s"${limitingResource} = " + + s"${defaultProf.maxTasksPerExecutor(sc.conf)}. 
Please adjust your configuration " + + "so that all resources require same number of executor slots.") } + ResourceUtils.warnOnWastedResources(defaultProf, sc.conf, Some(executorCores)) } master match { case "local" => - checkResourcesPerTask(clusterMode = false, Some(1)) + checkResourcesPerTask(1) val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, 1) scheduler.initialize(backend) @@ -2816,7 +2822,7 @@ object SparkContext extends Logging { if (threadCount <= 0) { throw new SparkException(s"Asked to run locally with $threadCount threads") } - checkResourcesPerTask(clusterMode = false, Some(threadCount)) + checkResourcesPerTask(threadCount) val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) @@ -2827,14 +2833,13 @@ object SparkContext extends Logging { // local[*, M] means the number of cores on the computer with M failures // local[N, M] means exactly N threads with M failures val threadCount = if (threads == "*") localCpuCount else threads.toInt - checkResourcesPerTask(clusterMode = false, Some(threadCount)) + checkResourcesPerTask(threadCount) val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true) val backend = new LocalSchedulerBackend(sc.getConf, scheduler, threadCount) scheduler.initialize(backend) (backend, scheduler) case SPARK_REGEX(sparkUrl) => - checkResourcesPerTask(clusterMode = true, None) val scheduler = new TaskSchedulerImpl(sc) val masterUrls = sparkUrl.split(",").map("spark://" + _) val backend = new StandaloneSchedulerBackend(scheduler, sc, masterUrls) @@ -2842,7 +2847,7 @@ object SparkContext extends Logging { (backend, scheduler) case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) => - checkResourcesPerTask(clusterMode = true, Some(coresPerSlave.toInt)) + 
checkResourcesPerTask(coresPerSlave.toInt) // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang. val memoryPerSlaveInt = memoryPerSlave.toInt if (sc.executorMemory > memoryPerSlaveInt) { @@ -2851,6 +2856,14 @@ object SparkContext extends Logging { memoryPerSlaveInt, sc.executorMemory)) } + // For host local mode setting the default of SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED + // to false because this mode is intended to be used for testing and in this case all the + // executors are running on the same host. So if host local reading was enabled here then + // testing of the remote fetching would be secondary as setting this config explicitly to + // false would be required in most of the unit test (despite the fact that remote fetching + // is much more frequent in production). + sc.conf.setIfMissing(SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED, false) + val scheduler = new TaskSchedulerImpl(sc) val localCluster = new LocalSparkCluster( numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf) @@ -2863,7 +2876,6 @@ object SparkContext extends Logging { (backend, scheduler) case masterUrl => - checkResourcesPerTask(clusterMode = true, None) val cm = getClusterManager(masterUrl) match { case Some(clusterMgr) => clusterMgr case None => throw new SparkException("Could not parse Master URL: '" + master + "'") diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 419f0ab065150..8ba1739831803 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -22,10 +22,11 @@ import java.net.Socket import java.util.Locale import scala.collection.JavaConverters._ +import scala.collection.concurrent import scala.collection.mutable import scala.util.Properties -import com.google.common.collect.MapMaker +import com.google.common.cache.CacheBuilder import org.apache.hadoop.conf.Configuration import 
org.apache.spark.annotation.DeveloperApi @@ -70,16 +71,17 @@ class SparkEnv ( val outputCommitCoordinator: OutputCommitCoordinator, val conf: SparkConf) extends Logging { - private[spark] var isStopped = false + @volatile private[spark] var isStopped = false private val pythonWorkers = mutable.HashMap[(String, Map[String, String]), PythonWorkerFactory]() // A general, soft-reference map for metadata needed during HadoopRDD split computation // (e.g., HadoopFileRDD uses this to cache JobConfs and InputFormats). - private[spark] val hadoopJobMetadata = new MapMaker().softValues().makeMap[String, Any]() + private[spark] val hadoopJobMetadata = + CacheBuilder.newBuilder().softValues().build[String, AnyRef]().asMap() private[spark] var driverTmpDir: Option[String] = None - private[spark] def stop() { + private[spark] def stop(): Unit = { if (!isStopped) { isStopped = true @@ -119,7 +121,8 @@ class SparkEnv ( } private[spark] - def destroyPythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) { + def destroyPythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.stopWorker(worker)) @@ -127,7 +130,8 @@ class SparkEnv ( } private[spark] - def releasePythonWorker(pythonExec: String, envVars: Map[String, String], worker: Socket) { + def releasePythonWorker(pythonExec: String, + envVars: Map[String, String], worker: Socket): Unit = { synchronized { val key = (pythonExec, envVars) pythonWorkers.get(key).foreach(_.releaseWorker(worker)) @@ -141,7 +145,7 @@ object SparkEnv extends Logging { private[spark] val driverSystemName = "sparkDriver" private[spark] val executorSystemName = "sparkExecutor" - def set(e: SparkEnv) { + def set(e: SparkEnv): Unit = { env = e } @@ -193,6 +197,7 @@ object SparkEnv extends Logging { private[spark] def createExecutorEnv( conf: SparkConf, executorId: String, + bindAddress: String, hostname: String, numCores: 
Int, ioEncryptionKey: Option[Array[Byte]], @@ -200,7 +205,7 @@ object SparkEnv extends Logging { val env = create( conf, executorId, - hostname, + bindAddress, hostname, None, isLocal, @@ -211,6 +216,17 @@ object SparkEnv extends Logging { env } + private[spark] def createExecutorEnv( + conf: SparkConf, + executorId: String, + hostname: String, + numCores: Int, + ioEncryptionKey: Option[Array[Byte]], + isLocal: Boolean): SparkEnv = { + createExecutorEnv(conf, executorId, hostname, + hostname, numCores, ioEncryptionKey, isLocal) + } + /** * Helper method to create a SparkEnv for a driver or an executor. */ @@ -337,19 +353,26 @@ object SparkEnv extends Logging { None } - val blockManagerMaster = new BlockManagerMaster(registerOrLookupEndpoint( - BlockManagerMaster.DRIVER_ENDPOINT_NAME, - new BlockManagerMasterEndpoint( - rpcEnv, - isLocal, - conf, - listenerBus, - if (conf.get(config.SHUFFLE_SERVICE_FETCH_RDD_ENABLED)) { - externalShuffleClient - } else { - None - })), - conf, isDriver) + // Mapping from block manager id to the block manager's information. 
+ val blockManagerInfo = new concurrent.TrieMap[BlockManagerId, BlockManagerInfo]() + val blockManagerMaster = new BlockManagerMaster( + registerOrLookupEndpoint( + BlockManagerMaster.DRIVER_ENDPOINT_NAME, + new BlockManagerMasterEndpoint( + rpcEnv, + isLocal, + conf, + listenerBus, + if (conf.get(config.SHUFFLE_SERVICE_FETCH_RDD_ENABLED)) { + externalShuffleClient + } else { + None + }, blockManagerInfo)), + registerOrLookupEndpoint( + BlockManagerMaster.DRIVER_HEARTBEAT_ENDPOINT_NAME, + new BlockManagerMasterHeartbeatEndpoint(rpcEnv, isLocal, blockManagerInfo)), + conf, + isDriver) val blockTransferService = new NettyBlockTransferService(conf, securityManager, bindAddress, advertiseAddress, @@ -381,7 +404,7 @@ object SparkEnv extends Logging { conf.set(EXECUTOR_ID, executorId) val ms = MetricsSystem.createMetricsSystem(MetricsSystemInstances.EXECUTOR, conf, securityManager) - ms.start() + ms.start(conf.get(METRICS_STATIC_SOURCES_ENABLED)) ms } diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 2299c54e2624b..fd41facf95c76 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -185,6 +185,14 @@ abstract class TaskContext extends Serializable { @Evolving def resources(): Map[String, ResourceInformation] + /** + * (java-specific) Resources allocated to the task. The key is the resource name and the value + * is information about the resource. Please refer to + * [[org.apache.spark.resource.ResourceInformation]] for specifics. 
+ */ + @Evolving + def resourcesJMap(): java.util.Map[String, ResourceInformation] + @DeveloperApi def taskMetrics(): TaskMetrics diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 516fb95593324..08a58a029528b 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -20,6 +20,7 @@ package org.apache.spark import java.util.Properties import javax.annotation.concurrent.GuardedBy +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import org.apache.spark.executor.TaskMetrics @@ -101,6 +102,10 @@ private[spark] class TaskContextImpl( this } + override def resourcesJMap(): java.util.Map[String, ResourceInformation] = { + resources.asJava + } + @GuardedBy("this") private[spark] override def markTaskFailed(error: Throwable): Unit = synchronized { if (failed) return diff --git a/core/src/main/scala/org/apache/spark/TaskEndReason.scala b/core/src/main/scala/org/apache/spark/TaskEndReason.scala index 19f71a1dec296..b13028f868072 100644 --- a/core/src/main/scala/org/apache/spark/TaskEndReason.scala +++ b/core/src/main/scala/org/apache/spark/TaskEndReason.scala @@ -83,14 +83,15 @@ case object Resubmitted extends TaskFailedReason { case class FetchFailed( bmAddress: BlockManagerId, // Note that bmAddress can be null shuffleId: Int, - mapId: Int, + mapId: Long, + mapIndex: Int, reduceId: Int, message: String) extends TaskFailedReason { override def toErrorString: String = { val bmAddressString = if (bmAddress == null) "null" else bmAddress.toString - s"FetchFailed($bmAddressString, shuffleId=$shuffleId, mapId=$mapId, reduceId=$reduceId, " + - s"message=\n$message\n)" + s"FetchFailed($bmAddressString, shuffleId=$shuffleId, mapIndex=$mapIndex, " + + s"mapId=$mapId, reduceId=$reduceId, message=\n$message\n)" } /** diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/ShowTablesStatement.scala b/core/src/main/scala/org/apache/spark/TaskOutputFileAlreadyExistException.scala similarity index 78% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/ShowTablesStatement.scala rename to core/src/main/scala/org/apache/spark/TaskOutputFileAlreadyExistException.scala index d75c4085a974b..68054625bac21 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/ShowTablesStatement.scala +++ b/core/src/main/scala/org/apache/spark/TaskOutputFileAlreadyExistException.scala @@ -15,10 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.plans.logical.sql +package org.apache.spark /** - * A SHOW TABLES statement, as parsed from SQL. + * Exception thrown when a task cannot write to output file due to the file already exists. */ -case class ShowTablesStatement(namespace: Option[Seq[String]], pattern: Option[String]) - extends ParsedStatement +private[spark] class TaskOutputFileAlreadyExistException(error: Throwable) extends Exception(error) diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 41ae3ae3b758a..d459627930f4c 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -24,9 +24,9 @@ import java.nio.file.{Files => JavaFiles} import java.nio.file.attribute.PosixFilePermission.{OWNER_EXECUTE, OWNER_READ, OWNER_WRITE} import java.security.SecureRandom import java.security.cert.X509Certificate -import java.util.{Arrays, EnumSet, Properties} +import java.util.{Arrays, EnumSet, Locale, Properties} import java.util.concurrent.{TimeoutException, TimeUnit} -import java.util.jar.{JarEntry, JarOutputStream} +import java.util.jar.{JarEntry, JarOutputStream, Manifest} import javax.net.ssl._ import javax.tools.{JavaFileObject, 
SimpleJavaFileObject, ToolProvider} @@ -42,7 +42,6 @@ import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods.{compact, render} import org.apache.spark.executor.TaskMetrics -import org.apache.spark.internal.config._ import org.apache.spark.scheduler._ import org.apache.spark.util.Utils @@ -98,9 +97,23 @@ private[spark] object TestUtils { * Create a jar file that contains this set of files. All files will be located in the specified * directory or at the root of the jar. */ - def createJar(files: Seq[File], jarFile: File, directoryPrefix: Option[String] = None): URL = { + def createJar( + files: Seq[File], + jarFile: File, + directoryPrefix: Option[String] = None, + mainClass: Option[String] = None): URL = { + val manifest = mainClass match { + case Some(mc) => + val m = new Manifest() + m.getMainAttributes.putValue("Manifest-Version", "1.0") + m.getMainAttributes.putValue("Main-Class", mc) + m + case None => + new Manifest() + } + val jarFileStream = new FileOutputStream(jarFile) - val jarStream = new JarOutputStream(jarFileStream, new java.util.jar.Manifest()) + val jarStream = new JarOutputStream(jarFileStream, manifest) for (file <- files) { // The `name` for the argument in `JarEntry` should use / for its separator. This is @@ -201,12 +214,20 @@ private[spark] object TestUtils { * Asserts that exception message contains the message. Please note this checks all * exceptions in the tree. 
*/ - def assertExceptionMsg(exception: Throwable, msg: String): Unit = { + def assertExceptionMsg(exception: Throwable, msg: String, ignoreCase: Boolean = false): Unit = { + def contain(msg1: String, msg2: String): Boolean = { + if (ignoreCase) { + msg1.toLowerCase(Locale.ROOT).contains(msg2.toLowerCase(Locale.ROOT)) + } else { + msg1.contains(msg2) + } + } + var e = exception - var contains = e.getMessage.contains(msg) + var contains = contain(e.getMessage, msg) while (e.getCause != null && !contains) { e = e.getCause - contains = e.getMessage.contains(msg) + contains = contain(e.getMessage, msg) } assert(contains, s"Exception tree doesn't contain the expected message: $msg") } @@ -226,6 +247,16 @@ private[spark] object TestUtils { url: URL, method: String = "GET", headers: Seq[(String, String)] = Nil): Int = { + withHttpConnection(url, method, headers = headers) { connection => + connection.getResponseCode() + } + } + + def withHttpConnection[T]( + url: URL, + method: String = "GET", + headers: Seq[(String, String)] = Nil) + (fn: HttpURLConnection => T): T = { val connection = url.openConnection().asInstanceOf[HttpURLConnection] connection.setRequestMethod(method) headers.foreach { case (k, v) => connection.setRequestProperty(k, v) } @@ -235,8 +266,10 @@ private[spark] object TestUtils { val sslCtx = SSLContext.getInstance("SSL") val trustManager = new X509TrustManager { override def getAcceptedIssuers(): Array[X509Certificate] = null - override def checkClientTrusted(x509Certificates: Array[X509Certificate], s: String) {} - override def checkServerTrusted(x509Certificates: Array[X509Certificate], s: String) {} + override def checkClientTrusted(x509Certificates: Array[X509Certificate], + s: String): Unit = {} + override def checkServerTrusted(x509Certificates: Array[X509Certificate], + s: String): Unit = {} } val verifier = new HostnameVerifier() { override def verify(hostname: String, session: SSLSession): Boolean = true @@ -248,7 +281,7 @@ private[spark] object 
TestUtils { try { connection.connect() - connection.getResponseCode() + fn(connection) } finally { connection.disconnect() } @@ -264,7 +297,7 @@ private[spark] object TestUtils { try { body(listener) } finally { - sc.listenerBus.waitUntilEmpty(TimeUnit.SECONDS.toMillis(10)) + sc.listenerBus.waitUntilEmpty() sc.listenerBus.removeListener(listener) } } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala index 317f3c51d0154..aa01374a2f2e8 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala @@ -791,7 +791,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[F], - conf: JobConf) { + conf: JobConf): Unit = { rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) } @@ -800,7 +800,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) path: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[F]) { + outputFormatClass: Class[F]): Unit = { rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) } @@ -810,7 +810,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[F], - codec: Class[_ <: CompressionCodec]) { + codec: Class[_ <: CompressionCodec]): Unit = { rdd.saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass, codec) } @@ -820,7 +820,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[F], - conf: Configuration) { + conf: Configuration): Unit = { rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass, conf) } @@ -828,7 +828,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * Output the RDD to any Hadoop-supported storage system, using * a Configuration object for that storage system. 
*/ - def saveAsNewAPIHadoopDataset(conf: Configuration) { + def saveAsNewAPIHadoopDataset(conf: Configuration): Unit = { rdd.saveAsNewAPIHadoopDataset(conf) } @@ -837,7 +837,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) path: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[F]) { + outputFormatClass: Class[F]): Unit = { rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, outputFormatClass) } @@ -847,7 +847,7 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)]) * (e.g. a table name to write to) in the same way as it would be configured for a Hadoop * MapReduce job. */ - def saveAsHadoopDataset(conf: JobConf) { + def saveAsHadoopDataset(conf: JobConf): Unit = { rdd.saveAsHadoopDataset(conf) } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala index 5ba821935ac69..1ca5262742665 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala @@ -347,7 +347,7 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable { /** * Applies a function f to all elements of this RDD. */ - def foreach(f: VoidFunction[T]) { + def foreach(f: VoidFunction[T]): Unit = { rdd.foreach(x => f.call(x)) } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala index 330c2f6e6117e..149def29b8fbd 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala @@ -546,7 +546,7 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { def broadcast[T](value: T): Broadcast[T] = sc.broadcast(value)(fakeClassTag) /** Shut down the SparkContext. 
*/ - def stop() { + def stop(): Unit = { sc.stop() } @@ -567,7 +567,7 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { * * @note A path can be added only once. Subsequent additions of the same path are ignored. */ - def addFile(path: String) { + def addFile(path: String): Unit = { sc.addFile(path) } @@ -593,7 +593,7 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { * * @note A path can be added only once. Subsequent additions of the same path are ignored. */ - def addJar(path: String) { + def addJar(path: String): Unit = { sc.addJar(path) } @@ -609,9 +609,9 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { /** * Set the directory under which RDDs are going to be checkpointed. The directory must - * be a HDFS path if running on a cluster. + * be an HDFS path if running on a cluster. */ - def setCheckpointDir(dir: String) { + def setCheckpointDir(dir: String): Unit = { sc.setCheckpointDir(dir) } @@ -631,14 +631,14 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { /** * Pass-through to SparkContext.setCallSite. For API support only. */ - def setCallSite(site: String) { + def setCallSite(site: String): Unit = { sc.setCallSite(site) } /** * Pass-through to SparkContext.setCallSite. For API support only. */ - def clearCallSite() { + def clearCallSite(): Unit = { sc.clearCallSite() } @@ -669,7 +669,7 @@ class JavaSparkContext(val sc: SparkContext) extends Closeable { * @param logLevel The desired log level as a string. 
* Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN */ - def setLogLevel(logLevel: String) { + def setLogLevel(logLevel: String): Unit = { sc.setLogLevel(logLevel) } diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala index fd96052f95d3f..e9c77f4086d0d 100644 --- a/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala +++ b/core/src/main/scala/org/apache/spark/api/java/JavaUtils.scala @@ -81,7 +81,7 @@ private[spark] object JavaUtils { } } - override def remove() { + override def remove(): Unit = { prev match { case Some(k) => underlying match { diff --git a/core/src/main/scala/org/apache/spark/api/python/Py4JServer.scala b/core/src/main/scala/org/apache/spark/api/python/Py4JServer.scala new file mode 100644 index 0000000000000..db440b1178920 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/python/Py4JServer.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.api.python + +import java.net.InetAddress +import java.util.Locale + +import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging +import org.apache.spark.util.Utils + +/** + * A wrapper for both GatewayServer, and ClientServer to pin Python thread to JVM thread. + */ +private[spark] class Py4JServer(sparkConf: SparkConf) extends Logging { + private[spark] val secret: String = Utils.createSecret(sparkConf) + + // Launch a Py4J gateway or client server for the process to connect to; this will let it see our + // Java system properties and such + private val localhost = InetAddress.getLoopbackAddress() + private[spark] val server = if (sys.env.getOrElse( + "PYSPARK_PIN_THREAD", "false").toLowerCase(Locale.ROOT) == "true") { + new py4j.ClientServer.ClientServerBuilder() + .authToken(secret) + .javaPort(0) + .javaAddress(localhost) + .build() + } else { + new py4j.GatewayServer.GatewayServerBuilder() + .authToken(secret) + .javaPort(0) + .javaAddress(localhost) + .callbackClient(py4j.GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) + .build() + } + + def start(): Unit = server match { + case clientServer: py4j.ClientServer => clientServer.startServer() + case gatewayServer: py4j.GatewayServer => gatewayServer.start() + case other => throw new RuntimeException(s"Unexpected Py4J server ${other.getClass}") + } + + def getListeningPort: Int = server match { + case clientServer: py4j.ClientServer => clientServer.getJavaServer.getListeningPort + case gatewayServer: py4j.GatewayServer => gatewayServer.getListeningPort + case other => throw new RuntimeException(s"Unexpected Py4J server ${other.getClass}") + } + + def shutdown(): Unit = server match { + case clientServer: py4j.ClientServer => clientServer.shutdown() + case gatewayServer: py4j.GatewayServer => gatewayServer.shutdown() + case other => throw new RuntimeException(s"Unexpected Py4J server ${other.getClass}") + } +} diff --git 
a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala index 9ddc4a4910180..ed70e26e2520d 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonGatewayServer.scala @@ -18,18 +18,14 @@ package org.apache.spark.api.python import java.io.{DataOutputStream, File, FileOutputStream} -import java.net.InetAddress import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Files -import py4j.GatewayServer - import org.apache.spark.SparkConf import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils /** - * Process that starts a Py4J GatewayServer on an ephemeral port. + * Process that starts a Py4J server on an ephemeral port. * * This process is launched (via SparkSubmit) by the PySpark driver (see java_gateway.py). */ @@ -37,23 +33,13 @@ private[spark] object PythonGatewayServer extends Logging { initializeLogIfNecessary(true) def main(args: Array[String]): Unit = { - val secret = Utils.createSecret(new SparkConf()) - - // Start a GatewayServer on an ephemeral port. Make sure the callback client is configured - // with the same secret, in case the app needs callbacks from the JVM to the underlying - // python processes. 
- val localhost = InetAddress.getLoopbackAddress() - val gatewayServer: GatewayServer = new GatewayServer.GatewayServerBuilder() - .authToken(secret) - .javaPort(0) - .javaAddress(localhost) - .callbackClient(GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) - .build() + val sparkConf = new SparkConf() + val gatewayServer: Py4JServer = new Py4JServer(sparkConf) gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { - logError("GatewayServer failed to bind; exiting") + logError(s"${gatewayServer.server.getClass} failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") @@ -68,7 +54,7 @@ private[spark] object PythonGatewayServer extends Logging { val dos = new DataOutputStream(new FileOutputStream(tmpPath)) dos.writeInt(boundPort) - val secretBytes = secret.getBytes(UTF_8) + val secretBytes = gatewayServer.secret.getBytes(UTF_8) dos.writeInt(secretBytes.length) dos.write(secretBytes, 0, secretBytes.length) dos.close() diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala index 4d76ff76e6752..6dc1721f56adf 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala @@ -24,6 +24,7 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConverters._ import scala.collection.mutable +import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.hadoop.conf.Configuration @@ -179,15 +180,22 @@ private[spark] object PythonRDD extends Logging { * data collected from this job, the secret for authentication, and a socket auth * server object that can be used to join the JVM serving thread in Python. 
*/ - def toLocalIteratorAndServe[T](rdd: RDD[T]): Array[Any] = { + def toLocalIteratorAndServe[T](rdd: RDD[T], prefetchPartitions: Boolean = false): Array[Any] = { val handleFunc = (sock: Socket) => { val out = new DataOutputStream(sock.getOutputStream) val in = new DataInputStream(sock.getInputStream) Utils.tryWithSafeFinallyAndFailureCallbacks(block = { // Collects a partition on each iteration val collectPartitionIter = rdd.partitions.indices.iterator.map { i => - rdd.sparkContext.runJob(rdd, (iter: Iterator[Any]) => iter.toArray, Seq(i)).head + var result: Array[Any] = null + rdd.sparkContext.submitJob( + rdd, + (iter: Iterator[Any]) => iter.toArray, + Seq(i), // The partition we are evaluating + (_, res: Array[Any]) => result = res, + result) } + val prefetchIter = collectPartitionIter.buffered // Write data until iteration is complete, client stops iteration, or error occurs var complete = false @@ -196,10 +204,15 @@ private[spark] object PythonRDD extends Logging { // Read request for data, value of zero will stop iteration or non-zero to continue if (in.readInt() == 0) { complete = true - } else if (collectPartitionIter.hasNext) { + } else if (prefetchIter.hasNext) { // Client requested more data, attempt to collect the next partition - val partitionArray = collectPartitionIter.next() + val partitionFuture = prefetchIter.next() + // Cause the next job to be submitted if prefetchPartitions is enabled. 
+ if (prefetchPartitions) { + prefetchIter.headOption + } + val partitionArray = ThreadUtils.awaitResult(partitionFuture, Duration.Inf) // Send response there is a partition to read out.writeInt(1) @@ -245,7 +258,7 @@ private[spark] object PythonRDD extends Logging { new PythonBroadcast(path) } - def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream) { + def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Unit = { def write(obj: Any): Unit = obj match { case null => @@ -431,7 +444,7 @@ private[spark] object PythonRDD extends Logging { } } - def writeUTF(str: String, dataOut: DataOutputStream) { + def writeUTF(str: String, dataOut: DataOutputStream): Unit = { val bytes = str.getBytes(StandardCharsets.UTF_8) dataOut.writeInt(bytes.length) dataOut.write(bytes) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index d2a10df7acbd3..658e0d593a167 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -48,6 +48,7 @@ private[spark] object PythonEvalType { val SQL_WINDOW_AGG_PANDAS_UDF = 203 val SQL_SCALAR_PANDAS_ITER_UDF = 204 val SQL_MAP_PANDAS_ITER_UDF = 205 + val SQL_COGROUPED_MAP_PANDAS_UDF = 206 def toString(pythonEvalType: Int): String = pythonEvalType match { case NON_UDF => "NON_UDF" @@ -58,6 +59,7 @@ private[spark] object PythonEvalType { case SQL_WINDOW_AGG_PANDAS_UDF => "SQL_WINDOW_AGG_PANDAS_UDF" case SQL_SCALAR_PANDAS_ITER_UDF => "SQL_SCALAR_PANDAS_ITER_UDF" case SQL_MAP_PANDAS_ITER_UDF => "SQL_MAP_PANDAS_ITER_UDF" + case SQL_COGROUPED_MAP_PANDAS_UDF => "SQL_COGROUPED_MAP_PANDAS_UDF" } } @@ -192,7 +194,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( def exception: Option[Throwable] = Option(_exception) /** Terminates the writer thread, ignoring any exceptions that may occur due to cleanup. 
*/ - def shutdownOnTaskCompletion() { + def shutdownOnTaskCompletion(): Unit = { assert(context.isCompleted) this.interrupt() } @@ -410,7 +412,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( } } - def writeUTF(str: String, dataOut: DataOutputStream) { + def writeUTF(str: String, dataOut: DataOutputStream): Unit = { val bytes = str.getBytes(UTF_8) dataOut.writeInt(bytes.length) dataOut.write(bytes) @@ -529,7 +531,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( setDaemon(true) - override def run() { + override def run(): Unit = { // Kill the worker if it is interrupted, checking until task completion. // TODO: This has a race condition if interruption occurs, as completed may still become true. while (!context.isInterrupted && !context.isCompleted) { @@ -609,7 +611,7 @@ private[spark] class PythonRunner(funcs: Seq[ChainedPythonFunctions]) val obj = new Array[Byte](length) stream.readFully(obj) obj - case 0 => Array.empty[Byte] + case 0 => Array.emptyByteArray case SpecialLengths.TIMING_DATA => handleTimingData() read() diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala index 6c37844a088ce..df236ba8926c1 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala @@ -189,7 +189,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String null } - private def startDaemon() { + private def startDaemon(): Unit = { self.synchronized { // Is it already running? 
if (daemon != null) { @@ -212,8 +212,13 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String try { daemonPort = in.readInt() } catch { + case _: EOFException if daemon.isAlive => + throw new SparkException("EOFException occurred while reading the port number " + + s"from $daemonModule's stdout") case _: EOFException => - throw new SparkException(s"No port number in $daemonModule's stdout") + throw new SparkException( + s"EOFException occurred while reading the port number from $daemonModule's" + + s" stdout and terminated with code: ${daemon.exitValue}.") } // test that the returned port number is within a valid range. @@ -271,7 +276,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String /** * Redirect the given streams to our stderr in separate threads. */ - private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream) { + private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream): Unit = { try { new RedirectThread(stdout, System.err, "stdout reader for " + pythonExec).start() new RedirectThread(stderr, System.err, "stderr reader for " + pythonExec).start() @@ -288,7 +293,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String setDaemon(true) - override def run() { + override def run(): Unit = { while (true) { self.synchronized { if (IDLE_WORKER_TIMEOUT_NS < System.nanoTime() - lastActivityNs) { @@ -301,7 +306,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String } } - private def cleanupIdleWorkers() { + private def cleanupIdleWorkers(): Unit = { while (idleWorkers.nonEmpty) { val worker = idleWorkers.dequeue() try { @@ -314,7 +319,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String } } - private def stopDaemon() { + private def stopDaemon(): Unit = { self.synchronized { if (useDaemon) { cleanupIdleWorkers() @@ -332,11 +337,11 @@ private[spark] class 
PythonWorkerFactory(pythonExec: String, envVars: Map[String } } - def stop() { + def stop(): Unit = { stopDaemon() } - def stopWorker(worker: Socket) { + def stopWorker(worker: Socket): Unit = { self.synchronized { if (useDaemon) { if (daemon != null) { @@ -355,7 +360,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String worker.close() } - def releaseWorker(worker: Socket) { + def releaseWorker(worker: Socket): Unit = { if (useDaemon) { self.synchronized { lastActivityNs = System.nanoTime() diff --git a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala index 86965dbc2e778..4e790b364e1d2 100644 --- a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala +++ b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTestDataGenerator.scala @@ -37,11 +37,11 @@ case class TestWritable(var str: String, var int: Int, var double: Double) exten def this() = this("", 0, 0.0) def getStr: String = str - def setStr(str: String) { this.str = str } + def setStr(str: String): Unit = { this.str = str } def getInt: Int = int - def setInt(int: Int) { this.int = int } + def setInt(int: Int): Unit = { this.int = int } def getDouble: Double = double - def setDouble(double: Double) { this.double = double } + def setDouble(double: Double): Unit = { this.double = double } def write(out: DataOutput): Unit = { out.writeUTF(str) @@ -106,13 +106,13 @@ private[python] class WritableToDoubleArrayConverter extends Converter[Any, Arra */ object WriteInputFormatTestDataGenerator { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val path = args(0) val sc = new JavaSparkContext("local[4]", "test-writables") generateData(path, sc) } - def generateData(path: String, jsc: JavaSparkContext) { + def generateData(path: String, jsc: JavaSparkContext): Unit = { val sc = jsc.sc val 
basePath = s"$path/sftestdata/" diff --git a/core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala b/core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala index f96c5215cf0af..fdfe5f5b41d0a 100644 --- a/core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/BaseRRunner.scala @@ -82,12 +82,7 @@ private[spark] abstract class BaseRRunner[IN, OUT]( serverSocket.close() } - try { - newReaderIterator(dataStream, errThread) - } catch { - case e: Exception => - throw new SparkException("R computation failed with\n " + errThread.getLines(), e) - } + newReaderIterator(dataStream, errThread) } /** @@ -138,6 +133,16 @@ private[spark] abstract class BaseRRunner[IN, OUT]( * and then returns null. */ protected def read(): OUT + + protected val handleException: PartialFunction[Throwable, OUT] = { + case e: Exception => + var msg = "R unexpectedly exited." + val lines = errThread.getLines() + if (lines.trim().nonEmpty) { + msg += s"\nR worker produced errors: $lines\n" + } + throw new SparkException(msg, e) + } } /** @@ -230,7 +235,7 @@ private[spark] class BufferedStreamThread( errBufferSize: Int) extends Thread(name) with Logging { val lines = new Array[String](errBufferSize) var lineIdx = 0 - override def run() { + override def run(): Unit = { for (line <- Source.fromInputStream(in).getLines) { synchronized { lines(lineIdx) = line diff --git a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala index 0327386b45ed5..20ab6fc2f348d 100644 --- a/core/src/main/scala/org/apache/spark/api/r/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/r/RRunner.scala @@ -125,10 +125,7 @@ private[spark] class RRunner[IN, OUT]( eos = true null.asInstanceOf[OUT] } - } catch { - case eof: EOFException => - throw new SparkException("R worker exited unexpectedly (cranshed)", eof) - } + } catch handleException } } } diff --git 
a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala index 0e81ad198db67..9ef6c7c5906a2 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala @@ -74,7 +74,7 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo * Asynchronously delete cached copies of this broadcast on the executors. * If the broadcast is used after this is called, it will need to be re-sent to each executor. */ - def unpersist() { + def unpersist(): Unit = { unpersist(blocking = false) } @@ -83,7 +83,7 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo * this is called, it will need to be re-sent to each executor. * @param blocking Whether to block until unpersisting has completed */ - def unpersist(blocking: Boolean) { + def unpersist(blocking: Boolean): Unit = { assertValid() doUnpersist(blocking) } @@ -93,7 +93,7 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo * Destroy all data and metadata related to this broadcast variable. Use this with caution; * once a broadcast variable has been destroyed, it cannot be used again. */ - def destroy() { + def destroy(): Unit = { destroy(blocking = false) } @@ -102,7 +102,7 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo * once a broadcast variable has been destroyed, it cannot be used again. * @param blocking Whether to block until destroy has completed */ - private[spark] def destroy(blocking: Boolean) { + private[spark] def destroy(blocking: Boolean): Unit = { assertValid() _isValid = false _destroySite = Utils.getCallSite().shortForm @@ -128,17 +128,17 @@ abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Lo * Actually unpersist the broadcasted value on the executors. 
Concrete implementations of * Broadcast class must define their own logic to unpersist their own data. */ - protected def doUnpersist(blocking: Boolean) + protected def doUnpersist(blocking: Boolean): Unit /** * Actually destroy all data and metadata related to this broadcast variable. * Implementation of Broadcast class must define their own logic to destroy their own * state. */ - protected def doDestroy(blocking: Boolean) + protected def doDestroy(blocking: Boolean): Unit /** Check if this broadcast is valid. If not valid, exception is thrown. */ - protected def assertValid() { + protected def assertValid(): Unit = { if (!_isValid) { throw new SparkException( "Attempted to use %s after it was destroyed (%s) ".format(toString, _destroySite)) diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala index 9fa47451c1831..c93cadf1ab3e8 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala @@ -40,7 +40,7 @@ private[spark] class BroadcastManager( initialize() // Called by SparkContext or Executor before using Broadcast - private def initialize() { + private def initialize(): Unit = { synchronized { if (!initialized) { broadcastFactory = new TorrentBroadcastFactory @@ -50,7 +50,7 @@ private[spark] class BroadcastManager( } } - def stop() { + def stop(): Unit = { broadcastFactory.stop() } @@ -77,7 +77,7 @@ private[spark] class BroadcastManager( broadcastFactory.newBroadcast[T](value_, isLocal, bid) } - def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) { + def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = { broadcastFactory.unbroadcast(id, removeFromDriver, blocking) } } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala 
b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 1379314ba1b53..77fbbc08c2103 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -73,7 +73,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) /** Size of each block. Default value is 4MB. This value is only read by the broadcaster. */ @transient private var blockSize: Int = _ - private def setConf(conf: SparkConf) { + private def setConf(conf: SparkConf): Unit = { compressionCodec = if (conf.get(config.BROADCAST_COMPRESS)) { Some(CompressionCodec.createCodec(conf)) } else { @@ -196,7 +196,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) /** * Remove all persisted state associated with this Torrent broadcast on the executors. */ - override protected def doUnpersist(blocking: Boolean) { + override protected def doUnpersist(blocking: Boolean): Unit = { TorrentBroadcast.unpersist(id, removeFromDriver = false, blocking) } @@ -204,7 +204,7 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) * Remove all persisted state associated with this Torrent broadcast on the executors * and driver. 
*/ - override protected def doDestroy(blocking: Boolean) { + override protected def doDestroy(blocking: Boolean): Unit = { TorrentBroadcast.unpersist(id, removeFromDriver = true, blocking) } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala index b11f9ba171b84..65fb5186afae1 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcastFactory.scala @@ -28,20 +28,21 @@ import org.apache.spark.{SecurityManager, SparkConf} */ private[spark] class TorrentBroadcastFactory extends BroadcastFactory { - override def initialize(isDriver: Boolean, conf: SparkConf, securityMgr: SecurityManager) { } + override def initialize(isDriver: Boolean, conf: SparkConf, + securityMgr: SecurityManager): Unit = { } override def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean, id: Long): Broadcast[T] = { new TorrentBroadcast[T](value_, id) } - override def stop() { } + override def stop(): Unit = { } /** * Remove all persisted state associated with the torrent broadcast with the given ID. * @param removeFromDriver Whether to remove state from the driver. 
* @param blocking Whether to block until unbroadcasted */ - override def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) { + override def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = { TorrentBroadcast.unpersist(id, removeFromDriver, blocking) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index 648a8b1c763db..7022b986ea025 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -219,7 +219,7 @@ private class ClientEndpoint( * Executable utility for starting and terminating drivers inside of a standalone cluster. */ object Client { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { // scalastyle:off println if (!sys.props.contains("SPARK_SUBMIT")) { println("WARNING: This client is deprecated and will be removed in a future version of Spark") diff --git a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala index a86ee66fb72b9..9d6bbf91168da 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ClientArguments.scala @@ -100,7 +100,7 @@ private[deploy] class ClientArguments(args: Array[String]) { /** * Print usage and exit JVM with the given exit code. */ - private def printUsageAndExit(exitCode: Int) { + private def printUsageAndExit(exitCode: Int): Unit = { // TODO: It wouldn't be too hard to allow users to submit their app and dependency jars // separately similar to in the YARN client. 
val usage = diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index fba371dcfb761..18305ad3746a6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -60,6 +60,15 @@ private[deploy] object DeployMessages { assert (port > 0) } + /** + * @param id the worker id + * @param worker the worker endpoint ref + */ + case class WorkerDecommission( + id: String, + worker: RpcEndpointRef) + extends DeployMessage + case class ExecutorStateChanged( appId: String, execId: Int, @@ -149,6 +158,8 @@ private[deploy] object DeployMessages { case object ReregisterWithMaster // used when a worker attempts to reconnect to a master + case object DecommissionSelf // Mark as decommissioned. May be Master to Worker in the future. + // AppClient to Master case class RegisterApplication(appDescription: ApplicationDescription, driver: RpcEndpointRef) diff --git a/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala b/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala index 69c98e28931d7..0751bcf221f86 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExecutorState.scala @@ -19,9 +19,13 @@ package org.apache.spark.deploy private[deploy] object ExecutorState extends Enumeration { - val LAUNCHING, RUNNING, KILLED, FAILED, LOST, EXITED = Value + val LAUNCHING, RUNNING, KILLED, FAILED, LOST, EXITED, DECOMMISSIONED = Value type ExecutorState = Value - def isFinished(state: ExecutorState): Boolean = Seq(KILLED, FAILED, LOST, EXITED).contains(state) + // DECOMMISSIONED isn't listed as finished since we don't want to remove the executor from + // the worker and the executor still exists - but we do want to avoid scheduling new tasks on it. 
+ private val finishedStates = Seq(KILLED, FAILED, LOST, EXITED) + + def isFinished(state: ExecutorState): Boolean = finishedStates.contains(state) } diff --git a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala index 64277e8de2a4d..ebfff89308886 100644 --- a/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala +++ b/core/src/main/scala/org/apache/spark/deploy/ExternalShuffleService.scala @@ -87,14 +87,14 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana } /** Starts the external shuffle service if the user has configured us to. */ - def startIfEnabled() { + def startIfEnabled(): Unit = { if (enabled) { start() } } /** Start the external shuffle service */ - def start() { + def start(): Unit = { require(server == null, "Shuffle server already started") val authEnabled = securityManager.isAuthenticationEnabled() logInfo(s"Starting shuffle service on port $port (auth enabled = $authEnabled)") @@ -125,7 +125,7 @@ class ExternalShuffleService(sparkConf: SparkConf, securityManager: SecurityMana blockHandler.executorRemoved(executorId, appId) } - def stop() { + def stop(): Unit = { if (server != null) { server.close() server = null diff --git a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala index 99f841234005e..6ff68b694f8f3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/FaultToleranceTest.scala @@ -78,7 +78,7 @@ private object FaultToleranceTest extends App with Logging { System.setProperty(config.DRIVER_HOST_ADDRESS.key, "172.17.42.1") // default docker host ip - private def afterEach() { + private def afterEach(): Unit = { if (sc != null) { sc.stop() sc = null @@ -180,7 +180,7 @@ private object FaultToleranceTest extends App with Logging { } } 
- private def test(name: String)(fn: => Unit) { + private def test(name: String)(fn: => Unit): Unit = { try { fn numPassed += 1 @@ -198,12 +198,12 @@ private object FaultToleranceTest extends App with Logging { afterEach() } - private def addMasters(num: Int) { + private def addMasters(num: Int): Unit = { logInfo(s">>>>> ADD MASTERS $num <<<<<") (1 to num).foreach { _ => masters += SparkDocker.startMaster(dockerMountDir) } } - private def addWorkers(num: Int) { + private def addWorkers(num: Int): Unit = { logInfo(s">>>>> ADD WORKERS $num <<<<<") val masterUrls = getMasterUrls(masters) (1 to num).foreach { _ => workers += SparkDocker.startWorker(dockerMountDir, masterUrls) } @@ -239,7 +239,7 @@ private object FaultToleranceTest extends App with Logging { private def delay(secs: Duration = 5.seconds) = Thread.sleep(secs.toMillis) - private def terminateCluster() { + private def terminateCluster(): Unit = { logInfo(">>>>> TERMINATE CLUSTER <<<<<") masters.foreach(_.kill()) workers.foreach(_.kill()) @@ -326,7 +326,7 @@ private object FaultToleranceTest extends App with Logging { } } - private def assertTrue(bool: Boolean, message: String = "") { + private def assertTrue(bool: Boolean, message: String = ""): Unit = { if (!bool) { throw new IllegalStateException("Assertion failed: " + message) } @@ -346,7 +346,7 @@ private class TestMasterInfo(val ip: String, val dockerId: DockerId, val logFile logDebug("Created master: " + this) - def readState() { + def readState(): Unit = { try { val masterStream = new InputStreamReader( new URL("http://%s:8080/json".format(ip)).openStream, StandardCharsets.UTF_8) @@ -372,7 +372,7 @@ private class TestMasterInfo(val ip: String, val dockerId: DockerId, val logFile } } - def kill() { Docker.kill(dockerId) } + def kill(): Unit = { Docker.kill(dockerId) } override def toString: String = "[ip=%s, id=%s, logFile=%s, state=%s]". 
@@ -386,7 +386,7 @@ private class TestWorkerInfo(val ip: String, val dockerId: DockerId, val logFile logDebug("Created worker: " + this) - def kill() { Docker.kill(dockerId) } + def kill(): Unit = { Docker.kill(dockerId) } override def toString: String = "[ip=%s, id=%s, logFile=%s]".format(ip, dockerId, logFile.getAbsolutePath) diff --git a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala index f1b58eb33a1b7..fc849d7f4372f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala +++ b/core/src/main/scala/org/apache/spark/deploy/LocalSparkCluster.scala @@ -72,7 +72,7 @@ class LocalSparkCluster( masters } - def stop() { + def stop(): Unit = { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 8055a6270dac8..574ce60b19b4e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy import java.io.File -import java.net.{InetAddress, URI} +import java.net.URI import java.nio.file.Files import scala.collection.JavaConverters._ @@ -26,7 +26,7 @@ import scala.collection.mutable.ArrayBuffer import scala.util.Try import org.apache.spark.{SparkConf, SparkUserAppException} -import org.apache.spark.api.python.PythonUtils +import org.apache.spark.api.python.{Py4JServer, PythonUtils} import org.apache.spark.internal.config._ import org.apache.spark.util.{RedirectThread, Utils} @@ -35,12 +35,11 @@ import org.apache.spark.util.{RedirectThread, Utils} * subprocess and then has it connect back to the JVM to access system properties, etc. 
*/ object PythonRunner { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val pythonFile = args(0) val pyFiles = args(1) val otherArgs = args.slice(2, args.length) val sparkConf = new SparkConf() - val secret = Utils.createSecret(sparkConf) val pythonExec = sparkConf.get(PYSPARK_DRIVER_PYTHON) .orElse(sparkConf.get(PYSPARK_PYTHON)) .orElse(sys.env.get("PYSPARK_DRIVER_PYTHON")) @@ -51,15 +50,8 @@ object PythonRunner { val formattedPythonFile = formatPath(pythonFile) val formattedPyFiles = resolvePyFiles(formatPaths(pyFiles)) - // Launch a Py4J gateway server for the process to connect to; this will let it see our - // Java system properties and such - val localhost = InetAddress.getLoopbackAddress() - val gatewayServer = new py4j.GatewayServer.GatewayServerBuilder() - .authToken(secret) - .javaPort(0) - .javaAddress(localhost) - .callbackClient(py4j.GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) - .build() + val gatewayServer = new Py4JServer(sparkConf) + val thread = new Thread(() => Utils.logUncaughtExceptions { gatewayServer.start() }) thread.setName("py4j-gateway-init") thread.setDaemon(true) @@ -86,7 +78,7 @@ object PythonRunner { // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: env.put("PYTHONUNBUFFERED", "YES") // value is needed to be set to a non-empty string env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort) - env.put("PYSPARK_GATEWAY_SECRET", secret) + env.put("PYSPARK_GATEWAY_SECRET", gatewayServer.secret) // pass conf spark.pyspark.python to python process, the only way to pass info to // python process is through environment variable. 
sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _)) diff --git a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala index 60ba0470a628a..b32f9ea3b4747 100644 --- a/core/src/main/scala/org/apache/spark/deploy/RRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/RRunner.scala @@ -73,7 +73,7 @@ object RRunner { @volatile var sparkRBackendSecret: String = null val initialized = new Semaphore(0) val sparkRBackendThread = new Thread("SparkR backend") { - override def run() { + override def run(): Unit = { val (port, authHelper) = sparkRBackend.init() sparkRBackendPort = port sparkRBackendSecret = authHelper.secret diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala index 8118c01eb712f..b89ae1b35e693 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkCuratorUtil.scala @@ -45,7 +45,7 @@ private[spark] object SparkCuratorUtil extends Logging { zk } - def mkdir(zk: CuratorFramework, path: String) { + def mkdir(zk: CuratorFramework, path: String): Unit = { if (zk.checkExists().forPath(path) == null) { try { zk.create().creatingParentsIfNeeded().forPath(path) @@ -57,7 +57,7 @@ private[spark] object SparkCuratorUtil extends Logging { } } - def deleteRecursive(zk: CuratorFramework, path: String) { + def deleteRecursive(zk: CuratorFramework, path: String): Unit = { if (zk.checkExists().forPath(path) != null) { for (child <- zk.getChildren.forPath(path).asScala) { zk.delete().forPath(path + "/" + child) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 11420bb985520..1180501e8c738 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ 
b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -57,7 +57,7 @@ private[spark] class SparkHadoopUtil extends Logging { * you need to look https://issues.apache.org/jira/browse/HDFS-3545 and possibly * do a FileSystem.closeAllForUGI in order to avoid leaking Filesystems */ - def runAsSparkUser(func: () => Unit) { + def runAsSparkUser(func: () => Unit): Unit = { createSparkUser().doAs(new PrivilegedExceptionAction[Unit] { def run: Unit = func() }) @@ -71,7 +71,7 @@ private[spark] class SparkHadoopUtil extends Logging { ugi } - def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { + def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation): Unit = { dest.addCredentials(source.getCredentials()) } @@ -79,8 +79,10 @@ private[spark] class SparkHadoopUtil extends Logging { * Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop * configuration. */ - def appendS3AndSparkHadoopConfigurations(conf: SparkConf, hadoopConf: Configuration): Unit = { - SparkHadoopUtil.appendS3AndSparkHadoopConfigurations(conf, hadoopConf) + def appendS3AndSparkHadoopHiveConfigurations( + conf: SparkConf, + hadoopConf: Configuration): Unit = { + SparkHadoopUtil.appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) } /** @@ -103,6 +105,15 @@ private[spark] class SparkHadoopUtil extends Logging { } } + def appendSparkHiveConfigs( + srcMap: Map[String, String], + destMap: HashMap[String, String]): Unit = { + // Copy any "spark.hive.foo=bar" system properties into destMap as "hive.foo=bar" + for ((key, value) <- srcMap if key.startsWith("spark.hive.")) { + destMap.put(key.substring("spark.".length), value) + } + } + /** * Return an appropriate (subclass) of Configuration. Creating config can initialize some Hadoop * subsystems. 
@@ -140,7 +151,7 @@ private[spark] class SparkHadoopUtil extends Logging { * Add or overwrite current user's credentials with serialized delegation tokens, * also confirms correct hadoop configuration is set. */ - private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf) { + private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf): Unit = { UserGroupInformation.setConfiguration(newConfiguration(sparkConf)) val creds = deserialize(tokens) logInfo("Updating delegation tokens for current user.") @@ -413,11 +424,11 @@ private[spark] object SparkHadoopUtil { */ private[spark] def newConfiguration(conf: SparkConf): Configuration = { val hadoopConf = new Configuration() - appendS3AndSparkHadoopConfigurations(conf, hadoopConf) + appendS3AndSparkHadoopHiveConfigurations(conf, hadoopConf) hadoopConf } - private def appendS3AndSparkHadoopConfigurations( + private def appendS3AndSparkHadoopHiveConfigurations( conf: SparkConf, hadoopConf: Configuration): Unit = { // Note: this null check is around more than just access to the "conf" object to maintain @@ -440,6 +451,7 @@ private[spark] object SparkHadoopUtil { } } appendSparkHadoopConfigs(conf, hadoopConf) + appendSparkHiveConfigs(conf, hadoopConf) val bufferSize = conf.get(BUFFER_SIZE).toString hadoopConf.set("io.file.buffer.size", bufferSize) } @@ -452,37 +464,48 @@ private[spark] object SparkHadoopUtil { } } + private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { + // Copy any "spark.hive.foo=bar" spark properties into conf as "hive.foo=bar" + for ((key, value) <- conf.getAll if key.startsWith("spark.hive.")) { + hadoopConf.set(key.substring("spark.".length), value) + } + } + // scalastyle:off line.size.limit /** - * Create a path that uses replication instead of erasure coding (ec), regardless of the default - * configuration in hdfs for the given path. 
This can be helpful as hdfs ec doesn't support - * hflush(), hsync(), or append() + * Create a file on the given file system, optionally making sure erasure coding is disabled. + * + * Disabling EC can be helpful as HDFS EC doesn't support hflush(), hsync(), or append(). * https://hadoop.apache.org/docs/r3.0.0/hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html#Limitations */ // scalastyle:on line.size.limit - def createNonECFile(fs: FileSystem, path: Path): FSDataOutputStream = { - try { - // Use reflection as this uses APIs only available in Hadoop 3 - val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) - // the builder api does not resolve relative paths, nor does it create parent dirs, while - // the old api does. - if (!fs.mkdirs(path.getParent())) { - throw new IOException(s"Failed to create parents of $path") + def createFile(fs: FileSystem, path: Path, allowEC: Boolean): FSDataOutputStream = { + if (allowEC) { + fs.create(path) + } else { + try { + // Use reflection as this uses APIs only available in Hadoop 3 + val builderMethod = fs.getClass().getMethod("createFile", classOf[Path]) + // the builder api does not resolve relative paths, nor does it create parent dirs, while + // the old api does. + if (!fs.mkdirs(path.getParent())) { + throw new IOException(s"Failed to create parents of $path") + } + val qualifiedPath = fs.makeQualified(path) + val builder = builderMethod.invoke(fs, qualifiedPath) + val builderCls = builder.getClass() + // this may throw a NoSuchMethodException if the path is not on hdfs + val replicateMethod = builderCls.getMethod("replicate") + val buildMethod = builderCls.getMethod("build") + val b2 = replicateMethod.invoke(builder) + buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] + } catch { + case _: NoSuchMethodException => + // No createFile() method, we're using an older hdfs client, which doesn't give us control + // over EC vs. replication. 
Older hdfs doesn't have EC anyway, so just create a file with + // old apis. + fs.create(path) } - val qualifiedPath = fs.makeQualified(path) - val builder = builderMethod.invoke(fs, qualifiedPath) - val builderCls = builder.getClass() - // this may throw a NoSuchMethodException if the path is not on hdfs - val replicateMethod = builderCls.getMethod("replicate") - val buildMethod = builderCls.getMethod("build") - val b2 = replicateMethod.invoke(builder) - buildMethod.invoke(b2).asInstanceOf[FSDataOutputStream] - } catch { - case _: NoSuchMethodException => - // No createFile() method, we're using an older hdfs client, which doesn't give us control - // over EC vs. replication. Older hdfs doesn't have EC anyway, so just create a file with - // old apis. - fs.create(path) } } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 12a8473b22025..8a03af5e38c9b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -23,11 +23,12 @@ import java.net.{URI, URL} import java.security.PrivilegedExceptionAction import java.text.ParseException import java.util.{ServiceLoader, UUID} +import java.util.jar.JarInputStream import scala.annotation.tailrec import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -import scala.util.{Properties, Try} +import scala.util.{Failure, Properties, Success, Try} import org.apache.commons.io.FilenameUtils import org.apache.commons.lang3.StringUtils @@ -229,10 +230,6 @@ private[spark] class SparkSubmit extends Logging { // Set the cluster manager val clusterManager: Int = args.master match { case "yarn" => YARN - case "yarn-client" | "yarn-cluster" => - logWarning(s"Master ${args.master} is deprecated since 2.0." 
+ - " Please use master \"yarn\" with specified deploy mode instead.") - YARN case m if m.startsWith("spark") => STANDALONE case m if m.startsWith("mesos") => MESOS case m if m.startsWith("k8s") => KUBERNETES @@ -251,22 +248,7 @@ private[spark] class SparkSubmit extends Logging { -1 } - // Because the deprecated way of specifying "yarn-cluster" and "yarn-client" encapsulate both - // the master and deploy mode, we have some logic to infer the master and deploy mode - // from each other if only one is specified, or exit early if they are at odds. if (clusterManager == YARN) { - (args.master, args.deployMode) match { - case ("yarn-cluster", null) => - deployMode = CLUSTER - args.master = "yarn" - case ("yarn-cluster", "client") => - error("Client deploy mode is not compatible with master \"yarn-cluster\"") - case ("yarn-client", "cluster") => - error("Cluster deploy mode is not compatible with master \"yarn-client\"") - case (_, mode) => - args.master = "yarn" - } - // Make sure YARN is included in our build if we're trying to use it if (!Utils.classIsLoadable(YARN_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) { error( @@ -456,6 +438,32 @@ private[spark] class SparkSubmit extends Logging { }.orNull } + // At this point, we have attempted to download all remote resources. + // Now we try to resolve the main class if our primary resource is a JAR. + if (args.mainClass == null && !args.isPython && !args.isR) { + try { + val uri = new URI( + Option(localPrimaryResource).getOrElse(args.primaryResource) + ) + val fs = FileSystem.get(uri, hadoopConf) + + Utils.tryWithResource(new JarInputStream(fs.open(new Path(uri)))) { jar => + args.mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") + } + } catch { + case e: Throwable => + error( + s"Failed to get main class in JAR with error '${e.getMessage}'. " + + " Please specify one with --class." + ) + } + + if (args.mainClass == null) { + // If we still can't figure out the main class at this point, blow up. 
+ error("No main class set in JAR; please specify one with --class.") + } + } + // If we're running a python app, set the main class to our specific python runner if (args.isPython && deployMode == CLIENT) { if (args.primaryResource == PYSPARK_SHELL) { @@ -1047,7 +1055,7 @@ object SparkSubmit extends CommandLineUtils with Logging { * Return whether the given primary resource requires running R. */ private[deploy] def isR(res: String): Boolean = { - res != null && res.endsWith(".R") || res == SPARKR_SHELL + res != null && (res.endsWith(".R") || res.endsWith(".r")) || res == SPARKR_SHELL } private[deploy] def isInternal(res: String): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index ed1324baed0f1..3f7cfea778ac6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -19,10 +19,8 @@ package org.apache.spark.deploy import java.io.{ByteArrayOutputStream, File, PrintStream} import java.lang.reflect.InvocationTargetException -import java.net.URI import java.nio.charset.StandardCharsets import java.util.{List => JList} -import java.util.jar.JarFile import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} @@ -139,10 +137,10 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S * Remove keys that don't start with "spark." from `sparkProperties`. 
*/ private def ignoreNonSparkProperties(): Unit = { - sparkProperties.foreach { case (k, v) => + sparkProperties.keys.foreach { k => if (!k.startsWith("spark.")) { sparkProperties -= k - logWarning(s"Ignoring non-spark config property: $k=$v") + logWarning(s"Ignoring non-Spark config property: $k") } } } @@ -211,29 +209,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S dynamicAllocationEnabled = sparkProperties.get(DYN_ALLOCATION_ENABLED.key).exists("true".equalsIgnoreCase) - // Try to set main class from JAR if no --class argument is given - if (mainClass == null && !isPython && !isR && primaryResource != null) { - val uri = new URI(primaryResource) - val uriScheme = uri.getScheme() - - uriScheme match { - case "file" => - try { - Utils.tryWithResource(new JarFile(uri.getPath)) { jar => - // Note that this might still return null if no main-class is set; we catch that later - mainClass = jar.getManifest.getMainAttributes.getValue("Main-Class") - } - } catch { - case _: Exception => - error(s"Cannot load main class from JAR $primaryResource") - } - case _ => - error( - s"Cannot load main class from JAR $primaryResource with URI $uriScheme. " + - "Please specify a class through --class.") - } - } - // Global defaults. These should be keep to minimum to avoid confusing behavior. 
master = Option(master).getOrElse("local[*]") @@ -269,9 +244,6 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S if (primaryResource == null) { error("Must specify a primary resource (JAR or Python or R file)") } - if (mainClass == null && SparkSubmit.isUserJar(primaryResource)) { - error("No main class set in JAR; please specify one with --class") - } if (driverMemory != null && Try(JavaUtils.byteStringAsBytes(driverMemory)).getOrElse(-1L) <= 0) { error("Driver memory must be a positive number") diff --git a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala index d6f9618af4aac..65bf4351ebfd9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/StandaloneResourceUtils.scala @@ -208,7 +208,7 @@ private[spark] object StandaloneResourceUtils extends Logging { } val newAllocation = { val allocations = newAssignments.map { case (rName, addresses) => - ResourceAllocation(ResourceID(componentName, rName), addresses) + ResourceAllocation(new ResourceID(componentName, rName), addresses) }.toSeq StandaloneResourceAllocation(pid, allocations) } @@ -348,7 +348,7 @@ private[spark] object StandaloneResourceUtils extends Logging { val compShortName = componentName.substring(componentName.lastIndexOf(".") + 1) val tmpFile = Utils.tempFileWith(dir) val allocations = resources.map { case (rName, rInfo) => - ResourceAllocation(ResourceID(componentName, rName), rInfo.addresses) + ResourceAllocation(new ResourceID(componentName, rName), rInfo.addresses) }.toSeq try { writeResourceAllocationJson(componentName, allocations, tmpFile) diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala index 34ade4ce6f39b..eedf5e969e291 100644 --- 
a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClient.scala @@ -120,7 +120,7 @@ private[spark] class StandaloneAppClient( * * nthRetry means this is the nth attempt to register with master. */ - private def registerWithMaster(nthRetry: Int) { + private def registerWithMaster(nthRetry: Int): Unit = { registerMasterFutures.set(tryRegisterAllMasters()) registrationRetryTimer.set(registrationRetryThread.schedule(new Runnable { override def run(): Unit = { @@ -180,6 +180,8 @@ private[spark] class StandaloneAppClient( logInfo("Executor updated: %s is now %s%s".format(fullId, state, messageText)) if (ExecutorState.isFinished(state)) { listener.executorRemoved(fullId, message.getOrElse(""), exitStatus, workerLost) + } else if (state == ExecutorState.DECOMMISSIONED) { + listener.executorDecommissioned(fullId, message.getOrElse("")) } case WorkerRemoved(id, host, message) => @@ -246,14 +248,14 @@ private[spark] class StandaloneAppClient( /** * Notify the listener that we disconnected, if we hadn't already done so before. */ - def markDisconnected() { + def markDisconnected(): Unit = { if (!alreadyDisconnected) { listener.disconnected() alreadyDisconnected = true } } - def markDead(reason: String) { + def markDead(reason: String): Unit = { if (!alreadyDead.get) { listener.dead(reason) alreadyDead.set(true) @@ -271,12 +273,12 @@ private[spark] class StandaloneAppClient( } - def start() { + def start(): Unit = { // Just launch an rpcEndpoint; it will call back into the listener. 
endpoint.set(rpcEnv.setupEndpoint("AppClient", new ClientEndpoint(rpcEnv))) } - def stop() { + def stop(): Unit = { if (endpoint.get != null) { try { val timeout = RpcUtils.askRpcTimeout(conf) diff --git a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala index d8bc1a883def1..2e38a6847891d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala +++ b/core/src/main/scala/org/apache/spark/deploy/client/StandaloneAppClientListener.scala @@ -39,5 +39,7 @@ private[spark] trait StandaloneAppClientListener { def executorRemoved( fullId: String, message: String, exitStatus: Option[Int], workerLost: Boolean): Unit + def executorDecommissioned(fullId: String, message: String): Unit + def workerRemoved(workerId: String, host: String, message: String): Unit } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala index 8c63fa65b40fd..fb2a67c2ab103 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationCache.scala @@ -209,9 +209,8 @@ private[history] class ApplicationCache( /** * Register a filter for the web UI which checks for updates to the given app/attempt - * @param ui Spark UI to attach filters to - * @param appId application ID - * @param attemptId attempt ID + * @param key consisted of appId and attemptId + * @param loadedUI Spark UI to attach filters to */ private def registerFilter(key: CacheKey, loadedUI: LoadedAppUI): Unit = { require(loadedUI != null) @@ -231,7 +230,7 @@ private[history] class ApplicationCache( /** * An entry in the cache. 
* - * @param ui Spark UI + * @param loadedUI Spark UI * @param completed Flag to indicated that the application has completed (and so * does not need refreshing). */ diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index f1c06205bf04c..472b52957ed7f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -114,6 +114,12 @@ private[history] abstract class ApplicationHistoryProvider { */ def stop(): Unit = { } + /** + * Called when the server is starting up. Implement this function to init the provider and start + * background threads. With this function we can start provider later after it is created. + */ + def start(): Unit = { } + /** * Returns configuration data to be shown in the History Server home page. * diff --git a/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala b/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala new file mode 100644 index 0000000000000..b18bf2665d6ce --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/history/BasicEventFilterBuilder.scala @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import scala.collection.mutable + +import org.apache.spark.SparkContext +import org.apache.spark.deploy.history.EventFilter.FilterStatistics +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler._ +import org.apache.spark.storage.BlockManagerId + +/** + * This class tracks both live jobs and live executors, and pass the list to the + * [[BasicEventFilter]] to help BasicEventFilter to reject finished jobs (+ stages/tasks/RDDs) + * and dead executors. + */ +private[spark] class BasicEventFilterBuilder extends SparkListener with EventFilterBuilder { + private val liveJobToStages = new mutable.HashMap[Int, Set[Int]] + private val stageToTasks = new mutable.HashMap[Int, mutable.Set[Long]] + private val stageToRDDs = new mutable.HashMap[Int, Set[Int]] + private val _liveExecutors = new mutable.HashSet[String] + + private var totalJobs: Long = 0L + private var totalStages: Long = 0L + private var totalTasks: Long = 0L + + private[history] def liveJobs: Set[Int] = liveJobToStages.keySet.toSet + private[history] def liveStages: Set[Int] = stageToRDDs.keySet.toSet + private[history] def liveTasks: Set[Long] = stageToTasks.values.flatten.toSet + private[history] def liveRDDs: Set[Int] = stageToRDDs.values.flatten.toSet + private[history] def liveExecutors: Set[String] = _liveExecutors.toSet + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + totalJobs += 1 + totalStages += jobStart.stageIds.length + liveJobToStages += jobStart.jobId -> jobStart.stageIds.toSet + } + + 
override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { + val stages = liveJobToStages.getOrElse(jobEnd.jobId, Seq.empty[Int]) + liveJobToStages -= jobEnd.jobId + stageToTasks --= stages + stageToRDDs --= stages + } + + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + val stageId = stageSubmitted.stageInfo.stageId + stageToRDDs.put(stageId, stageSubmitted.stageInfo.rddInfos.map(_.id).toSet) + stageToTasks.getOrElseUpdate(stageId, new mutable.HashSet[Long]()) + } + + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + totalTasks += 1 + stageToTasks.get(taskStart.stageId).foreach { tasks => + tasks += taskStart.taskInfo.taskId + } + } + + override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = { + _liveExecutors += executorAdded.executorId + } + + override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = { + _liveExecutors -= executorRemoved.executorId + } + + override def createFilter(): EventFilter = { + val stats = FilterStatistics(totalJobs, liveJobs.size, totalStages, + liveStages.size, totalTasks, liveTasks.size) + + new BasicEventFilter(stats, liveJobs, liveStages, liveTasks, liveRDDs, liveExecutors) + } +} + +/** + * This class provides the functionality to reject events which are related to the finished + * jobs based on the given information. This class only deals with job related events, and provides + * a PartialFunction which returns false for rejected events for finished jobs, returns true + * otherwise. 
+ */ +private[spark] abstract class JobEventFilter( + stats: Option[FilterStatistics], + liveJobs: Set[Int], + liveStages: Set[Int], + liveTasks: Set[Long], + liveRDDs: Set[Int]) extends EventFilter with Logging { + + logDebug(s"jobs : $liveJobs") + logDebug(s"stages : $liveStages") + logDebug(s"tasks : $liveTasks") + logDebug(s"RDDs : $liveRDDs") + + override def statistics(): Option[FilterStatistics] = stats + + protected val acceptFnForJobEvents: PartialFunction[SparkListenerEvent, Boolean] = { + case e: SparkListenerStageCompleted => + liveStages.contains(e.stageInfo.stageId) + case e: SparkListenerStageSubmitted => + liveStages.contains(e.stageInfo.stageId) + case e: SparkListenerTaskStart => + liveTasks.contains(e.taskInfo.taskId) + case e: SparkListenerTaskGettingResult => + liveTasks.contains(e.taskInfo.taskId) + case e: SparkListenerTaskEnd => + liveTasks.contains(e.taskInfo.taskId) + case e: SparkListenerJobStart => + liveJobs.contains(e.jobId) + case e: SparkListenerJobEnd => + liveJobs.contains(e.jobId) + case e: SparkListenerUnpersistRDD => + liveRDDs.contains(e.rddId) + case e: SparkListenerExecutorMetricsUpdate => + e.accumUpdates.exists { case (taskId, stageId, _, _) => + liveTasks.contains(taskId) || liveStages.contains(stageId) + } + case e: SparkListenerSpeculativeTaskSubmitted => + liveStages.contains(e.stageId) + } +} + +/** + * This class rejects events which are related to the finished jobs or dead executors, + * based on the given information. The events which are not related to the job and executor + * will be considered as "Don't mind". 
+ */ +private[spark] class BasicEventFilter( + stats: FilterStatistics, + liveJobs: Set[Int], + liveStages: Set[Int], + liveTasks: Set[Long], + liveRDDs: Set[Int], + liveExecutors: Set[String]) + extends JobEventFilter( + Some(stats), + liveJobs, + liveStages, + liveTasks, + liveRDDs) with Logging { + + logDebug(s"live executors : $liveExecutors") + + private val _acceptFn: PartialFunction[SparkListenerEvent, Boolean] = { + case e: SparkListenerExecutorAdded => liveExecutors.contains(e.executorId) + case e: SparkListenerExecutorRemoved => liveExecutors.contains(e.executorId) + case e: SparkListenerExecutorBlacklisted => liveExecutors.contains(e.executorId) + case e: SparkListenerExecutorUnblacklisted => liveExecutors.contains(e.executorId) + case e: SparkListenerStageExecutorMetrics => liveExecutors.contains(e.execId) + case e: SparkListenerBlockManagerAdded => acceptBlockManagerEvent(e.blockManagerId) + case e: SparkListenerBlockManagerRemoved => acceptBlockManagerEvent(e.blockManagerId) + case e: SparkListenerBlockUpdated => acceptBlockManagerEvent(e.blockUpdatedInfo.blockManagerId) + } + + private def acceptBlockManagerEvent(blockManagerId: BlockManagerId): Boolean = { + blockManagerId.isDriver || liveExecutors.contains(blockManagerId.executorId) + } + + override def acceptFn(): PartialFunction[SparkListenerEvent, Boolean] = { + _acceptFn.orElse(acceptFnForJobEvents) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala new file mode 100644 index 0000000000000..a5f2394960b70 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventFilter.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import scala.io.{Codec, Source} +import scala.util.control.NonFatal + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.json4s.jackson.JsonMethods.parse + +import org.apache.spark.deploy.history.EventFilter.FilterStatistics +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler._ +import org.apache.spark.util.{JsonProtocol, Utils} + +/** + * EventFilterBuilder provides the interface to gather the information from events being received + * by [[SparkListenerInterface]], and create a new [[EventFilter]] instance which leverages + * information gathered to decide whether the event should be accepted or not. + */ +private[spark] trait EventFilterBuilder extends SparkListenerInterface { + def createFilter(): EventFilter +} + +/** [[EventFilter]] decides whether the given event should be accepted or rejected. */ +private[spark] trait EventFilter { + /** + * Provide statistic information of event filter, which would be used for measuring the score + * of compaction. + * + * To simplify the condition, currently the fields of statistic are static, since major kinds of + * events compaction would filter out are job related event types. If the filter doesn't track + * with job related events, return None instead. 
+ */ + def statistics(): Option[FilterStatistics] + + /** + * Classify whether the event is accepted or rejected by this filter. + * + * The method should return the partial function which matches the events where the filter can + * decide whether the event should be accepted or rejected. Otherwise it should leave the events + * be unmatched. + */ + def acceptFn(): PartialFunction[SparkListenerEvent, Boolean] +} + +private[spark] object EventFilter extends Logging { + case class FilterStatistics( + totalJobs: Long, + liveJobs: Long, + totalStages: Long, + liveStages: Long, + totalTasks: Long, + liveTasks: Long) + + def applyFilterToFile( + fs: FileSystem, + filters: Seq[EventFilter], + path: Path, + onAccepted: (String, SparkListenerEvent) => Unit, + onRejected: (String, SparkListenerEvent) => Unit, + onUnidentified: String => Unit): Unit = { + Utils.tryWithResource(EventLogFileReader.openEventLog(path, fs)) { in => + val lines = Source.fromInputStream(in)(Codec.UTF8).getLines() + + lines.zipWithIndex.foreach { case (line, lineNum) => + try { + val event = try { + Some(JsonProtocol.sparkEventFromJson(parse(line))) + } catch { + // ignore any exception occurred from unidentified json + case NonFatal(_) => + onUnidentified(line) + None + } + + event.foreach { e => + val results = filters.flatMap(_.acceptFn().lift.apply(e)) + if (results.nonEmpty && results.forall(_ == false)) { + onRejected(line, e) + } else { + onAccepted(line, e) + } + } + } catch { + case e: Exception => + logError(s"Exception parsing Spark event log: ${path.getName}", e) + logError(s"Malformed line #$lineNum: $line\n") + throw e + } + } + } + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala new file mode 100644 index 0000000000000..8558f765175fc --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileCompactor.scala @@ -0,0 +1,225 @@ +/* + 
* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import java.io.IOException +import java.net.URI +import java.util.ServiceLoader + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.history.EventFilter.FilterStatistics +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.ReplayListenerBus +import org.apache.spark.util.Utils + +/** + * This class compacts the old event log files into one compact file, via two phases reading: + * + * 1) Initialize available [[EventFilterBuilder]] instances, and replay the old event log files with + * builders, so that these builders can gather the information to create [[EventFilter]] instances. + * 2) Initialize [[EventFilter]] instances from [[EventFilterBuilder]] instances, and replay the + * old event log files with filters. Rewrite the events to the compact file which the filters decide + * to accept. 
+ * + * This class will calculate the score based on statistic from [[EventFilter]] instances, which + * represents approximate rate of filtered-out events. Score is being calculated via applying + * heuristic; task events tend to take most size in event log. + */ +class EventLogFileCompactor( + sparkConf: SparkConf, + hadoopConf: Configuration, + fs: FileSystem, + maxFilesToRetain: Int, + compactionThresholdScore: Double) extends Logging { + + require(maxFilesToRetain > 0, "Max event log files to retain should be higher than 0.") + + /** + * Compacts the old event log files into one compact file, and clean old event log files being + * compacted away. + * + * This method assumes caller will provide the sorted list of files which are sorted by + * the index of event log file, with at most one compact file placed first if it exists. + * + * When compacting the files, the range of compaction for given file list is determined as: + * (first ~ the file where there're `maxFilesToRetain` files on the right side) + * + * This method skips compaction for some circumstances described below: + * - not enough files on the range of compaction + * - score is lower than the threshold of compaction (meaning compaction won't help much) + * + * If this method returns the compaction result as SUCCESS, caller needs to re-read the list + * of event log files, as new compact file is available as well as old event log files are + * removed. 
+ */ + def compact(eventLogFiles: Seq[FileStatus]): CompactionResult = { + assertPrecondition(eventLogFiles) + + if (eventLogFiles.length < maxFilesToRetain) { + return CompactionResult(CompactionResultCode.NOT_ENOUGH_FILES, None) + } + + val filesToCompact = findFilesToCompact(eventLogFiles) + if (filesToCompact.isEmpty) { + CompactionResult(CompactionResultCode.NOT_ENOUGH_FILES, None) + } else { + val builders = initializeBuilders(fs, filesToCompact.map(_.getPath)) + + val filters = builders.map(_.createFilter()) + val minScore = filters.flatMap(_.statistics()).map(calculateScore).min + + if (minScore < compactionThresholdScore) { + CompactionResult(CompactionResultCode.LOW_SCORE_FOR_COMPACTION, None) + } else { + rewrite(filters, filesToCompact) + cleanupCompactedFiles(filesToCompact) + CompactionResult(CompactionResultCode.SUCCESS, Some( + RollingEventLogFilesWriter.getEventLogFileIndex(filesToCompact.last.getPath.getName))) + } + } + } + + private def assertPrecondition(eventLogFiles: Seq[FileStatus]): Unit = { + val idxCompactedFiles = eventLogFiles.zipWithIndex.filter { case (file, _) => + EventLogFileWriter.isCompacted(file.getPath) + } + require(idxCompactedFiles.size < 2 && idxCompactedFiles.headOption.forall(_._2 == 0), + "The number of compact files should be at most 1, and should be placed first if exists.") + } + + /** + * Loads all available EventFilterBuilders in classloader via ServiceLoader, and initializes + * them via replaying events in given files. 
+ */ + private def initializeBuilders(fs: FileSystem, files: Seq[Path]): Seq[EventFilterBuilder] = { + val bus = new ReplayListenerBus() + + val builders = ServiceLoader.load(classOf[EventFilterBuilder], + Utils.getContextOrSparkClassLoader).asScala.toSeq + builders.foreach(bus.addListener) + + files.foreach { log => + Utils.tryWithResource(EventLogFileReader.openEventLog(log, fs)) { in => + bus.replay(in, log.getName) + } + } + + builders + } + + private def calculateScore(stats: FilterStatistics): Double = { + // For now it's simply measuring how many task events will be filtered out (rejected) + // but it can be sophisticated later once we get more heuristic information and found + // the case where this simple calculation doesn't work. + (stats.totalTasks - stats.liveTasks) * 1.0 / stats.totalTasks + } + + /** + * This method rewrites the event log files into one compact file: the compact file will only + * contain the events which pass the filters. Events will be dropped only when all filters + * decide to reject the event or don't mind about the event. Otherwise, the original line for + * the event is written to the compact file as it is. 
+ */ + private[history] def rewrite( + filters: Seq[EventFilter], + eventLogFiles: Seq[FileStatus]): String = { + require(eventLogFiles.nonEmpty) + + val lastIndexEventLogPath = eventLogFiles.last.getPath + val logWriter = new CompactedEventLogFileWriter(lastIndexEventLogPath, "dummy", None, + lastIndexEventLogPath.getParent.toUri, sparkConf, hadoopConf) + + logWriter.start() + eventLogFiles.foreach { file => + EventFilter.applyFilterToFile(fs, filters, file.getPath, + onAccepted = (line, _) => logWriter.writeEvent(line, flushLogger = true), + onRejected = (_, _) => {}, + onUnidentified = line => logWriter.writeEvent(line, flushLogger = true) + ) + } + logWriter.stop() + + logWriter.logPath + } + + private def cleanupCompactedFiles(files: Seq[FileStatus]): Unit = { + files.foreach { file => + var deleted = false + try { + deleted = fs.delete(file.getPath, true) + } catch { + case _: IOException => + } + if (!deleted) { + logWarning(s"Failed to remove ${file.getPath} / skip removing.") + } + } + } + + private def findFilesToCompact(eventLogFiles: Seq[FileStatus]): Seq[FileStatus] = { + val numNormalEventLogFiles = { + if (EventLogFileWriter.isCompacted(eventLogFiles.head.getPath)) { + eventLogFiles.length - 1 + } else { + eventLogFiles.length + } + } + + // This avoids compacting only compact file. + if (numNormalEventLogFiles > maxFilesToRetain) { + eventLogFiles.dropRight(maxFilesToRetain) + } else { + Seq.empty + } + } +} + +/** + * Describes the result of compaction. + * + * @param code The result of compaction. + * @param compactIndex The index of compact file if the compaction is successful. + * Otherwise it will be None. 
+ */ +case class CompactionResult(code: CompactionResultCode.Value, compactIndex: Option[Long]) + +object CompactionResultCode extends Enumeration { + val SUCCESS, NOT_ENOUGH_FILES, LOW_SCORE_FOR_COMPACTION = Value +} + +/** + * This class helps to write compact file; to avoid reimplementing everything, it extends + * [[SingleEventLogFileWriter]], but only `originalFilePath` is used to determine the + * path of compact file. + */ +private class CompactedEventLogFileWriter( + originalFilePath: Path, + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration) + extends SingleEventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) { + + override val logPath: String = originalFilePath.toUri.toString + EventLogFileWriter.COMPACTED +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala new file mode 100644 index 0000000000000..9f63a6441a838 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileReaders.scala @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy.history + +import java.io.{BufferedInputStream, InputStream} +import java.util.concurrent.ConcurrentHashMap +import java.util.zip.{ZipEntry, ZipOutputStream} + +import com.google.common.io.ByteStreams +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.apache.hadoop.hdfs.DFSInputStream + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.history.EventLogFileWriter.codecName +import org.apache.spark.io.CompressionCodec +import org.apache.spark.util.Utils + +/** The base class of reader which will read the information of event log file(s). */ +abstract class EventLogFileReader( + protected val fileSystem: FileSystem, + val rootPath: Path) { + + protected def fileSizeForDFS(path: Path): Option[Long] = { + Utils.tryWithResource(fileSystem.open(path)) { in => + in.getWrappedStream match { + case dfsIn: DFSInputStream => Some(dfsIn.getFileLength) + case _ => None + } + } + } + + protected def addFileAsZipEntry( + zipStream: ZipOutputStream, + path: Path, + entryName: String): Unit = { + Utils.tryWithResource(fileSystem.open(path, 1 * 1024 * 1024)) { inputStream => + zipStream.putNextEntry(new ZipEntry(entryName)) + ByteStreams.copy(inputStream, zipStream) + zipStream.closeEntry() + } + } + + /** Returns the last index of event log files. None for single event log file. */ + def lastIndex: Option[Long] + + /** + * Returns the size of file for the last index of event log files. Returns its size for + * single event log file. + */ + def fileSizeForLastIndex: Long + + /** Returns whether the application is completed. */ + def completed: Boolean + + /** + * Returns the size of file for the last index (itself for single event log file) of event log + * files, only when underlying input stream is DFSInputStream. Otherwise returns None. 
+ */ + def fileSizeForLastIndexForDFS: Option[Long] + + /** + * Returns the modification time for the last index (itself for single event log file) + * of event log files. + */ + def modificationTime: Long + + /** + * This method compresses the files passed in, and writes the compressed data out into the + * ZipOutputStream passed in. Each file is written as a new ZipEntry with its name being + * the name of the file being compressed. + */ + def zipEventLogFiles(zipStream: ZipOutputStream): Unit + + /** Returns all available event log files. */ + def listEventLogFiles: Seq[FileStatus] + + /** Returns the short compression name if being used. None if it's uncompressed. */ + def compressionCodec: Option[String] + + /** Returns the size of all event log files. */ + def totalSize: Long +} + +object EventLogFileReader { + // A cache for compression codecs to avoid creating the same codec many times + private val codecMap = new ConcurrentHashMap[String, CompressionCodec]() + + def apply( + fs: FileSystem, + path: Path, + lastIndex: Option[Long]): EventLogFileReader = { + lastIndex match { + case Some(_) => new RollingEventLogFilesFileReader(fs, path) + case None => new SingleFileEventLogFileReader(fs, path) + } + } + + def apply(fs: FileSystem, path: Path): Option[EventLogFileReader] = { + apply(fs, fs.getFileStatus(path)) + } + + def apply(fs: FileSystem, status: FileStatus): Option[EventLogFileReader] = { + if (isSingleEventLog(status)) { + Some(new SingleFileEventLogFileReader(fs, status.getPath)) + } else if (isRollingEventLogs(status)) { + Some(new RollingEventLogFilesFileReader(fs, status.getPath)) + } else { + None + } + } + + /** + * Opens an event log file and returns an input stream that contains the event data. + * + * @return input stream that holds one JSON record per line. 
+ */ + def openEventLog(log: Path, fs: FileSystem): InputStream = { + val in = new BufferedInputStream(fs.open(log)) + try { + val codec = codecName(log).map { c => + codecMap.computeIfAbsent(c, CompressionCodec.createCodec(new SparkConf, _)) + } + codec.map(_.compressedContinuousInputStream(in)).getOrElse(in) + } catch { + case e: Throwable => + in.close() + throw e + } + } + + private def isSingleEventLog(status: FileStatus): Boolean = { + !status.isDirectory && + // FsHistoryProvider used to generate a hidden file which can't be read. Accidentally + // reading a garbage file is safe, but we would log an error which can be scary to + // the end-user. + !status.getPath.getName.startsWith(".") + } + + private def isRollingEventLogs(status: FileStatus): Boolean = { + RollingEventLogFilesWriter.isEventLogDir(status) + } +} + +/** + * The reader which will read the information of single event log file. + * + * This reader gets the status of event log file only once when required; + * It may not give "live" status of file that could be changing concurrently, and + * FileNotFoundException could occur if the log file is renamed before getting the + * status of log file. 
+ */ +class SingleFileEventLogFileReader( + fs: FileSystem, + path: Path) extends EventLogFileReader(fs, path) { + private lazy val status = fileSystem.getFileStatus(rootPath) + + override def lastIndex: Option[Long] = None + + override def fileSizeForLastIndex: Long = status.getLen + + override def completed: Boolean = !rootPath.getName.stripSuffix(EventLogFileWriter.COMPACTED) + .endsWith(EventLogFileWriter.IN_PROGRESS) + + override def fileSizeForLastIndexForDFS: Option[Long] = { + if (completed) { + Some(fileSizeForLastIndex) + } else { + fileSizeForDFS(rootPath) + } + } + + override def modificationTime: Long = status.getModificationTime + + override def zipEventLogFiles(zipStream: ZipOutputStream): Unit = { + addFileAsZipEntry(zipStream, rootPath, rootPath.getName) + } + + override def listEventLogFiles: Seq[FileStatus] = Seq(status) + + override def compressionCodec: Option[String] = EventLogFileWriter.codecName(rootPath) + + override def totalSize: Long = fileSizeForLastIndex +} + +/** + * The reader which will read the information of rolled multiple event log files. + * + * This reader lists the files only once; if caller would like to play with updated list, + * it needs to create another reader instance. 
+ */ +class RollingEventLogFilesFileReader( + fs: FileSystem, + path: Path) extends EventLogFileReader(fs, path) { + import RollingEventLogFilesWriter._ + + private lazy val files: Seq[FileStatus] = { + val ret = fs.listStatus(rootPath).toSeq + require(ret.exists(isEventLogFile), "Log directory must contain at least one event log file!") + require(ret.exists(isAppStatusFile), "Log directory must contain an appstatus file!") + ret + } + + private lazy val appStatusFile = files.find(isAppStatusFile).get + + private lazy val eventLogFiles: Seq[FileStatus] = { + val eventLogFiles = files.filter(isEventLogFile).sortBy { status => + val filePath = status.getPath + var idx = getEventLogFileIndex(filePath.getName).toDouble + // trick to place compacted file later than normal file if index is same. + if (EventLogFileWriter.isCompacted(filePath)) { + idx += 0.1 + } + idx + } + val filesToRead = dropBeforeLastCompactFile(eventLogFiles) + val indices = filesToRead.map { file => getEventLogFileIndex(file.getPath.getName) } + require((indices.head to indices.last) == indices, "Found missing event log file, expected" + + s" indices: ${indices.head to indices.last}, actual: ${indices}") + filesToRead + } + + override def lastIndex: Option[Long] = Some( + getEventLogFileIndex(lastEventLogFile.getPath.getName)) + + override def fileSizeForLastIndex: Long = lastEventLogFile.getLen + + override def completed: Boolean = { + !appStatusFile.getPath.getName.endsWith(EventLogFileWriter.IN_PROGRESS) + } + + override def fileSizeForLastIndexForDFS: Option[Long] = { + if (completed) { + Some(fileSizeForLastIndex) + } else { + fileSizeForDFS(lastEventLogFile.getPath) + } + } + + override def modificationTime: Long = lastEventLogFile.getModificationTime + + override def zipEventLogFiles(zipStream: ZipOutputStream): Unit = { + val dirEntryName = rootPath.getName + "/" + zipStream.putNextEntry(new ZipEntry(dirEntryName)) + files.foreach { file => + addFileAsZipEntry(zipStream, file.getPath, 
dirEntryName + file.getPath.getName) + } + } + + override def listEventLogFiles: Seq[FileStatus] = eventLogFiles + + override def compressionCodec: Option[String] = { + EventLogFileWriter.codecName(eventLogFiles.head.getPath) + } + + override def totalSize: Long = eventLogFiles.map(_.getLen).sum + + private def lastEventLogFile: FileStatus = eventLogFiles.last + + private def dropBeforeLastCompactFile(eventLogFiles: Seq[FileStatus]): Seq[FileStatus] = { + val lastCompactedFileIdx = eventLogFiles.lastIndexWhere { fs => + EventLogFileWriter.isCompacted(fs.getPath) + } + eventLogFiles.drop(lastCompactedFileIdx) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala new file mode 100644 index 0000000000000..1d58d054b7825 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/deploy/history/EventLogFileWriters.scala @@ -0,0 +1,423 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy.history + +import java.io._ +import java.net.URI +import java.nio.charset.StandardCharsets + +import org.apache.commons.compress.utils.CountingOutputStream +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataOutputStream, Path} +import org.apache.hadoop.fs.permission.FsPermission + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ +import org.apache.spark.io.CompressionCodec +import org.apache.spark.util.Utils + +/** + * The base class of writer which will write event logs into file. + * + * The following configurable parameters are available to tune the behavior of writing: + * spark.eventLog.compress - Whether to compress logged events + * spark.eventLog.compression.codec - The codec to compress logged events + * spark.eventLog.overwrite - Whether to overwrite any existing files + * spark.eventLog.buffer.kb - Buffer size to use when writing to output streams + * + * Note that descendant classes can maintain its own parameters: refer the javadoc of each class + * for more details. + * + * NOTE: CountingOutputStream being returned by "initLogFile" counts "non-compressed" bytes. 
+ */ +abstract class EventLogFileWriter( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration) extends Logging { + + protected val shouldCompress = sparkConf.get(EVENT_LOG_COMPRESS) + protected val shouldOverwrite = sparkConf.get(EVENT_LOG_OVERWRITE) + protected val outputBufferSize = sparkConf.get(EVENT_LOG_OUTPUT_BUFFER_SIZE).toInt + protected val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf) + protected val compressionCodec = + if (shouldCompress) { + Some(CompressionCodec.createCodec(sparkConf, sparkConf.get(EVENT_LOG_COMPRESSION_CODEC))) + } else { + None + } + + private[history] val compressionCodecName = compressionCodec.map { c => + CompressionCodec.getShortName(c.getClass.getName) + } + + // Only defined if the file system scheme is not local + protected var hadoopDataStream: Option[FSDataOutputStream] = None + protected var writer: Option[PrintWriter] = None + + protected def requireLogBaseDirAsDirectory(): Unit = { + if (!fileSystem.getFileStatus(new Path(logBaseDir)).isDirectory) { + throw new IllegalArgumentException(s"Log directory $logBaseDir is not a directory.") + } + } + + protected def initLogFile(path: Path)(fnSetupWriter: OutputStream => PrintWriter): Unit = { + if (shouldOverwrite && fileSystem.delete(path, true)) { + logWarning(s"Event log $path already exists. Overwriting...") + } + + val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme + val isDefaultLocal = defaultFs == null || defaultFs == "file" + val uri = path.toUri + + // The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844). + // Therefore, for local files, use FileOutputStream instead. 
+ val dstream = + if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == "file") { + new FileOutputStream(uri.getPath) + } else { + hadoopDataStream = Some( + SparkHadoopUtil.createFile(fileSystem, path, sparkConf.get(EVENT_LOG_ALLOW_EC))) + hadoopDataStream.get + } + + try { + val cstream = compressionCodec.map(_.compressedContinuousOutputStream(dstream)) + .getOrElse(dstream) + val bstream = new BufferedOutputStream(cstream, outputBufferSize) + fileSystem.setPermission(path, EventLogFileWriter.LOG_FILE_PERMISSIONS) + logInfo(s"Logging events to $path") + writer = Some(fnSetupWriter(bstream)) + } catch { + case e: Exception => + dstream.close() + throw e + } + } + + protected def writeLine(line: String, flushLogger: Boolean = false): Unit = { + // scalastyle:off println + writer.foreach(_.println(line)) + // scalastyle:on println + if (flushLogger) { + writer.foreach(_.flush()) + hadoopDataStream.foreach(_.hflush()) + } + } + + protected def closeWriter(): Unit = { + writer.foreach(_.close()) + } + + protected def renameFile(src: Path, dest: Path, overwrite: Boolean): Unit = { + if (fileSystem.exists(dest)) { + if (overwrite) { + logWarning(s"Event log $dest already exists. 
Overwriting...") + if (!fileSystem.delete(dest, true)) { + logWarning(s"Error deleting $dest") + } + } else { + throw new IOException(s"Target log file already exists ($dest)") + } + } + fileSystem.rename(src, dest) + // touch file to ensure modtime is current across those filesystems where rename() + // does not set it but support setTimes() instead; it's a no-op on most object stores + try { + fileSystem.setTimes(dest, System.currentTimeMillis(), -1) + } catch { + case e: Exception => logDebug(s"failed to set time of $dest", e) + } + } + + /** initialize writer for event logging */ + def start(): Unit + + /** writes JSON format of event to file */ + def writeEvent(eventJson: String, flushLogger: Boolean = false): Unit + + /** stops writer - indicating the application has been completed */ + def stop(): Unit + + /** returns representative path of log. for tests only. */ + def logPath: String +} + +object EventLogFileWriter { + // Suffix applied to the names of files still being written by applications. + val IN_PROGRESS = ".inprogress" + val COMPACTED = ".compact" + + val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) + + def apply( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter = { + if (sparkConf.get(EVENT_LOG_ENABLE_ROLLING)) { + new RollingEventLogFilesWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } else { + new SingleEventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } + } + + def nameForAppAndAttempt(appId: String, appAttemptId: Option[String]): String = { + val base = Utils.sanitizeDirName(appId) + if (appAttemptId.isDefined) { + base + "_" + Utils.sanitizeDirName(appAttemptId.get) + } else { + base + } + } + + def codecName(log: Path): Option[String] = { + // Compression codec is encoded as an extension, e.g. 
app_123.lzf + // Since we sanitize the app ID to not include periods, it is safe to split on it + val logName = log.getName.stripSuffix(COMPACTED).stripSuffix(IN_PROGRESS) + logName.split("\\.").tail.lastOption + } + + def isCompacted(log: Path): Boolean = log.getName.endsWith(COMPACTED) +} + +/** + * The writer to write event logs into single file. + */ +class SingleEventLogFileWriter( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration) + extends EventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) { + + override val logPath: String = SingleEventLogFileWriter.getLogPath(logBaseDir, appId, + appAttemptId, compressionCodecName) + + protected def inProgressPath = logPath + EventLogFileWriter.IN_PROGRESS + + override def start(): Unit = { + requireLogBaseDirAsDirectory() + + initLogFile(new Path(inProgressPath)) { os => + new PrintWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8)) + } + } + + override def writeEvent(eventJson: String, flushLogger: Boolean = false): Unit = { + writeLine(eventJson, flushLogger) + } + + /** + * Stop logging events. The event log file will be renamed so that it loses the + * ".inprogress" suffix. + */ + override def stop(): Unit = { + closeWriter() + renameFile(new Path(inProgressPath), new Path(logPath), shouldOverwrite) + } +} + +object SingleEventLogFileWriter { + /** + * Return a file-system-safe path to the log file for the given application. + * + * Note that because we currently only create a single log file for each application, + * we must encode all the information needed to parse this event log in the file name + * instead of within the file itself. Otherwise, if the file is compressed, for instance, + * we won't know which codec to use to decompress the metadata needed to open the file in + * the first place. + * + * The log file name will identify the compression codec used for the contents, if any. 
+ * For example, app_123 for an uncompressed log, app_123.lzf for an LZF-compressed log. + * + * @param logBaseDir Directory where the log file will be written. + * @param appId A unique app ID. + * @param appAttemptId A unique attempt id of appId. May be the empty string. + * @param compressionCodecName Name to identify the codec used to compress the contents + * of the log, or None if compression is not enabled. + * @return A path which consists of file-system-safe characters. + */ + def getLogPath( + logBaseDir: URI, + appId: String, + appAttemptId: Option[String], + compressionCodecName: Option[String] = None): String = { + val codec = compressionCodecName.map("." + _).getOrElse("") + new Path(logBaseDir).toString.stripSuffix("/") + "/" + + EventLogFileWriter.nameForAppAndAttempt(appId, appAttemptId) + codec + } +} + +/** + * The writer to write event logs into multiple log files, rolled over via configured size. + * + * The class creates one directory per application, and stores event log files as well as + * metadata files. The name of directory and files in the directory would follow: + * + * - The name of directory: eventlog_v2_appId(_[appAttemptId]) + * - The prefix of name on event files: events_[index]_[appId](_[appAttemptId])(.[codec]) + * - "index" would be monotonically increasing value (say, sequence) + * - The name of metadata (app. status) file name: appstatus_[appId](_[appAttemptId])(.inprogress) + * + * The writer will roll over the event log file when configured size is reached. Note that the + * writer doesn't check the size on file being open for write: the writer tracks the count of bytes + * written before compression is applied. + * + * For metadata files, the class will leverage zero-byte file, as it provides minimized cost. 
+ */ +class RollingEventLogFilesWriter( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration) + extends EventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) { + + import RollingEventLogFilesWriter._ + + private val eventFileMaxLength = sparkConf.get(EVENT_LOG_ROLLING_MAX_FILE_SIZE) + + private val logDirForAppPath = getAppEventLogDirPath(logBaseDir, appId, appAttemptId) + + private var countingOutputStream: Option[CountingOutputStream] = None + + // index and event log path will be updated soon in rollEventLogFile, which `start` will call + private var index: Long = 0L + private var currentEventLogFilePath: Path = _ + + override def start(): Unit = { + requireLogBaseDirAsDirectory() + + if (fileSystem.exists(logDirForAppPath) && shouldOverwrite) { + fileSystem.delete(logDirForAppPath, true) + } + + if (fileSystem.exists(logDirForAppPath)) { + throw new IOException(s"Target log directory already exists ($logDirForAppPath)") + } + + fileSystem.mkdirs(logDirForAppPath, EventLogFileWriter.LOG_FILE_PERMISSIONS) + createAppStatusFile(inProgress = true) + rollEventLogFile() + } + + override def writeEvent(eventJson: String, flushLogger: Boolean = false): Unit = { + writer.foreach { w => + val currentLen = countingOutputStream.get.getBytesWritten + if (currentLen + eventJson.length > eventFileMaxLength) { + rollEventLogFile() + } + } + + writeLine(eventJson, flushLogger) + } + + /** exposed for testing only */ + private[history] def rollEventLogFile(): Unit = { + closeWriter() + + index += 1 + currentEventLogFilePath = getEventLogFilePath(logDirForAppPath, appId, appAttemptId, index, + compressionCodecName) + + initLogFile(currentEventLogFilePath) { os => + countingOutputStream = Some(new CountingOutputStream(os)) + new PrintWriter( + new OutputStreamWriter(countingOutputStream.get, StandardCharsets.UTF_8)) + } + } + + override def stop(): Unit = { + closeWriter() + val 
appStatusPathIncomplete = getAppStatusFilePath(logDirForAppPath, appId, appAttemptId, + inProgress = true) + val appStatusPathComplete = getAppStatusFilePath(logDirForAppPath, appId, appAttemptId, + inProgress = false) + renameFile(appStatusPathIncomplete, appStatusPathComplete, overwrite = true) + } + + override def logPath: String = logDirForAppPath.toString + + private def createAppStatusFile(inProgress: Boolean): Unit = { + val appStatusPath = getAppStatusFilePath(logDirForAppPath, appId, appAttemptId, inProgress) + val outputStream = fileSystem.create(appStatusPath) + // we intentionally create zero-byte file to minimize the cost + outputStream.close() + } +} + +object RollingEventLogFilesWriter { + private[history] val EVENT_LOG_DIR_NAME_PREFIX = "eventlog_v2_" + private[history] val EVENT_LOG_FILE_NAME_PREFIX = "events_" + private[history] val APPSTATUS_FILE_NAME_PREFIX = "appstatus_" + + def getAppEventLogDirPath(logBaseDir: URI, appId: String, appAttemptId: Option[String]): Path = + new Path(new Path(logBaseDir), EVENT_LOG_DIR_NAME_PREFIX + + EventLogFileWriter.nameForAppAndAttempt(appId, appAttemptId)) + + def getAppStatusFilePath( + appLogDir: Path, + appId: String, + appAttemptId: Option[String], + inProgress: Boolean): Path = { + val base = APPSTATUS_FILE_NAME_PREFIX + + EventLogFileWriter.nameForAppAndAttempt(appId, appAttemptId) + val name = if (inProgress) base + EventLogFileWriter.IN_PROGRESS else base + new Path(appLogDir, name) + } + + def getEventLogFilePath( + appLogDir: Path, + appId: String, + appAttemptId: Option[String], + index: Long, + codecName: Option[String]): Path = { + val base = s"${EVENT_LOG_FILE_NAME_PREFIX}${index}_" + + EventLogFileWriter.nameForAppAndAttempt(appId, appAttemptId) + val codec = codecName.map("." 
+ _).getOrElse("") + new Path(appLogDir, base + codec) + } + + def isEventLogDir(status: FileStatus): Boolean = { + status.isDirectory && status.getPath.getName.startsWith(EVENT_LOG_DIR_NAME_PREFIX) + } + + def isEventLogFile(fileName: String): Boolean = { + fileName.startsWith(EVENT_LOG_FILE_NAME_PREFIX) + } + + def isEventLogFile(status: FileStatus): Boolean = { + status.isFile && isEventLogFile(status.getPath.getName) + } + + def isAppStatusFile(status: FileStatus): Boolean = { + status.isFile && status.getPath.getName.startsWith(APPSTATUS_FILE_NAME_PREFIX) + } + + def getEventLogFileIndex(eventLogFileName: String): Long = { + require(isEventLogFile(eventLogFileName), "Not an event log file!") + val index = eventLogFileName.stripPrefix(EVENT_LOG_FILE_NAME_PREFIX).split("_")(0) + index.toLong + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 5f9b18ce01279..99d3eceb1121a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -18,23 +18,21 @@ package org.apache.spark.deploy.history import java.io.{File, FileNotFoundException, IOException} +import java.lang.{Long => JLong} import java.nio.file.Files -import java.util.{Date, ServiceLoader} +import java.util.{Date, NoSuchElementException, ServiceLoader} import java.util.concurrent.{ConcurrentHashMap, ExecutorService, Future, TimeUnit} -import java.util.zip.{ZipEntry, ZipOutputStream} +import java.util.zip.ZipOutputStream import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.concurrent.ExecutionException import scala.io.Source -import scala.util.Try import scala.xml.Node import com.fasterxml.jackson.annotation.JsonIgnore -import com.google.common.io.ByteStreams -import com.google.common.util.concurrent.MoreExecutors +import 
com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} -import org.apache.hadoop.hdfs.{DFSInputStream, DistributedFileSystem} +import org.apache.hadoop.hdfs.DistributedFileSystem import org.apache.hadoop.hdfs.protocol.HdfsConstants import org.apache.hadoop.security.AccessControlException import org.fusesource.leveldbjni.internal.NativeDB @@ -47,7 +45,6 @@ import org.apache.spark.internal.config.History._ import org.apache.spark.internal.config.Status._ import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.internal.config.UI._ -import org.apache.spark.io.CompressionCodec import org.apache.spark.scheduler._ import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.status._ @@ -161,6 +158,29 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) new HistoryServerDiskManager(conf, path, listing, clock) } + private val fileCompactor = new EventLogFileCompactor(conf, hadoopConf, fs, + conf.get(EVENT_LOG_ROLLING_MAX_FILES_TO_RETAIN), conf.get(EVENT_LOG_COMPACTION_SCORE_THRESHOLD)) + + // Used to store the paths, which are being processed. This enable the replay log tasks execute + // asynchronously and make sure that checkForLogs would not process a path repeatedly. 
+ private val processing = ConcurrentHashMap.newKeySet[String] + + private def isProcessing(path: Path): Boolean = { + processing.contains(path.getName) + } + + private def isProcessing(info: LogInfo): Boolean = { + processing.contains(info.logPath.split("/").last) + } + + private def processing(path: Path): Unit = { + processing.add(path.getName) + } + + private def endProcessing(path: Path): Unit = { + processing.remove(path.getName) + } + private val blacklist = new ConcurrentHashMap[String, Long] // Visible for testing @@ -196,11 +216,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) if (!Utils.isTesting) { ThreadUtils.newDaemonFixedThreadPool(NUM_PROCESSING_THREADS, "log-replay-executor") } else { - MoreExecutors.sameThreadExecutor() + ThreadUtils.sameThreadExecutorService } } - val initThread = initialize() + var initThread: Thread = null private[history] def initialize(): Thread = { if (!isFsInSafeMode()) { @@ -353,10 +373,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val ui = SparkUI.create(None, new HistoryAppStatusStore(conf, kvstore), conf, secManager, app.info.name, HistoryServer.getAttemptURI(appId, attempt.info.attemptId), attempt.info.startTime.getTime(), attempt.info.appSparkVersion) - loadPlugins().foreach(_.setupUI(ui)) - val loadedUI = LoadedAppUI(ui) + // place the tab in UI based on the display order + loadPlugins().toSeq.sortBy(_.displayOrder).foreach(_.setupUI(ui)) + val loadedUI = LoadedAppUI(ui) synchronized { activeUIs((appId, attemptId)) = loadedUI } @@ -384,6 +405,10 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) Map("Event log directory" -> logDir.toString) ++ safeMode } + override def start(): Unit = { + initThread = initialize() + } + override def stop(): Unit = { try { if (initThread != null && initThread.isAlive()) { @@ -435,27 +460,27 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) logDebug(s"Scanning $logDir with 
lastScanTime==$lastScanTime") val updated = Option(fs.listStatus(new Path(logDir))).map(_.toSeq).getOrElse(Nil) - .filter { entry => - !entry.isDirectory() && - // FsHistoryProvider used to generate a hidden file which can't be read. Accidentally - // reading a garbage file is safe, but we would log an error which can be scary to - // the end-user. - !entry.getPath().getName().startsWith(".") && - !isBlacklisted(entry.getPath) - } - .filter { entry => + .filter { entry => !isBlacklisted(entry.getPath) } + .filter { entry => !isProcessing(entry.getPath) } + .flatMap { entry => EventLogFileReader(fs, entry) } + .filter { reader => try { - val info = listing.read(classOf[LogInfo], entry.getPath().toString()) + val info = listing.read(classOf[LogInfo], reader.rootPath.toString()) if (info.appId.isDefined) { // If the SHS view has a valid application, update the time the file was last seen so // that the entry is not deleted from the SHS listing. Also update the file size, in // case the code below decides we don't need to parse the log. - listing.write(info.copy(lastProcessed = newLastScanTime, fileSize = entry.getLen())) + listing.write(info.copy(lastProcessed = newLastScanTime, + fileSize = reader.fileSizeForLastIndex, + lastIndex = reader.lastIndex, + isComplete = reader.completed)) } - if (shouldReloadLog(info, entry)) { - if (info.appId.isDefined && fastInProgressParsing) { + if (shouldReloadLog(info, reader)) { + // ignore fastInProgressParsing when rolling event log is enabled on the log path, + // to ensure proceeding compaction even fastInProgressParsing is turned on. + if (info.appId.isDefined && reader.lastIndex.isEmpty && fastInProgressParsing) { // When fast in-progress parsing is on, we don't need to re-parse when the // size changes, but we do need to invalidate any existing UIs. 
// Also, we need to update the `lastUpdated time` to display the updated time in @@ -468,6 +493,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) attempt.info.copy(lastUpdated = new Date(newLastScanTime)), attempt.logPath, attempt.fileSize, + attempt.lastIndex, attempt.adminAcls, attempt.viewAcls, attempt.adminAclsGroups, @@ -493,56 +519,23 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // If the file is currently not being tracked by the SHS, add an entry for it and try // to parse it. This will allow the cleaner code to detect the file as stale later on // if it was not possible to parse it. - listing.write(LogInfo(entry.getPath().toString(), newLastScanTime, LogType.EventLogs, - None, None, entry.getLen())) - entry.getLen() > 0 + listing.write(LogInfo(reader.rootPath.toString(), newLastScanTime, LogType.EventLogs, + None, None, reader.fileSizeForLastIndex, reader.lastIndex, None, + reader.completed)) + reader.fileSizeForLastIndex > 0 } } .sortWith { case (entry1, entry2) => - entry1.getModificationTime() > entry2.getModificationTime() + entry1.modificationTime > entry2.modificationTime } if (updated.nonEmpty) { - logDebug(s"New/updated attempts found: ${updated.size} ${updated.map(_.getPath)}") + logDebug(s"New/updated attempts found: ${updated.size} ${updated.map(_.rootPath)}") } - val tasks = updated.flatMap { entry => - try { - val task: Future[Unit] = replayExecutor.submit( - () => mergeApplicationListing(entry, newLastScanTime, true)) - Some(task -> entry.getPath) - } catch { - // let the iteration over the updated entries break, since an exception on - // replayExecutor.submit (..) indicates the ExecutorService is unable - // to take any more submissions at this time - case e: Exception => - logError(s"Exception while submitting event log for replay", e) - None - } - } - - pendingReplayTasksCount.addAndGet(tasks.size) - - // Wait for all tasks to finish. 
This makes sure that checkForLogs - // is not scheduled again while some tasks are already running in - // the replayExecutor. - tasks.foreach { case (task, path) => - try { - task.get() - } catch { - case e: InterruptedException => - throw e - case e: ExecutionException if e.getCause.isInstanceOf[AccessControlException] => - // We don't have read permissions on the log file - logWarning(s"Unable to read log $path", e.getCause) - blacklist(path) - // SPARK-28157 We should remove this blacklisted entry from the KVStore - // to handle permission-only changes with the same file sizes later. - listing.delete(classOf[LogInfo], path.toString) - case e: Exception => - logError("Exception while merging application listings", e) - } finally { - pendingReplayTasksCount.decrementAndGet() + updated.foreach { entry => + submitLogProcessTask(entry.rootPath) { () => + mergeApplicationListing(entry, newLastScanTime, true) } } @@ -557,7 +550,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .last(newLastScanTime - 1) .asScala .toList - stale.foreach { log => + stale.filterNot(isProcessing).foreach { log => log.appId.foreach { appId => cleanAppData(appId, log.attemptId, log.logPath) listing.delete(classOf[LogInfo], log.logPath) @@ -570,22 +563,26 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - private[history] def shouldReloadLog(info: LogInfo, entry: FileStatus): Boolean = { - var result = info.fileSize < entry.getLen - if (!result && info.logPath.endsWith(EventLoggingListener.IN_PROGRESS)) { - try { - result = Utils.tryWithResource(fs.open(entry.getPath)) { in => - in.getWrappedStream match { - case dfsIn: DFSInputStream => info.fileSize < dfsIn.getFileLength - case _ => false - } + private[history] def shouldReloadLog(info: LogInfo, reader: EventLogFileReader): Boolean = { + if (info.isComplete != reader.completed) { + true + } else { + var result = if (info.lastIndex.isDefined) { + require(reader.lastIndex.isDefined) + 
info.lastIndex.get < reader.lastIndex.get || info.fileSize < reader.fileSizeForLastIndex + } else { + info.fileSize < reader.fileSizeForLastIndex + } + if (!result && !reader.completed) { + try { + result = reader.fileSizeForLastIndexForDFS.exists(info.fileSize < _) + } catch { + case e: Exception => + logDebug(s"Failed to check the length for the file : ${info.logPath}", e) } - } catch { - case e: Exception => - logDebug(s"Failed to check the length for the file : ${info.logPath}", e) } + result } - result } private def cleanAppData(appId: String, attemptId: Option[String], logPath: String): Unit = { @@ -632,23 +629,6 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) attemptId: Option[String], zipStream: ZipOutputStream): Unit = { - /** - * This method compresses the files passed in, and writes the compressed data out into the - * [[OutputStream]] passed in. Each file is written as a new [[ZipEntry]] with its name being - * the name of the file being compressed. 
- */ - def zipFileToStream(file: Path, entryName: String, outputStream: ZipOutputStream): Unit = { - val fs = file.getFileSystem(hadoopConf) - val inputStream = fs.open(file, 1 * 1024 * 1024) // 1MB Buffer - try { - outputStream.putNextEntry(new ZipEntry(entryName)) - ByteStreams.copy(inputStream, outputStream) - outputStream.closeEntry() - } finally { - inputStream.close() - } - } - val app = try { load(appId) } catch { @@ -661,22 +641,68 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) attemptId .map { id => app.attempts.filter(_.info.attemptId == Some(id)) } .getOrElse(app.attempts) - .map(_.logPath) - .foreach { log => - zipFileToStream(new Path(logDir, log), log, zipStream) + .foreach { attempt => + val reader = EventLogFileReader(fs, new Path(logDir, attempt.logPath), + attempt.lastIndex) + reader.zipEventLogFiles(zipStream) } } finally { zipStream.close() } } + private def mergeApplicationListing( + reader: EventLogFileReader, + scanTime: Long, + enableOptimizations: Boolean): Unit = { + val rootPath = reader.rootPath + var succeeded = false + try { + val lastEvaluatedForCompaction: Option[Long] = try { + listing.read(classOf[LogInfo], rootPath.toString).lastEvaluatedForCompaction + } catch { + case _: NoSuchElementException => None + } + + pendingReplayTasksCount.incrementAndGet() + doMergeApplicationListing(reader, scanTime, enableOptimizations, lastEvaluatedForCompaction) + if (conf.get(CLEANER_ENABLED)) { + checkAndCleanLog(rootPath.toString) + } + + succeeded = true + } catch { + case e: InterruptedException => + throw e + case e: AccessControlException => + // We don't have read permissions on the log file + logWarning(s"Unable to read log $rootPath", e) + blacklist(rootPath) + // SPARK-28157 We should remove this blacklisted entry from the KVStore + // to handle permission-only changes with the same file sizes later. 
+ listing.delete(classOf[LogInfo], rootPath.toString) + case e: Exception => + logError("Exception while merging application listings", e) + } finally { + endProcessing(rootPath) + pendingReplayTasksCount.decrementAndGet() + + // triggering another task for compaction only if it succeeds + if (succeeded) { + submitLogProcessTask(rootPath) { () => compact(reader) } + } + } + } + /** * Replay the given log file, saving the application in the listing db. + * Visible for testing */ - protected def mergeApplicationListing( - fileStatus: FileStatus, + private[history] def doMergeApplicationListing( + reader: EventLogFileReader, scanTime: Long, - enableOptimizations: Boolean): Unit = { + enableOptimizations: Boolean, + lastEvaluatedForCompaction: Option[Long]): Unit = { val eventsFilter: ReplayEventsFilter = { eventString => eventString.startsWith(APPL_START_EVENT_PREFIX) || eventString.startsWith(APPL_END_EVENT_PREFIX) || @@ -684,8 +710,8 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) eventString.startsWith(ENV_UPDATE_EVENT_PREFIX) } - val logPath = fileStatus.getPath() - val appCompleted = isCompleted(logPath.getName()) + val logPath = reader.rootPath + val appCompleted = reader.completed val reparseChunkSize = conf.get(END_EVENT_REPARSE_CHUNK_SIZE) // Enable halt support in listener if: @@ -695,13 +721,12 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) ((!appCompleted && fastInProgressParsing) || reparseChunkSize > 0) val bus = new ReplayListenerBus() - val listener = new AppListingListener(fileStatus, clock, shouldHalt) + val listener = new AppListingListener(reader, clock, shouldHalt) bus.addListener(listener) logInfo(s"Parsing $logPath for listing data...") - Utils.tryWithResource(EventLoggingListener.openEventLog(logPath, fs)) { in => - bus.replay(in, logPath.toString, !appCompleted, eventsFilter) - } + val logFiles = reader.listEventLogFiles + parseAppEventLogs(logFiles, bus, !appCompleted, eventsFilter) 
// If enabled above, the listing listener will halt parsing when there's enough information to // create a listing entry. When the app is completed, or fast parsing is disabled, we still need @@ -723,8 +748,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // current position is, since the replay listener bus buffers data internally. val lookForEndEvent = shouldHalt && (appCompleted || !fastInProgressParsing) if (lookForEndEvent && listener.applicationInfo.isDefined) { - Utils.tryWithResource(EventLoggingListener.openEventLog(logPath, fs)) { in => - val target = fileStatus.getLen() - reparseChunkSize + val lastFile = logFiles.last + Utils.tryWithResource(EventLogFileReader.openEventLog(lastFile.getPath, fs)) { in => + val target = lastFile.getLen - reparseChunkSize if (target > 0) { logInfo(s"Looking for end event; skipping $target bytes from $logPath...") var skipped = 0L @@ -741,7 +767,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) source.next() } - bus.replay(source, logPath.toString, !appCompleted, eventsFilter) + bus.replay(source, lastFile.getPath.toString, !appCompleted, eventsFilter) } } @@ -754,12 +780,15 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) invalidateUI(app.info.id, app.attempts.head.info.attemptId) addListing(app) listing.write(LogInfo(logPath.toString(), scanTime, LogType.EventLogs, Some(app.info.id), - app.attempts.head.info.attemptId, fileStatus.getLen())) + app.attempts.head.info.attemptId, reader.fileSizeForLastIndex, reader.lastIndex, + lastEvaluatedForCompaction, reader.completed)) // For a finished log, remove the corresponding "in progress" entry from the listing DB if // the file is really gone. - if (appCompleted) { - val inProgressLog = logPath.toString() + EventLoggingListener.IN_PROGRESS + // The logic is only valid for single event log, as root path doesn't change for + // rolled event logs. 
+ if (appCompleted && reader.lastIndex.isEmpty) { + val inProgressLog = logPath.toString() + EventLogFileWriter.IN_PROGRESS try { // Fetch the entry first to avoid an RPC when it's already removed. listing.read(classOf[LogInfo], inProgressLog) @@ -776,14 +805,49 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // mean the end event is before the configured threshold, so call the method again to // re-parse the whole log. logInfo(s"Reparsing $logPath since end event was not found.") - mergeApplicationListing(fileStatus, scanTime, false) + doMergeApplicationListing(reader, scanTime, enableOptimizations = false, + lastEvaluatedForCompaction) case _ => // If the app hasn't written down its app ID to the logs, still record the entry in the // listing db, with an empty ID. This will make the log eligible for deletion if the app // does not make progress after the configured max log age. listing.write( - LogInfo(logPath.toString(), scanTime, LogType.EventLogs, None, None, fileStatus.getLen())) + LogInfo(logPath.toString(), scanTime, LogType.EventLogs, None, None, + reader.fileSizeForLastIndex, reader.lastIndex, lastEvaluatedForCompaction, + reader.completed)) + } + } + + private def compact(reader: EventLogFileReader): Unit = { + val rootPath = reader.rootPath + try { + reader.lastIndex match { + case Some(lastIndex) => + try { + val info = listing.read(classOf[LogInfo], reader.rootPath.toString) + if (info.lastEvaluatedForCompaction.isEmpty || + info.lastEvaluatedForCompaction.get < lastIndex) { + // haven't tried compaction for this index, do compaction + fileCompactor.compact(reader.listEventLogFiles) + listing.write(info.copy(lastEvaluatedForCompaction = Some(lastIndex))) + } + } catch { + case _: NoSuchElementException => + // this should exist, but ignoring doesn't hurt much + } + + case None => // This is not applied to single event log file. 
+ } + } catch { + case e: InterruptedException => + throw e + case e: AccessControlException => + logWarning(s"Insufficient permission while compacting log for $rootPath", e) + case e: Exception => + logError(s"Exception while compacting log for $rootPath", e) + } finally { + endProcessing(rootPath) } } @@ -800,6 +864,30 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } + /** + * Check and delete specified event log according to the max log age defined by the user. + */ + private[history] def checkAndCleanLog(logPath: String): Unit = Utils.tryLog { + val maxTime = clock.getTimeMillis() - conf.get(MAX_LOG_AGE_S) * 1000 + val log = listing.read(classOf[LogInfo], logPath) + + if (log.lastProcessed <= maxTime && log.appId.isEmpty) { + logInfo(s"Deleting invalid / corrupt event log ${log.logPath}") + deleteLog(fs, new Path(log.logPath)) + listing.delete(classOf[LogInfo], log.logPath) + } + + log.appId.foreach { appId => + val app = listing.read(classOf[ApplicationInfoWrapper], appId) + if (app.oldestAttempt() <= maxTime) { + val (remaining, toDelete) = app.attempts.partition { attempt => + attempt.info.lastUpdated.getTime() >= maxTime + } + deleteAttemptLogs(app, remaining, toDelete) + } + } + } + /** * Delete event logs from the log directory according to the clean policy defined by the user. 
*/ @@ -829,7 +917,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .asScala .filter { l => l.logType == null || l.logType == LogType.EventLogs } .toList - stale.foreach { log => + stale.filterNot(isProcessing).foreach { log => if (log.appId.isEmpty) { logInfo(s"Deleting invalid / corrupt event log ${log.logPath}") deleteLog(fs, new Path(log.logPath)) @@ -918,7 +1006,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) case e: NoSuchElementException => // For every new driver log file discovered, create a new entry in listing listing.write(LogInfo(f.getPath().toString(), currentTime, LogType.DriverLogs, None, - None, f.getLen())) + None, f.getLen(), None, None, false)) false } if (deleteFile) { @@ -937,7 +1025,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) .asScala .filter { l => l.logType != null && l.logType == LogType.DriverLogs } .toList - stale.foreach { log => + stale.filterNot(isProcessing).foreach { log => logInfo(s"Deleting invalid driver log ${log.logPath}") listing.delete(classOf[LogInfo], log.logPath) deleteLog(driverLogFs, new Path(log.logPath)) @@ -945,11 +1033,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } /** - * Rebuilds the application state store from its event log. + * Rebuilds the application state store from its event log. Exposed for testing. */ - private def rebuildAppStore( + private[spark] def rebuildAppStore( store: KVStore, - eventLog: FileStatus, + reader: EventLogFileReader, lastUpdated: Long): Unit = { // Disable async updates, since they cause higher memory usage, and it's ok to take longer // to parse the event logs in the SHS. 
@@ -966,13 +1054,11 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } replayBus.addListener(listener) try { - val path = eventLog.getPath() - logInfo(s"Parsing $path to re-build UI...") - Utils.tryWithResource(EventLoggingListener.openEventLog(path, fs)) { in => - replayBus.replay(in, path.toString(), maybeTruncated = !isCompleted(path.toString())) - } + val eventLogFiles = reader.listEventLogFiles + logInfo(s"Parsing ${reader.rootPath} to re-build UI...") + parseAppEventLogs(eventLogFiles, replayBus, !reader.completed) trackingStore.close(false) - logInfo(s"Finished parsing $path") + logInfo(s"Finished parsing ${reader.rootPath}") } catch { case e: Exception => Utils.tryLogNonFatalError { @@ -982,6 +1068,23 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } + private def parseAppEventLogs( + logFiles: Seq[FileStatus], + replayBus: ReplayListenerBus, + maybeTruncated: Boolean, + eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { + // stop replaying next log files if ReplayListenerBus indicates some error or halt + var continueReplay = true + logFiles.foreach { file => + if (continueReplay) { + Utils.tryWithResource(EventLogFileReader.openEventLog(file.getPath, fs)) { in => + continueReplay = replayBus.replay(in, file.getPath.toString, + maybeTruncated = maybeTruncated, eventsFilter = eventsFilter) + } + } + } + } + /** * Checks whether HDFS is in safe mode. * @@ -1063,30 +1166,60 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) // At this point the disk data either does not exist or was deleted because it failed to // load, so the event log needs to be replayed. 
- val status = fs.getFileStatus(new Path(logDir, attempt.logPath)) - val isCompressed = EventLoggingListener.codecName(status.getPath()).flatMap { name => - Try(CompressionCodec.getShortName(name)).toOption - }.isDefined - logInfo(s"Leasing disk manager space for app $appId / ${attempt.info.attemptId}...") - val lease = dm.lease(status.getLen(), isCompressed) - val newStorePath = try { - Utils.tryWithResource(KVUtils.open(lease.tmpPath, metadata)) { store => - rebuildAppStore(store, status, attempt.info.lastUpdated.getTime()) + + var retried = false + var newStorePath: File = null + while (newStorePath == null) { + val reader = EventLogFileReader(fs, new Path(logDir, attempt.logPath), + attempt.lastIndex) + val isCompressed = reader.compressionCodec.isDefined + logInfo(s"Leasing disk manager space for app $appId / ${attempt.info.attemptId}...") + val lease = dm.lease(reader.totalSize, isCompressed) + try { + Utils.tryWithResource(KVUtils.open(lease.tmpPath, metadata)) { store => + rebuildAppStore(store, reader, attempt.info.lastUpdated.getTime()) + } + newStorePath = lease.commit(appId, attempt.info.attemptId) + } catch { + case _: IOException if !retried => + // compaction may touch the file(s) which app rebuild wants to read + // compaction wouldn't run in short interval, so try again... 
+ logWarning(s"Exception occurred while rebuilding app $appId - trying again...") + lease.rollback() + retried = true + + case e: Exception => + lease.rollback() + throw e } - lease.commit(appId, attempt.info.attemptId) - } catch { - case e: Exception => - lease.rollback() - throw e } KVUtils.open(newStorePath, metadata) } private def createInMemoryStore(attempt: AttemptInfoWrapper): KVStore = { - val store = new InMemoryStore() - val status = fs.getFileStatus(new Path(logDir, attempt.logPath)) - rebuildAppStore(store, status, attempt.info.lastUpdated.getTime()) + var retried = false + var store: KVStore = null + while (store == null) { + try { + val s = new InMemoryStore() + val reader = EventLogFileReader(fs, new Path(logDir, attempt.logPath), + attempt.lastIndex) + rebuildAppStore(s, reader, attempt.info.lastUpdated.getTime()) + store = s + } catch { + case _: IOException if !retried => + // compaction may touch the file(s) which app rebuild wants to read + // compaction wouldn't run in short interval, so try again... + logWarning(s"Exception occurred while rebuilding log path ${attempt.logPath} - " + + "trying again...") + retried = true + + case e: Exception => + throw e + } + } + store } @@ -1117,10 +1250,20 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) deleted } - private def isCompleted(name: String): Boolean = { - !name.endsWith(EventLoggingListener.IN_PROGRESS) + /** NOTE: 'task' should ensure it executes 'endProcessing' at the end */ + private def submitLogProcessTask(rootPath: Path)(task: Runnable): Unit = { + try { + processing(rootPath) + replayExecutor.submit(task) + } catch { + // let the iteration over the updated entries break, since an exception on + // replayExecutor.submit (..) 
indicates the ExecutorService is unable + // to take any more submissions at this time + case e: Exception => + logError(s"Exception while submitting task", e) + endProcessing(rootPath) + } } - } private[history] object FsHistoryProvider { @@ -1161,12 +1304,19 @@ private[history] case class LogInfo( logType: LogType.Value, appId: Option[String], attemptId: Option[String], - fileSize: Long) + fileSize: Long, + @JsonDeserialize(contentAs = classOf[JLong]) + lastIndex: Option[Long], + @JsonDeserialize(contentAs = classOf[JLong]) + lastEvaluatedForCompaction: Option[Long], + isComplete: Boolean) private[history] class AttemptInfoWrapper( val info: ApplicationAttemptInfo, val logPath: String, val fileSize: Long, + @JsonDeserialize(contentAs = classOf[JLong]) + val lastIndex: Option[Long], val adminAcls: Option[String], val viewAcls: Option[String], val adminAclsGroups: Option[String], @@ -1190,12 +1340,13 @@ private[history] class ApplicationInfoWrapper( } private[history] class AppListingListener( - log: FileStatus, + reader: EventLogFileReader, clock: Clock, haltEnabled: Boolean) extends SparkListener { private val app = new MutableApplicationInfo() - private val attempt = new MutableAttemptInfo(log.getPath().getName(), log.getLen()) + private val attempt = new MutableAttemptInfo(reader.rootPath.getName(), + reader.fileSizeForLastIndex, reader.lastIndex) private var gotEnvUpdate = false private var halted = false @@ -1214,7 +1365,7 @@ private[history] class AppListingListener( override def onApplicationEnd(event: SparkListenerApplicationEnd): Unit = { attempt.endTime = new Date(event.time) - attempt.lastUpdated = new Date(log.getModificationTime()) + attempt.lastUpdated = new Date(reader.modificationTime) attempt.duration = event.time - attempt.startTime.getTime() attempt.completed = true } @@ -1280,7 +1431,7 @@ private[history] class AppListingListener( } - private class MutableAttemptInfo(logPath: String, fileSize: Long) { + private class MutableAttemptInfo(logPath: 
String, fileSize: Long, lastIndex: Option[Long]) { var attemptId: Option[String] = None var startTime = new Date(-1) var endTime = new Date(-1) @@ -1309,6 +1460,7 @@ private[history] class AppListingListener( apiInfo, logPath, fileSize, + lastIndex, adminAcls, viewAcls, adminAclsGroups, diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 878f0cb632c5a..62cac261ae014 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -135,7 +135,7 @@ class HistoryServer( * This starts a background thread that periodically synchronizes information displayed on * this UI with the event logs in the provided base directory. */ - def initialize() { + def initialize(): Unit = { attachPage(new HistoryPage(this)) attachHandler(ApiRootResource.getServletHandler(this)) @@ -149,12 +149,12 @@ class HistoryServer( } /** Bind to the HTTP server behind this web interface. */ - override def bind() { + override def bind(): Unit = { super.bind() } /** Stop the server and close the file system. 
*/ - override def stop() { + override def stop(): Unit = { super.stop() provider.stop() } @@ -164,7 +164,7 @@ class HistoryServer( appId: String, attemptId: Option[String], ui: SparkUI, - completed: Boolean) { + completed: Boolean): Unit = { assert(serverInfo.isDefined, "HistoryServer must be bound before attaching SparkUIs") ui.getHandlers.foreach { handler => serverInfo.get.addHandler(handler, ui.securityManager) @@ -297,6 +297,7 @@ object HistoryServer extends Logging { val server = new HistoryServer(conf, provider, securityManager, port) server.bind() + provider.start() ShutdownHookManager.addShutdownHook { () => server.stop() } @@ -326,7 +327,7 @@ object HistoryServer extends Logging { new SecurityManager(config) } - def initSecurity() { + def initSecurity(): Unit = { // If we are accessing HDFS and it has security enabled (Kerberos), we have to login // from a keytab file so that we can access HDFS beyond the kerberos ticket expiration. // As long as it is using Hadoop rpc (hdfs://), a relogin will automatically diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala index dec89769c030b..01cc59e1d2e6e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServerArguments.scala @@ -52,7 +52,7 @@ private[history] class HistoryServerArguments(conf: SparkConf, args: Array[Strin // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) - private def printUsageAndExit(exitCode: Int) { + private def printUsageAndExit(exitCode: Int): Unit = { // scalastyle:off println System.err.println( """ diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala index 
6c56807458b27..03965e6dbbf31 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ApplicationInfo.scala @@ -57,7 +57,7 @@ private[spark] class ApplicationInfo( init() } - private def init() { + private def init(): Unit = { state = ApplicationState.WAITING executors = new mutable.HashMap[Int, ExecutorDesc] coresGranted = 0 @@ -92,7 +92,7 @@ private[spark] class ApplicationInfo( exec } - private[master] def removeExecutor(exec: ExecutorDesc) { + private[master] def removeExecutor(exec: ExecutorDesc): Unit = { if (executors.contains(exec.id)) { removedExecutors += executors(exec.id) executors -= exec.id @@ -115,7 +115,7 @@ private[spark] class ApplicationInfo( private[master] def resetRetryCount() = _retryCount = 0 - private[master] def markFinished(endState: ApplicationState.Value) { + private[master] def markFinished(endState: ApplicationState.Value): Unit = { state = endState endTime = System.currentTimeMillis() } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala index a8f8492561115..a598d2a1ddd76 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ExecutorDesc.scala @@ -33,7 +33,7 @@ private[master] class ExecutorDesc( var state = ExecutorState.LAUNCHING /** Copy all state (non-val) variables from the given on-the-wire ExecutorDescription. 
*/ - def copyState(execDesc: ExecutorDescription) { + def copyState(execDesc: ExecutorDescription): Unit = { state = execDesc.state } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala index f2b5ea7e23ec1..ba949e2630e43 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/FileSystemPersistenceEngine.scala @@ -56,7 +56,7 @@ private[master] class FileSystemPersistenceEngine( files.map(deserializeFromFile[T]) } - private def serializeIntoFile(file: File, value: AnyRef) { + private def serializeIntoFile(file: File, value: AnyRef): Unit = { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala b/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala index 52e2854961eda..5bdfd18f37cd0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/LeaderElectionAgent.scala @@ -27,7 +27,7 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi trait LeaderElectionAgent { val masterInstance: LeaderElectable - def stop() {} // to avoid noops in implementations. + def stop(): Unit = {} // to avoid noops in implementations. 
} @DeveloperApi diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 5588dc8cff47a..71df5dfa423a9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -192,7 +192,7 @@ private[deploy] class Master( leaderElectionAgent = leaderElectionAgent_ } - override def onStop() { + override def onStop(): Unit = { masterMetricsSystem.report() applicationMetricsSystem.report() // prevent the CompleteRecovery message sending to restarted master @@ -211,11 +211,11 @@ private[deploy] class Master( leaderElectionAgent.stop() } - override def electedLeader() { + override def electedLeader(): Unit = { self.send(ElectedLeader) } - override def revokedLeadership() { + override def revokedLeadership(): Unit = { self.send(RevokedLeadership) } @@ -243,6 +243,15 @@ private[deploy] class Master( logError("Leadership has been revoked -- master shutting down.") System.exit(0) + case WorkerDecommission(id, workerRef) => + logInfo("Recording worker %s decommissioning".format(id)) + if (state == RecoveryState.STANDBY) { + workerRef.send(MasterInStandby) + } else { + // We use foreach since get gives us an option and we can skip the failures. + idToWorker.get(id).foreach(decommissionWorker) + } + case RegisterWorker( id, workerHost, workerPort, workerRef, cores, memory, workerWebUiUrl, masterAddress, resources) => @@ -313,7 +322,9 @@ private[deploy] class Master( // Only retry certain number of times so we don't go into an infinite loop. // Important note: this code path is not exercised by tests, so be very careful when // changing this `if` condition. + // We also don't count failures from decommissioned workers since they are "expected." 
if (!normalExit + && oldState != ExecutorState.DECOMMISSIONED && appInfo.incrementRetryCount() >= maxExecutorRetries && maxExecutorRetries >= 0) { // < 0 disables this application-killing path val execs = appInfo.executors.values @@ -529,7 +540,7 @@ private[deploy] class Master( apps.count(_.state == ApplicationState.UNKNOWN) == 0 private def beginRecovery(storedApps: Seq[ApplicationInfo], storedDrivers: Seq[DriverInfo], - storedWorkers: Seq[WorkerInfo]) { + storedWorkers: Seq[WorkerInfo]): Unit = { for (app <- storedApps) { logInfo("Trying to recover app: " + app.id) try { @@ -559,7 +570,7 @@ private[deploy] class Master( } } - private def completeRecovery() { + private def completeRecovery(): Unit = { // Ensure "only-once" recovery semantics using a short synchronization period. if (state != RecoveryState.RECOVERING) { return } state = RecoveryState.COMPLETING_RECOVERY @@ -850,7 +861,27 @@ private[deploy] class Master( true } - private def removeWorker(worker: WorkerInfo, msg: String) { + private def decommissionWorker(worker: WorkerInfo): Unit = { + if (worker.state != WorkerState.DECOMMISSIONED) { + logInfo("Decommissioning worker %s on %s:%d".format(worker.id, worker.host, worker.port)) + worker.setState(WorkerState.DECOMMISSIONED) + for (exec <- worker.executors.values) { + logInfo("Telling app of decommission executors") + exec.application.driver.send(ExecutorUpdated( + exec.id, ExecutorState.DECOMMISSIONED, + Some("worker decommissioned"), None, workerLost = false)) + exec.state = ExecutorState.DECOMMISSIONED + exec.application.removeExecutor(exec) + } + // On recovery do not add a decommissioned executor + persistenceEngine.removeWorker(worker) + } else { + logWarning("Skipping decommissioning worker %s on %s:%d as worker is already decommissioned". 
+ format(worker.id, worker.host, worker.port)) + } + } + + private def removeWorker(worker: WorkerInfo, msg: String): Unit = { logInfo("Removing worker " + worker.id + " on " + worker.host + ":" + worker.port) worker.setState(WorkerState.DEAD) idToWorker -= worker.id @@ -879,7 +910,7 @@ private[deploy] class Master( persistenceEngine.removeWorker(worker) } - private def relaunchDriver(driver: DriverInfo) { + private def relaunchDriver(driver: DriverInfo): Unit = { // We must setup a new driver with a new driver id here, because the original driver may // be still running. Consider this scenario: a worker is network partitioned with master, // the master then relaunches driver driverID1 with a driver id driverID2, then the worker @@ -919,11 +950,11 @@ private[deploy] class Master( waitingApps += app } - private def finishApplication(app: ApplicationInfo) { + private def finishApplication(app: ApplicationInfo): Unit = { removeApplication(app, ApplicationState.FINISHED) } - def removeApplication(app: ApplicationInfo, state: ApplicationState.Value) { + def removeApplication(app: ApplicationInfo, state: ApplicationState.Value): Unit = { if (apps.contains(app)) { logInfo("Removing app " + app.id) apps -= app @@ -1047,7 +1078,7 @@ private[deploy] class Master( } /** Check for, and remove, any timed-out workers */ - private def timeOutDeadWorkers() { + private def timeOutDeadWorkers(): Unit = { // Copy the workers into an array so we don't modify the hashset while iterating through it val currentTime = System.currentTimeMillis() val toRemove = workers.filter(_.lastHeartbeat < currentTime - workerTimeoutMs).toArray @@ -1077,7 +1108,7 @@ private[deploy] class Master( new DriverInfo(now, newDriverId(date), desc, date) } - private def launchDriver(worker: WorkerInfo, driver: DriverInfo) { + private def launchDriver(worker: WorkerInfo, driver: DriverInfo): Unit = { logInfo("Launching driver " + driver.id + " on worker " + worker.id) worker.addDriver(driver) driver.worker = 
Some(worker) @@ -1088,7 +1119,7 @@ private[deploy] class Master( private def removeDriver( driverId: String, finalState: DriverState, - exception: Option[Exception]) { + exception: Option[Exception]): Unit = { drivers.find(d => d.id == driverId) match { case Some(driver) => logInfo(s"Removing driver: $driverId") @@ -1113,7 +1144,7 @@ private[deploy] object Master extends Logging { val SYSTEM_NAME = "sparkMaster" val ENDPOINT_NAME = "Master" - def main(argStrings: Array[String]) { + def main(argStrings: Array[String]): Unit = { Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler( exitOnUncaughtException = false)) Utils.initDaemon(log) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala index cd31bbdcfab59..045a3da74dcd0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/MasterArguments.scala @@ -94,7 +94,7 @@ private[master] class MasterArguments(args: Array[String], conf: SparkConf) exte /** * Print usage and exit JVM with the given exit code. 
*/ - private def printUsageAndExit(exitCode: Int) { + private def printUsageAndExit(exitCode: Int): Unit = { // scalastyle:off println System.err.println( "Usage: Master [options]\n" + diff --git a/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala index b30bc821b7324..9a695e15a9cea 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/PersistenceEngine.scala @@ -88,7 +88,7 @@ abstract class PersistenceEngine { } } - def close() {} + def close(): Unit = {} } private[master] class BlackHolePersistenceEngine extends PersistenceEngine { diff --git a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala index a33b15354efea..0137e2be74720 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/WorkerInfo.scala @@ -18,9 +18,7 @@ package org.apache.spark.deploy.master import scala.collection.mutable -import scala.reflect.ClassTag -import org.apache.spark.deploy.StandaloneResourceUtils.MutableResourceInfo import org.apache.spark.resource.{ResourceAllocator, ResourceInformation, ResourceRequirement} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils @@ -30,6 +28,7 @@ private[spark] case class WorkerResourceInfo(name: String, addresses: Seq[String override protected def resourceName = this.name override protected def resourceAddresses = this.addresses + override protected def slotsPerAddress: Int = 1 def acquire(amount: Int): ResourceInformation = { val allocated = availableAddrs.take(amount) @@ -93,7 +92,7 @@ private[spark] class WorkerInfo( init() } - private def init() { + private def init(): Unit = { executors = new mutable.HashMap drivers = new mutable.HashMap state = WorkerState.ALIVE @@ 
-107,13 +106,13 @@ private[spark] class WorkerInfo( host + ":" + port } - def addExecutor(exec: ExecutorDesc) { + def addExecutor(exec: ExecutorDesc): Unit = { executors(exec.fullId) = exec coresUsed += exec.cores memoryUsed += exec.memory } - def removeExecutor(exec: ExecutorDesc) { + def removeExecutor(exec: ExecutorDesc): Unit = { if (executors.contains(exec.fullId)) { executors -= exec.fullId coresUsed -= exec.cores @@ -126,13 +125,13 @@ private[spark] class WorkerInfo( executors.values.exists(_.application == app) } - def addDriver(driver: DriverInfo) { + def addDriver(driver: DriverInfo): Unit = { drivers(driver.id) = driver memoryUsed += driver.desc.mem coresUsed += driver.desc.cores } - def removeDriver(driver: DriverInfo) { + def removeDriver(driver: DriverInfo): Unit = { drivers -= driver.id memoryUsed -= driver.desc.mem coresUsed -= driver.desc.cores diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala index 47f309144bdc0..d4ae977b19f4b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperLeaderElectionAgent.scala @@ -36,7 +36,7 @@ private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderEle start() - private def start() { + private def start(): Unit = { logInfo("Starting ZooKeeper LeaderElection agent") zk = SparkCuratorUtil.newClient(conf) leaderLatch = new LeaderLatch(zk, workingDir) @@ -44,12 +44,12 @@ private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderEle leaderLatch.start() } - override def stop() { + override def stop(): Unit = { leaderLatch.close() zk.close() } - override def isLeader() { + override def isLeader(): Unit = { synchronized { // could have lost leadership by now. 
if (!leaderLatch.hasLeadership) { @@ -61,7 +61,7 @@ private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderEle } } - override def notLeader() { + override def notLeader(): Unit = { synchronized { // could have gained leadership by now. if (leaderLatch.hasLeadership) { @@ -73,7 +73,7 @@ private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderEle } } - private def updateLeadershipStatus(isLeader: Boolean) { + private def updateLeadershipStatus(isLeader: Boolean): Unit = { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER masterInstance.electedLeader() diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala index 73dd0de017960..8eae445b439d9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala @@ -55,11 +55,11 @@ private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T]) } - override def close() { + override def close(): Unit = { zk.close() } - private def serializeIntoFile(path: String, value: AnyRef) { + private def serializeIntoFile(path: String, value: AnyRef): Unit = { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala index c7e73bcc13c5f..071b79135d641 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/ApplicationPage.scala @@ -73,7 +73,7 @@ private[ui] class ApplicationPage(parent: MasterWebUI) 
extends WebUIPage("app")
  • + data-placement="top"> Executor Limit: { if (app.executorLimit == Int.MaxValue) "Unlimited" else app.executorLimit diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index e8b614527f69c..f64b449851d86 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -27,7 +27,6 @@ import org.apache.spark.deploy.DeployMessages.{KillDriverResponse, MasterStateRe import org.apache.spark.deploy.JsonProtocol import org.apache.spark.deploy.StandaloneResourceUtils._ import org.apache.spark.deploy.master._ -import org.apache.spark.resource.ResourceInformation import org.apache.spark.ui.{UIUtils, WebUIPage} import org.apache.spark.util.Utils @@ -109,12 +108,17 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { val completedApps = state.completedApps.sortBy(_.endTime).reverse val completedAppsTable = UIUtils.listingTable(appHeaders, appRow, completedApps) - val driverHeaders = Seq("Submission ID", "Submitted Time", "Worker", "State", "Cores", - "Memory", "Resources", "Main Class") + val activeDriverHeaders = Seq("Submission ID", "Submitted Time", "Worker", "State", "Cores", + "Memory", "Resources", "Main Class", "Duration") val activeDrivers = state.activeDrivers.sortBy(_.startTime).reverse - val activeDriversTable = UIUtils.listingTable(driverHeaders, driverRow, activeDrivers) + val activeDriversTable = + UIUtils.listingTable(activeDriverHeaders, activeDriverRow, activeDrivers) + + val completedDriverHeaders = Seq("Submission ID", "Submitted Time", "Worker", "State", "Cores", + "Memory", "Resources", "Main Class") val completedDrivers = state.completedDrivers.sortBy(_.startTime).reverse - val completedDriversTable = UIUtils.listingTable(driverHeaders, driverRow, completedDrivers) + val completedDriversTable = + 
UIUtils.listingTable(completedDriverHeaders, completedDriverRow, completedDrivers) // For now we only show driver information if the user has submitted drivers to the cluster. // This is until we integrate the notion of drivers and applications in the UI. @@ -311,7 +315,11 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { } - private def driverRow(driver: DriverInfo): Seq[Node] = { + private def activeDriverRow(driver: DriverInfo) = driverRow(driver, showDuration = true) + + private def completedDriverRow(driver: DriverInfo) = driverRow(driver, showDuration = false) + + private def driverRow(driver: DriverInfo, showDuration: Boolean): Seq[Node] = { val killLink = if (parent.killEnabled && (driver.state == DriverState.RUNNING || driver.state == DriverState.SUBMITTED || @@ -346,6 +354,9 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") {
  • + {if (showDuration) { + + }} } } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala index be402ae247511..86554ec4ec1c9 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterWebUI.scala @@ -40,7 +40,7 @@ class MasterWebUI( initialize() /** Initialize all components of the server. */ - def initialize() { + def initialize(): Unit = { val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(masterPage) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala index e59bf3f0eaf44..f60d940b8c82a 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionServer.scala @@ -317,8 +317,7 @@ private class ErrorServlet extends RestServlet { versionMismatch = true s"Unknown protocol version '$unknownVersion'." case _ => - // never reached - s"Malformed path $path." + "Malformed path." } msg += s" Please submit requests through http://[host]:[port]/$serverVersion/submissions/..." 
val error = handleError(msg) diff --git a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala index 759d857d56e0e..3168c763df4df 100644 --- a/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala +++ b/core/src/main/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManager.scala @@ -65,11 +65,6 @@ private[spark] class HadoopDelegationTokenManager( protected val hadoopConf: Configuration, protected val schedulerRef: RpcEndpointRef) extends Logging { - private val deprecatedProviderEnabledConfigs = List( - "spark.yarn.security.tokens.%s.enabled", - "spark.yarn.security.credentials.%s.enabled") - private val providerEnabledConfig = "spark.security.credentials.%s.enabled" - private val principal = sparkConf.get(PRINCIPAL).orNull // The keytab can be a local: URI for cluster mode, so translate it to a regular path. If it is @@ -140,13 +135,21 @@ private[spark] class HadoopDelegationTokenManager( * @param creds Credentials object where to store the delegation tokens. */ def obtainDelegationTokens(creds: Credentials): Unit = { - val freshUGI = doLogin() - freshUGI.doAs(new PrivilegedExceptionAction[Unit]() { - override def run(): Unit = { - val (newTokens, _) = obtainDelegationTokens() - creds.addAll(newTokens) - } - }) + val currentUser = UserGroupInformation.getCurrentUser() + val hasKerberosCreds = principal != null || + Option(currentUser.getRealUser()).getOrElse(currentUser).hasKerberosCredentials() + + // Delegation tokens can only be obtained if the real user has Kerberos credentials, so + // skip creation when those are not available. 
+ if (hasKerberosCreds) { + val freshUGI = doLogin() + freshUGI.doAs(new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { + val (newTokens, _) = obtainDelegationTokens() + creds.addAll(newTokens) + } + }) + } } /** @@ -173,29 +176,6 @@ private[spark] class HadoopDelegationTokenManager( delegationTokenProviders.contains(serviceName) } - protected def isServiceEnabled(serviceName: String): Boolean = { - val key = providerEnabledConfig.format(serviceName) - - deprecatedProviderEnabledConfigs.foreach { pattern => - val deprecatedKey = pattern.format(serviceName) - if (sparkConf.contains(deprecatedKey)) { - logWarning(s"${deprecatedKey} is deprecated. Please use ${key} instead.") - } - } - - val isEnabledDeprecated = deprecatedProviderEnabledConfigs.forall { pattern => - sparkConf - .getOption(pattern.format(serviceName)) - .map(_.toBoolean) - .getOrElse(true) - } - - sparkConf - .getOption(key) - .map(_.toBoolean) - .getOrElse(isEnabledDeprecated) - } - private def scheduleRenewal(delay: Long): Unit = { val _delay = math.max(0, delay) logInfo(s"Scheduling renewal in ${UIUtils.formatDuration(delay)}.") @@ -291,8 +271,39 @@ private[spark] class HadoopDelegationTokenManager( // Filter out providers for which spark.security.credentials.{service}.enabled is false. 
providers - .filter { p => isServiceEnabled(p.serviceName) } + .filter { p => HadoopDelegationTokenManager.isServiceEnabled(sparkConf, p.serviceName) } .map { p => (p.serviceName, p) } .toMap } } + +private[spark] object HadoopDelegationTokenManager extends Logging { + private val providerEnabledConfig = "spark.security.credentials.%s.enabled" + + private val deprecatedProviderEnabledConfigs = List( + "spark.yarn.security.tokens.%s.enabled", + "spark.yarn.security.credentials.%s.enabled") + + def isServiceEnabled(sparkConf: SparkConf, serviceName: String): Boolean = { + val key = providerEnabledConfig.format(serviceName) + + deprecatedProviderEnabledConfigs.foreach { pattern => + val deprecatedKey = pattern.format(serviceName) + if (sparkConf.contains(deprecatedKey)) { + logWarning(s"${deprecatedKey} is deprecated. Please use ${key} instead.") + } + } + + val isEnabledDeprecated = deprecatedProviderEnabledConfigs.forall { pattern => + sparkConf + .getOption(pattern.format(serviceName)) + .map(_.toBoolean) + .getOrElse(true) + } + + sparkConf + .getOption(key) + .map(_.toBoolean) + .getOrElse(isEnabledDeprecated) + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala index 12e0dae3f5e5a..f7423f1fc3f1c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/CommandUtils.scala @@ -102,12 +102,12 @@ object CommandUtils extends Logging { } /** Spawn a thread that will redirect a given stream to a file */ - def redirectStream(in: InputStream, file: File) { + def redirectStream(in: InputStream, file: File): Unit = { val out = new FileOutputStream(file, true) // TODO: It would be nice to add a shutdown hook here that explains why the output is // terminating. Otherwise if the worker dies the executor logs will silently stop. 
new Thread("redirect output to " + file) { - override def run() { + override def run(): Unit = { try { Utils.copyStream(in, out, true) } catch { diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala index 4934722c0d83e..53ec7b3a88f35 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverRunner.scala @@ -84,7 +84,7 @@ private[deploy] class DriverRunner( /** Starts a thread to run and manage the driver. */ private[worker] def start() = { new Thread("DriverRunner for " + driverId) { - override def run() { + override def run(): Unit = { var shutdownHook: AnyRef = null try { shutdownHook = ShutdownHookManager.addShutdownHook { () => diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala index 56356f5f27e27..45ffdde58d6c3 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/DriverWrapper.scala @@ -32,7 +32,7 @@ import org.apache.spark.util._ * This is used in standalone cluster mode only. 
*/ object DriverWrapper extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { args.toList match { /* * IMPORTANT: Spark 1.3 provides a stable application submission gateway that is both diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala index 97939107f3057..2a5528bbe89cb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ExecutorRunner.scala @@ -31,7 +31,7 @@ import org.apache.spark.deploy.StandaloneResourceUtils.prepareResourcesFile import org.apache.spark.internal.Logging import org.apache.spark.internal.config.SPARK_EXECUTOR_PREFIX import org.apache.spark.internal.config.UI._ -import org.apache.spark.resource.{ResourceInformation, ResourceUtils} +import org.apache.spark.resource.ResourceInformation import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.{ShutdownHookManager, Utils} import org.apache.spark.util.logging.FileAppender @@ -74,9 +74,9 @@ private[deploy] class ExecutorRunner( // make sense to remove this in the future. private var shutdownHook: AnyRef = null - private[worker] def start() { + private[worker] def start(): Unit = { workerThread = new Thread("ExecutorRunner for " + fullId) { - override def run() { fetchAndRunExecutor() } + override def run(): Unit = { fetchAndRunExecutor() } } workerThread.start() // Shutdown hook that kills actors on shutdown. 
@@ -94,7 +94,7 @@ private[deploy] class ExecutorRunner( * * @param message the exception message which caused the executor's death */ - private def killProcess(message: Option[String]) { + private def killProcess(message: Option[String]): Unit = { var exitCode: Option[Int] = None if (process != null) { logInfo("Killing process!") @@ -118,7 +118,7 @@ private[deploy] class ExecutorRunner( } /** Stop this executor runner, including killing the process it launched */ - private[worker] def kill() { + private[worker] def kill(): Unit = { if (workerThread != null) { // the workerThread will kill the child process when interrupted workerThread.interrupt() @@ -145,7 +145,7 @@ private[deploy] class ExecutorRunner( /** * Download and run the executor described in our ApplicationDescription */ - private def fetchAndRunExecutor() { + private def fetchAndRunExecutor(): Unit = { try { val resourceFileOpt = prepareResourcesFile(SPARK_EXECUTOR_PREFIX, resources, executorDir) // Launch the process diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 3731b6aec6522..d988bcedb47f0 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -67,6 +67,14 @@ private[deploy] class Worker( Utils.checkHost(host) assert (port > 0) + // If worker decommissioning is enabled register a handler on PWR to shutdown. + if (conf.get(WORKER_DECOMMISSION_ENABLED)) { + logInfo("Registering SIGPWR handler to trigger decommissioning.") + SignalUtils.register("PWR")(decommissionSelf) + } else { + logInfo("Worker decommissioning not enabled, SIGPWR will result in exiting.") + } + // A scheduled executor used to send messages at the specified time. 
private val forwardMessageScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("worker-forward-message-scheduler") @@ -128,6 +136,7 @@ private[deploy] class Worker( private val workerUri = RpcEndpointAddress(rpcEnv.address, endpointName).toString private var registered = false private var connected = false + private var decommissioned = false private val workerId = generateWorkerId() private val sparkHome = if (sys.props.contains(IS_TESTING.key)) { @@ -190,14 +199,14 @@ private[deploy] class Worker( def coresFree: Int = cores - coresUsed def memoryFree: Int = memory - memoryUsed - private def createWorkDir() { + private def createWorkDir(): Unit = { workDir = Option(workDirPath).map(new File(_)).getOrElse(new File(sparkHome, "work")) if (!Utils.createDirectory(workDir)) { System.exit(1) } } - override def onStart() { + override def onStart(): Unit = { assert(!registered) logInfo("Starting Spark worker %s:%d with %d cores, %s RAM".format( host, port, cores, Utils.megabytesToString(memory))) @@ -268,7 +277,8 @@ private[deploy] class Worker( * @param masterAddress the new master address which the worker should use to connect in case of * failure */ - private def changeMaster(masterRef: RpcEndpointRef, uiUrl: String, masterAddress: RpcAddress) { + private def changeMaster(masterRef: RpcEndpointRef, uiUrl: String, + masterAddress: RpcAddress): Unit = { // activeMasterUrl it's a valid Spark url since we receive it from master. activeMasterUrl = masterRef.address.toSparkURL activeMasterWebUiUrl = uiUrl @@ -391,7 +401,7 @@ private[deploy] class Worker( registrationRetryTimer = None } - private def registerWithMaster() { + private def registerWithMaster(): Unit = { // onDisconnected may be triggered multiple times, so don't attempt registration // if there are outstanding registration attempts scheduled. 
registrationRetryTimer match { @@ -410,7 +420,7 @@ private[deploy] class Worker( } } - private def startExternalShuffleService() { + private def startExternalShuffleService(): Unit = { try { shuffleService.startIfEnabled() } catch { @@ -548,6 +558,8 @@ private[deploy] class Worker( case LaunchExecutor(masterUrl, appId, execId, appDesc, cores_, memory_, resources_) => if (masterUrl != activeMasterUrl) { logWarning("Invalid Master (" + masterUrl + ") attempted to launch executor.") + } else if (decommissioned) { + logWarning("Asked to launch an executor while decommissioned. Not launching executor.") } else { try { logInfo("Asked to launch executor %s/%d for %s".format(appId, execId, appDesc.name)) @@ -671,6 +683,9 @@ private[deploy] class Worker( case ApplicationFinished(id) => finishedApps += id maybeCleanupApplication(id) + + case DecommissionSelf => + decommissionSelf() } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { @@ -690,7 +705,7 @@ private[deploy] class Worker( } } - private def masterDisconnected() { + private def masterDisconnected(): Unit = { logError("Connection to master failed! 
Waiting for master to reconnect...") connected = false registerWithMaster() @@ -736,7 +751,7 @@ private[deploy] class Worker( "worker-%s-%s-%d".format(createDateFormat.format(new Date), host, port) } - override def onStop() { + override def onStop(): Unit = { releaseResources(conf, SPARK_WORKER_PREFIX, resources, pid) cleanupThreadExecutor.shutdownNow() metricsSystem.report() @@ -770,6 +785,18 @@ private[deploy] class Worker( } } + private[deploy] def decommissionSelf(): Boolean = { + if (conf.get(WORKER_DECOMMISSION_ENABLED)) { + logDebug("Decommissioning self") + decommissioned = true + sendToMaster(WorkerDecommission(workerId, self)) + } else { + logWarning("Asked to decommission self, but decommissioning not enabled") + } + // Return true since can be called as a signal handler + true + } + private[worker] def handleDriverStateChanged(driverStateChanged: DriverStateChanged): Unit = { val driverId = driverStateChanged.driverId val exception = driverStateChanged.exception @@ -834,7 +861,7 @@ private[deploy] object Worker extends Logging { val ENDPOINT_NAME = "Worker" private val SSL_NODE_LOCAL_CONFIG_PATTERN = """\-Dspark\.ssl\.useNodeLocalConf\=(.+)""".r - def main(argStrings: Array[String]) { + def main(argStrings: Array[String]): Unit = { Thread.setDefaultUncaughtExceptionHandler(new SparkUncaughtExceptionHandler( exitOnUncaughtException = false)) Utils.initDaemon(log) diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala index 8c87708e960e6..42f684c0a1973 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/WorkerArguments.scala @@ -122,7 +122,7 @@ private[worker] class WorkerArguments(args: Array[String], conf: SparkConf) { /** * Print usage and exit JVM with the given exit code. 
*/ - def printUsageAndExit(exitCode: Int) { + def printUsageAndExit(exitCode: Int): Unit = { // scalastyle:off println System.err.println( "Usage: Worker [options] \n" + diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala index 96980c3ff0331..0f5e96c558490 100644 --- a/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/ui/WorkerWebUI.scala @@ -43,7 +43,7 @@ class WorkerWebUI( initialize() /** Initialize all components of the server. */ - def initialize() { + def initialize(): Unit = { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index e96c41a61b066..faf03a64ae8b2 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -17,6 +17,7 @@ package org.apache.spark.executor +import java.io.File import java.net.URL import java.nio.ByteBuffer import java.util.Locale @@ -35,34 +36,43 @@ import org.apache.spark.deploy.worker.WorkerWatcher import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.resource.ResourceProfile._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.rpc._ import org.apache.spark.scheduler.{ExecutorLossReason, TaskDescription} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ import org.apache.spark.serializer.SerializerInstance -import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.{ChildFirstURLClassLoader, 
MutableURLClassLoader, SignalUtils, ThreadUtils, Utils} private[spark] class CoarseGrainedExecutorBackend( override val rpcEnv: RpcEnv, driverUrl: String, executorId: String, + bindAddress: String, hostname: String, cores: Int, userClassPath: Seq[URL], env: SparkEnv, - resourcesFileOpt: Option[String]) - extends ThreadSafeRpcEndpoint with ExecutorBackend with Logging { + resourcesFileOpt: Option[String], + resourceProfile: ResourceProfile) + extends IsolatedRpcEndpoint with ExecutorBackend with Logging { + + import CoarseGrainedExecutorBackend._ private implicit val formats = DefaultFormats private[this] val stopping = new AtomicBoolean(false) var executor: Executor = null + @volatile private var decommissioned = false @volatile var driver: Option[RpcEndpointRef] = None // If this CoarseGrainedExecutorBackend is changed to support multiple threads, then this may need // to be changed so that we don't share the serializer instance across threads private[this] val ser: SerializerInstance = env.closureSerializer.newInstance() + private var _resources = Map.empty[String, ResourceInformation] + /** * Map each taskId to the information about the resource allocated to it, Please refer to * [[ResourceInformation]] for specifics. 
@@ -70,43 +80,60 @@ private[spark] class CoarseGrainedExecutorBackend( */ private[executor] val taskResources = new mutable.HashMap[Long, Map[String, ResourceInformation]] - override def onStart() { + override def onStart(): Unit = { + logInfo("Registering PWR handler.") + SignalUtils.register("PWR")(decommissionSelf) + logInfo("Connecting to driver: " + driverUrl) - val resources = parseOrFindResources(resourcesFileOpt) + try { + _resources = parseOrFindResources(resourcesFileOpt) + } catch { + case NonFatal(e) => + exitExecutor(1, "Unable to create executor due to " + e.getMessage, e) + } rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref => // This is a very fast action so we can use "ThreadUtils.sameThread" driver = Some(ref) ref.ask[Boolean](RegisterExecutor(executorId, self, hostname, cores, extractLogUrls, - extractAttributes, resources)) + extractAttributes, _resources, resourceProfile.id)) }(ThreadUtils.sameThread).onComplete { - // This is a very fast action so we can use "ThreadUtils.sameThread" - case Success(msg) => - // Always receive `true`. Just ignore it + case Success(_) => + self.send(RegisteredExecutor) case Failure(e) => exitExecutor(1, s"Cannot register with driver: $driverUrl", e, notifyDriver = false) }(ThreadUtils.sameThread) } + /** + * Create a classLoader for use for resource discovery. The user could provide a class + * as a substitute for the default one so we have to be able to load it from a user specified + * jar. 
+ */ + private def createClassLoader(): MutableURLClassLoader = { + val currentLoader = Utils.getContextOrSparkClassLoader + val urls = userClassPath.toArray + if (env.conf.get(EXECUTOR_USER_CLASS_PATH_FIRST)) { + new ChildFirstURLClassLoader(urls, currentLoader) + } else { + new MutableURLClassLoader(urls, currentLoader) + } + } + // visible for testing def parseOrFindResources(resourcesFileOpt: Option[String]): Map[String, ResourceInformation] = { - // only parse the resources if a task requires them - val resourceInfo = if (parseResourceRequirements(env.conf, SPARK_TASK_PREFIX).nonEmpty) { - val resources = getOrDiscoverAllResources(env.conf, SPARK_EXECUTOR_PREFIX, resourcesFileOpt) - if (resources.isEmpty) { - throw new SparkException("User specified resources per task via: " + - s"$SPARK_TASK_PREFIX, but can't find any resources available on the executor.") - } else { - logResourceInfo(SPARK_EXECUTOR_PREFIX, resources) - } + // use a classloader that includes the user classpath in case they specified a class for + // resource discovery + val urlClassLoader = createClassLoader() + logDebug(s"Resource profile id is: ${resourceProfile.id}") + Utils.withContextClassLoader(urlClassLoader) { + val resources = getOrDiscoverAllResourcesForResourceProfile( + resourcesFileOpt, + SPARK_EXECUTOR_PREFIX, + resourceProfile, + env.conf) + logResourceInfo(SPARK_EXECUTOR_PREFIX, resources) resources - } else { - if (resourcesFileOpt.nonEmpty) { - logWarning("A resources file was specified but the application is not configured " + - s"to use any resources, see the configs with prefix: ${SPARK_TASK_PREFIX}") - } - Map.empty[String, ResourceInformation] } - resourceInfo } def extractLogUrls: Map[String, String] = { @@ -125,19 +152,28 @@ private[spark] class CoarseGrainedExecutorBackend( case RegisteredExecutor => logInfo("Successfully registered with driver") try { - executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false) + executor = new 
Executor(executorId, hostname, env, userClassPath, isLocal = false, + resources = _resources) + driver.get.send(LaunchedExecutor(executorId)) + } catch { case NonFatal(e) => exitExecutor(1, "Unable to create executor due to " + e.getMessage, e) } - case RegisterExecutorFailed(message) => - exitExecutor(1, "Slave registration failed: " + message) - case LaunchTask(data) => if (executor == null) { exitExecutor(1, "Received LaunchTask command but executor was null") } else { + if (decommissioned) { + logError("Asked to launch a task while decommissioned.") + driver match { + case Some(endpoint) => + logInfo("Sending DecommissionExecutor to driver.") + endpoint.send(DecommissionExecutor(executorId)) + case _ => + logError("No registered driver to send Decommission to.") + } + } val taskDesc = TaskDescription.decode(data.value) logInfo("Got assigned task " + taskDesc.taskId) taskResources(taskDesc.taskId) = taskDesc.resources @@ -186,7 +222,7 @@ private[spark] class CoarseGrainedExecutorBackend( } } - override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer) { + override def statusUpdate(taskId: Long, state: TaskState, data: ByteBuffer): Unit = { val resources = taskResources.getOrElse(taskId, Map.empty[String, ResourceInformation]) val msg = StatusUpdate(executorId, taskId, state, data, resources) if (TaskState.isFinished(state)) { @@ -220,26 +256,55 @@ private[spark] class CoarseGrainedExecutorBackend( System.exit(code) } + + private def decommissionSelf(): Boolean = { + logInfo("Decommissioning self w/sync") + try { + decommissioned = true + // Tell master we are decommissioned so it stops trying to schedule us + if (driver.nonEmpty) { + driver.get.askSync[Boolean](DecommissionExecutor(executorId)) + } else { + logError("No driver to message decommissioning.") + } + if (executor != null) { + executor.decommission() + } + logInfo("Done decommissioning self.") + // Return true since we are handling a signal + true + } catch { + case e: Exception =>
+ logError(s"Error ${e} during attempt to decommission self") + false + } + } } private[spark] object CoarseGrainedExecutorBackend extends Logging { + // Message used internally to start the executor when the driver successfully accepted the + // registration request. + case object RegisteredExecutor + case class Arguments( driverUrl: String, executorId: String, + bindAddress: String, hostname: String, cores: Int, appId: String, workerUrl: Option[String], userClassPath: mutable.ListBuffer[URL], - resourcesFileOpt: Option[String]) + resourcesFileOpt: Option[String], + resourceProfileId: Int) def main(args: Array[String]): Unit = { - val createFn: (RpcEnv, Arguments, SparkEnv) => - CoarseGrainedExecutorBackend = { case (rpcEnv, arguments, env) => + val createFn: (RpcEnv, Arguments, SparkEnv, ResourceProfile) => + CoarseGrainedExecutorBackend = { case (rpcEnv, arguments, env, resourceProfile) => new CoarseGrainedExecutorBackend(rpcEnv, arguments.driverUrl, arguments.executorId, - arguments.hostname, arguments.cores, arguments.userClassPath, env, - arguments.resourcesFileOpt) + arguments.bindAddress, arguments.hostname, arguments.cores, arguments.userClassPath, env, + arguments.resourcesFileOpt, resourceProfile) } run(parseArguments(args, this.getClass.getCanonicalName.stripSuffix("$")), createFn) System.exit(0) @@ -247,7 +312,8 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { def run( arguments: Arguments, - backendCreateFn: (RpcEnv, Arguments, SparkEnv) => CoarseGrainedExecutorBackend): Unit = { + backendCreateFn: (RpcEnv, Arguments, SparkEnv, ResourceProfile) => + CoarseGrainedExecutorBackend): Unit = { Utils.initDaemon(log) @@ -259,10 +325,12 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { val executorConf = new SparkConf val fetcher = RpcEnv.create( "driverPropsFetcher", + arguments.bindAddress, arguments.hostname, -1, executorConf, new SecurityManager(executorConf), + numUsableCores = 0, clientMode = true) var 
driver: RpcEndpointRef = null @@ -277,7 +345,7 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { } } - val cfg = driver.askSync[SparkAppConfig](RetrieveSparkAppConfig) + val cfg = driver.askSync[SparkAppConfig](RetrieveSparkAppConfig(arguments.resourceProfileId)) val props = cfg.sparkProperties ++ Seq[(String, String)](("spark.app.id", arguments.appId)) fetcher.shutdown() @@ -297,10 +365,11 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { } driverConf.set(EXECUTOR_ID, arguments.executorId) - val env = SparkEnv.createExecutorEnv(driverConf, arguments.executorId, arguments.hostname, - arguments.cores, cfg.ioEncryptionKey, isLocal = false) + val env = SparkEnv.createExecutorEnv(driverConf, arguments.executorId, arguments.bindAddress, + arguments.hostname, arguments.cores, cfg.ioEncryptionKey, isLocal = false) - env.rpcEnv.setupEndpoint("Executor", backendCreateFn(env.rpcEnv, arguments, env)) + env.rpcEnv.setupEndpoint("Executor", + backendCreateFn(env.rpcEnv, arguments, env, cfg.resourceProfile)) arguments.workerUrl.foreach { url => env.rpcEnv.setupEndpoint("WorkerWatcher", new WorkerWatcher(env.rpcEnv, url)) } @@ -311,12 +380,14 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { def parseArguments(args: Array[String], classNameForEntry: String): Arguments = { var driverUrl: String = null var executorId: String = null + var bindAddress: String = null var hostname: String = null var cores: Int = 0 var resourcesFileOpt: Option[String] = None var appId: String = null var workerUrl: Option[String] = None val userClassPath = new mutable.ListBuffer[URL]() + var resourceProfileId: Int = DEFAULT_RESOURCE_PROFILE_ID var argv = args.toList while (!argv.isEmpty) { @@ -327,6 +398,9 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { case ("--executor-id") :: value :: tail => executorId = value argv = tail + case ("--bind-address") :: value :: tail => + bindAddress = value + argv = tail case 
("--hostname") :: value :: tail => hostname = value argv = tail @@ -346,6 +420,9 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { case ("--user-class-path") :: value :: tail => userClassPath += new URL(value) argv = tail + case ("--resourceProfileId") :: value :: tail => + resourceProfileId = value.toInt + argv = tail case Nil => case tail => // scalastyle:off println @@ -364,8 +441,12 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { printUsageAndExit(classNameForEntry) } - Arguments(driverUrl, executorId, hostname, cores, appId, workerUrl, - userClassPath, resourcesFileOpt) + if (bindAddress == null) { + bindAddress = hostname + } + + Arguments(driverUrl, executorId, bindAddress, hostname, cores, appId, workerUrl, + userClassPath, resourcesFileOpt, resourceProfileId) } private def printUsageAndExit(classNameForEntry: String): Unit = { @@ -377,12 +458,14 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { | Options are: | --driver-url | --executor-id + | --bind-address | --hostname | --cores | --resourcesFile | --app-id | --worker-url | --user-class-path + | --resourceProfileId |""".stripMargin) // scalastyle:on println System.exit(1) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index c337d24381286..2bfa1cea4b26f 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -24,9 +24,11 @@ import java.net.{URI, URL} import java.nio.ByteBuffer import java.util.Properties import java.util.concurrent._ +import java.util.concurrent.atomic.AtomicBoolean import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ +import scala.collection.immutable import scala.collection.mutable.{ArrayBuffer, HashMap, Map, WrappedArray} import scala.concurrent.duration._ import scala.util.control.NonFatal @@ -37,8 +39,10 @@ 
import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ +import org.apache.spark.internal.plugin.PluginContainer import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager} import org.apache.spark.metrics.source.JVMCPUSource +import org.apache.spark.resource.ResourceInformation import org.apache.spark.rpc.RpcTimeout import org.apache.spark.scheduler._ import org.apache.spark.shuffle.FetchFailedException @@ -59,11 +63,16 @@ private[spark] class Executor( env: SparkEnv, userClassPath: Seq[URL] = Nil, isLocal: Boolean = false, - uncaughtExceptionHandler: UncaughtExceptionHandler = new SparkUncaughtExceptionHandler) + uncaughtExceptionHandler: UncaughtExceptionHandler = new SparkUncaughtExceptionHandler, + resources: immutable.Map[String, ResourceInformation]) extends Logging { logInfo(s"Starting executor ID $executorId on host $executorHostname") + private val executorShutdown = new AtomicBoolean(false) + ShutdownHookManager.addShutdownHook( + () => stop() + ) // Application dependencies (added through SparkContext) that we've fetched so far on this node. // Each map holds the master's timestamp for the version of that file or JAR we got. private val currentFiles: HashMap[String, Long] = new HashMap[String, Long]() @@ -112,10 +121,18 @@ private[spark] class Executor( // create. The map key is a task id. 
private val taskReaperForTask: HashMap[Long, TaskReaper] = HashMap[Long, TaskReaper]() + val executorMetricsSource = + if (conf.get(METRICS_EXECUTORMETRICS_SOURCE_ENABLED)) { + Some(new ExecutorMetricsSource) + } else { + None + } + if (!isLocal) { env.blockManager.initialize(conf.getAppId) env.metricsSystem.registerSource(executorSource) env.metricsSystem.registerSource(new JVMCPUSource()) + executorMetricsSource.foreach(_.register(env.metricsSystem)) env.metricsSystem.registerSource(env.blockManager.shuffleMetricsSource) } @@ -136,27 +153,9 @@ private[spark] class Executor( // for fetching remote cached RDD blocks, so need to make sure it uses the right classloader too. env.serializerManager.setDefaultClassLoader(replClassLoader) - private val executorPlugins: Seq[ExecutorPlugin] = { - val pluginNames = conf.get(EXECUTOR_PLUGINS) - if (pluginNames.nonEmpty) { - logDebug(s"Initializing the following plugins: ${pluginNames.mkString(", ")}") - - // Plugins need to load using a class loader that includes the executor's user classpath - val pluginList: Seq[ExecutorPlugin] = - Utils.withContextClassLoader(replClassLoader) { - val plugins = Utils.loadExtensions(classOf[ExecutorPlugin], pluginNames, conf) - plugins.foreach { plugin => - plugin.init() - logDebug(s"Successfully loaded plugin " + plugin.getClass().getCanonicalName()) - } - plugins - } - - logDebug("Finished initializing plugins") - pluginList - } else { - Nil - } + // Plugins need to load using a class loader that includes the executor's user classpath + private val plugins: Option[PluginContainer] = Utils.withContextClassLoader(replClassLoader) { + PluginContainer(env, resources.asJava) } // Max size of direct result. If task result is bigger than this, we use the block manager @@ -198,7 +197,8 @@ private[spark] class Executor( // Poller for the memory metrics. Visible for testing. 
private[executor] val metricsPoller = new ExecutorMetricsPoller( env.memoryManager, - METRICS_POLLING_INTERVAL_MS) + METRICS_POLLING_INTERVAL_MS, + executorMetricsSource) // Executor for the heartbeat task. private val heartbeater = new Heartbeater( @@ -216,16 +216,32 @@ private[spark] class Executor( */ private var heartbeatFailures = 0 + /** + * Flag to prevent launching new tasks while decommissioned. There could be a race condition + * accessing this, but decommissioning is only intended to help not be a hard stop. + */ + private var decommissioned = false + heartbeater.start() metricsPoller.start() private[executor] def numRunningTasks: Int = runningTasks.size() + /** + * Mark an executor for decommissioning and avoid launching new tasks. + */ + private[spark] def decommission(): Unit = { + decommissioned = true + } + def launchTask(context: ExecutorBackend, taskDescription: TaskDescription): Unit = { val tr = new TaskRunner(context, taskDescription) runningTasks.put(taskDescription.taskId, tr) threadPool.execute(tr) + if (decommissioned) { + log.error(s"Launching a task while in decommissioned state.") + } } def killTask(taskId: Long, interruptThread: Boolean, reason: String): Unit = { @@ -266,34 +282,29 @@ private[spark] class Executor( } def stop(): Unit = { - env.metricsSystem.report() - try { - metricsPoller.stop() - } catch { - case NonFatal(e) => - logWarning("Unable to stop executor metrics poller", e) - } - try { - heartbeater.stop() - } catch { - case NonFatal(e) => - logWarning("Unable to stop heartbeater", e) - } - threadPool.shutdown() - - // Notify plugins that executor is shutting down so they can terminate cleanly - Utils.withContextClassLoader(replClassLoader) { - executorPlugins.foreach { plugin => - try { - plugin.shutdown() - } catch { - case e: Exception => - logWarning("Plugin " + plugin.getClass().getCanonicalName() + " shutdown failed", e) - } + if (!executorShutdown.getAndSet(true)) { + env.metricsSystem.report() + try { + 
metricsPoller.stop() + } catch { + case NonFatal(e) => + logWarning("Unable to stop executor metrics poller", e) + } + try { + heartbeater.stop() + } catch { + case NonFatal(e) => + logWarning("Unable to stop heartbeater", e) + } + threadPool.shutdown() + + // Notify plugins that executor is shutting down so they can terminate cleanly + Utils.withContextClassLoader(replClassLoader) { + plugins.foreach(_.shutdown()) + } + if (!isLocal) { + env.stop() } - } - if (!isLocal) { - env.stop() } } @@ -623,6 +634,11 @@ private[spark] class Executor( setTaskFinishedAndClearInterruptStatus() execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(reason)) + case t: Throwable if env.isStopped => + // Log the expected exception after executor.stop without stack traces + // see: SPARK-19147 + logError(s"Exception in $taskName (TID $taskId): ${t.getMessage}") + case t: Throwable => // Attempt to exit cleanly by informing the driver of our failure. // If anything goes wrong (or this was a fatal exception), we will delegate to @@ -846,7 +862,7 @@ private[spark] class Executor( * Download any missing dependencies if we receive a new set of files and JARs from the * SparkContext. Also adds any new JARs we fetched to the class loader. 
*/ - private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]) { + private def updateDependencies(newFiles: Map[String, Long], newJars: Map[String, Long]): Unit = { lazy val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) synchronized { // Fetch missing dependencies diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsPoller.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsPoller.scala index 805b0f729b122..1c1a1ca8035d0 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsPoller.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsPoller.scala @@ -48,7 +48,8 @@ import org.apache.spark.util.{ThreadUtils, Utils} */ private[spark] class ExecutorMetricsPoller( memoryManager: MemoryManager, - pollingInterval: Long) extends Logging { + pollingInterval: Long, + executorMetricsSource: Option[ExecutorMetricsSource]) extends Logging { type StageKey = (Int, Int) // Task Count and Metric Peaks @@ -79,6 +80,7 @@ private[spark] class ExecutorMetricsPoller( // get the latest values for the metrics val latestMetrics = ExecutorMetrics.getCurrentMetrics(memoryManager) + executorMetricsSource.foreach(_.updateMetricsSnapshot(latestMetrics)) def updatePeaks(metrics: AtomicLongArray): Unit = { (0 until metrics.length).foreach { i => diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsSource.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsSource.scala new file mode 100644 index 0000000000000..14645f73ef278 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorMetricsSource.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.executor + +import com.codahale.metrics.{Gauge, MetricRegistry} + +import org.apache.spark.metrics.{ExecutorMetricType, MetricsSystem} +import org.apache.spark.metrics.source.Source + +/** + * Expose executor metrics from [[ExecutorMetricType]] using the Dropwizard metrics system. + * + * Metrics related to the memory system can be expensive to gather, therefore + * we implement some optimizations: + * (1) Metrics values are cached, updated at each heartbeat (default period is 10 seconds). + * An alternative faster polling mechanism is used, only if activated, by setting + * spark.executor.metrics.pollingInterval=<polling interval>. + * (2) Procfs metrics are gathered all in one-go and only conditionally: + * if the /proc filesystem exists + * and spark.executor.processTreeMetrics.enabled=true.
+ */ +private[spark] class ExecutorMetricsSource extends Source { + + override val metricRegistry = new MetricRegistry() + override val sourceName = "ExecutorMetrics" + @volatile var metricsSnapshot: Array[Long] = Array.fill(ExecutorMetricType.numMetrics)(0L) + + // called by ExecutorMetricsPoller + def updateMetricsSnapshot(metricsUpdates: Array[Long]): Unit = { + metricsSnapshot = metricsUpdates + } + + private class ExecutorMetricGauge(idx: Int) extends Gauge[Long] { + def getValue: Long = metricsSnapshot(idx) + } + + def register(metricsSystem: MetricsSystem): Unit = { + val gauges: IndexedSeq[ExecutorMetricGauge] = (0 until ExecutorMetricType.numMetrics).map { + idx => new ExecutorMetricGauge(idx) + }.toIndexedSeq + + ExecutorMetricType.metricToOffset.foreach { + case (name, idx) => + metricRegistry.register(MetricRegistry.name(name), gauges(idx)) + } + + metricsSystem.registerSource(this) + } +} diff --git a/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala b/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala index 2111273d8b35a..80ef757332e43 100644 --- a/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala +++ b/core/src/main/scala/org/apache/spark/executor/ProcfsMetricsGetter.scala @@ -18,7 +18,6 @@ package org.apache.spark.executor import java.io._ -import java.nio.charset.Charset import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.{Files, Paths} import java.util.Locale @@ -59,11 +58,9 @@ private[spark] class ProcfsMetricsGetter(procfsDir: String = "/proc/") extends L logWarning("Exception checking for procfs dir", ioe) false } - val shouldLogStageExecutorMetrics = - SparkEnv.get.conf.get(config.EVENT_LOG_STAGE_EXECUTOR_METRICS) - val shouldLogStageExecutorProcessTreeMetrics = - SparkEnv.get.conf.get(config.EVENT_LOG_PROCESS_TREE_METRICS) - procDirExists.get && shouldLogStageExecutorProcessTreeMetrics && shouldLogStageExecutorMetrics + val shouldPollProcessTreeMetrics = + 
SparkEnv.get.conf.get(config.EXECUTOR_PROCESS_TREE_METRICS_ENABLED) + procDirExists.get && shouldPollProcessTreeMetrics } } diff --git a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala index ea79c7310349d..1470a23884bb0 100644 --- a/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala @@ -137,6 +137,7 @@ class TaskMetrics private[spark] () extends Serializable { private[spark] def setJvmGCTime(v: Long): Unit = _jvmGCTime.setValue(v) private[spark] def setResultSerializationTime(v: Long): Unit = _resultSerializationTime.setValue(v) + private[spark] def setPeakExecutionMemory(v: Long): Unit = _peakExecutionMemory.setValue(v) private[spark] def incMemoryBytesSpilled(v: Long): Unit = _memoryBytesSpilled.add(v) private[spark] def incDiskBytesSpilled(v: Long): Unit = _diskBytesSpilled.add(v) private[spark] def incPeakExecutionMemory(v: Long): Unit = _peakExecutionMemory.add(v) diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala index 549395314ba61..f6902d1bf83a1 100644 --- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala +++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryRecordReader.scala @@ -46,7 +46,7 @@ private[spark] class FixedLengthBinaryRecordReader private var recordKey: LongWritable = null private var recordValue: BytesWritable = null - override def close() { + override def close(): Unit = { if (fileInputStream != null) { fileInputStream.close() } @@ -69,7 +69,7 @@ private[spark] class FixedLengthBinaryRecordReader } } - override def initialize(inputSplit: InputSplit, context: TaskAttemptContext) { + override def initialize(inputSplit: InputSplit, context: TaskAttemptContext): Unit = { // the file input val fileSplit = 
inputSplit.asInstanceOf[FileSplit] diff --git a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala index 6a4af01475646..57210da6a48eb 100644 --- a/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala +++ b/core/src/main/scala/org/apache/spark/input/PortableDataStream.scala @@ -44,7 +44,7 @@ private[spark] abstract class StreamFileInputFormat[T] * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API * which is set through setMaxSplitSize */ - def setMinPartitions(sc: SparkContext, context: JobContext, minPartitions: Int) { + def setMinPartitions(sc: SparkContext, context: JobContext, minPartitions: Int): Unit = { val defaultMaxSplitBytes = sc.getConf.get(config.FILES_MAX_PARTITION_BYTES) val openCostInBytes = sc.getConf.get(config.FILES_OPEN_COST_IN_BYTES) val defaultParallelism = Math.max(sc.defaultParallelism, minPartitions) diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala index 04c5c4b90e8a1..692deb7a3282f 100644 --- a/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala +++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileInputFormat.scala @@ -48,7 +48,7 @@ private[spark] class WholeTextFileInputFormat * Allow minPartitions set by end-user in order to keep compatibility with old Hadoop API, * which is set through setMaxSplitSize */ - def setMinPartitions(context: JobContext, minPartitions: Int) { + def setMinPartitions(context: JobContext, minPartitions: Int): Unit = { val files = listStatus(context).asScala val totalLen = files.map(file => if (file.isDirectory) 0L else file.getLen).sum val maxSplitSize = Math.ceil(totalLen * 1.0 / diff --git a/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala 
b/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala index 28fd1ff1b77ca..0bd2d551cc912 100644 --- a/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala +++ b/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala @@ -31,7 +31,7 @@ import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFi */ private[spark] trait Configurable extends HConfigurable { private var conf: Configuration = _ - def setConf(c: Configuration) { + def setConf(c: Configuration): Unit = { conf = c } def getConf: Configuration = conf diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 0987917bac0e7..0c1d9635b6535 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -53,44 +53,44 @@ trait Logging { } // Log methods that take only a String - protected def logInfo(msg: => String) { + protected def logInfo(msg: => String): Unit = { if (log.isInfoEnabled) log.info(msg) } - protected def logDebug(msg: => String) { + protected def logDebug(msg: => String): Unit = { if (log.isDebugEnabled) log.debug(msg) } - protected def logTrace(msg: => String) { + protected def logTrace(msg: => String): Unit = { if (log.isTraceEnabled) log.trace(msg) } - protected def logWarning(msg: => String) { + protected def logWarning(msg: => String): Unit = { if (log.isWarnEnabled) log.warn(msg) } - protected def logError(msg: => String) { + protected def logError(msg: => String): Unit = { if (log.isErrorEnabled) log.error(msg) } // Log methods that take Throwables (Exceptions/Errors) too - protected def logInfo(msg: => String, throwable: Throwable) { + protected def logInfo(msg: => String, throwable: Throwable): Unit = { if (log.isInfoEnabled) log.info(msg, throwable) } - protected def logDebug(msg: => String, throwable: Throwable) { + protected def logDebug(msg: 
=> String, throwable: Throwable): Unit = { if (log.isDebugEnabled) log.debug(msg, throwable) } - protected def logTrace(msg: => String, throwable: Throwable) { + protected def logTrace(msg: => String, throwable: Throwable): Unit = { if (log.isTraceEnabled) log.trace(msg, throwable) } - protected def logWarning(msg: => String, throwable: Throwable) { + protected def logWarning(msg: => String, throwable: Throwable): Unit = { if (log.isWarnEnabled) log.warn(msg, throwable) } - protected def logError(msg: => String, throwable: Throwable) { + protected def logError(msg: => String, throwable: Throwable): Unit = { if (log.isErrorEnabled) log.error(msg, throwable) } @@ -116,6 +116,11 @@ trait Logging { false } + // For testing + private[spark] def initializeForcefully(isInterpreter: Boolean, silent: Boolean): Unit = { + initializeLogging(isInterpreter, silent) + } + private def initializeLogging(isInterpreter: Boolean, silent: Boolean): Unit = { // Don't use a logger in here, as this is itself occurring during initialization of a logger // If Log4j 1.2 is being used, but is not initialized, load a default properties file @@ -230,19 +235,18 @@ private class SparkShellLoggingFilter extends Filter { */ def decide(loggingEvent: LoggingEvent): Int = { if (Logging.sparkShellThresholdLevel == null) { - return Filter.NEUTRAL - } - val rootLevel = LogManager.getRootLogger().getLevel() - if (!loggingEvent.getLevel().eq(rootLevel)) { - return Filter.NEUTRAL - } - var logger = loggingEvent.getLogger() - while (logger.getParent() != null) { - if (logger.getLevel() != null) { - return Filter.NEUTRAL + Filter.NEUTRAL + } else if (loggingEvent.getLevel.isGreaterOrEqual(Logging.sparkShellThresholdLevel)) { + Filter.NEUTRAL + } else { + var logger = loggingEvent.getLogger() + while (logger.getParent() != null) { + if (logger.getLevel != null || logger.getAllAppenders.hasMoreElements) { + return Filter.NEUTRAL + } + logger = logger.getParent() } - logger = logger.getParent() + Filter.DENY } 
- return Filter.DENY } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/History.scala b/core/src/main/scala/org/apache/spark/internal/config/History.scala index ca9af316dffd0..14fb5ff075472 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/History.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/History.scala @@ -84,6 +84,25 @@ private[spark] object History { .bytesConf(ByteUnit.BYTE) .createWithDefaultString("1m") + private[spark] val EVENT_LOG_ROLLING_MAX_FILES_TO_RETAIN = + ConfigBuilder("spark.history.fs.eventLog.rolling.maxFilesToRetain") + .doc("The maximum number of event log files which will be retained as non-compacted. " + + "By default, all event log files will be retained. Please set the configuration " + + s"and ${EVENT_LOG_ROLLING_MAX_FILE_SIZE.key} accordingly if you want to control " + + "the overall size of event log files.") + .intConf + .checkValue(_ > 0, "Max event log files to retain should be higher than 0.") + .createWithDefault(Integer.MAX_VALUE) + + private[spark] val EVENT_LOG_COMPACTION_SCORE_THRESHOLD = + ConfigBuilder("spark.history.fs.eventLog.rolling.compaction.score.threshold") + .doc("The threshold score to determine whether it's good to do the compaction or not. " + + "The compaction score is calculated in analyzing, and being compared to this value. 
" + + "Compaction will proceed only when the score is higher than the threshold value.") + .internal() + .doubleConf + .createWithDefault(0.7d) + val DRIVER_LOG_CLEANER_ENABLED = ConfigBuilder("spark.history.fs.driverlog.cleaner.enabled") .fallbackConf(CLEANER_ENABLED) diff --git a/core/src/main/scala/org/apache/spark/internal/config/Status.scala b/core/src/main/scala/org/apache/spark/internal/config/Status.scala index 3e6a4e9810664..3cc00a6f094cf 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Status.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Status.scala @@ -55,8 +55,8 @@ private[spark] object Status { .intConf .createWithDefault(Int.MaxValue) - val APP_STATUS_METRICS_ENABLED = - ConfigBuilder("spark.app.status.metrics.enabled") + val METRICS_APP_STATUS_SOURCE_ENABLED = + ConfigBuilder("spark.metrics.appStatusSource.enabled") .doc("Whether Dropwizard/Codahale metrics " + "will be reported for the status of the running spark app.") .booleanConf diff --git a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala index 21660ab3a9512..51df73ebde07d 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Tests.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Tests.scala @@ -53,4 +53,13 @@ private[spark] object Tests { val TEST_N_CORES_EXECUTOR = ConfigBuilder("spark.testing.nCoresPerExecutor") .intConf .createWithDefault(2) + + val RESOURCES_WARNING_TESTING = + ConfigBuilder("spark.resources.warnings.testing").booleanConf.createWithDefault(false) + + val RESOURCE_PROFILE_MANAGER_TESTING = + ConfigBuilder("spark.testing.resourceProfileManager") + .booleanConf + .createWithDefault(false) + } diff --git a/core/src/main/scala/org/apache/spark/internal/config/UI.scala b/core/src/main/scala/org/apache/spark/internal/config/UI.scala index a11970ec73d88..60d985713d30e 100644 --- 
a/core/src/main/scala/org/apache/spark/internal/config/UI.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/UI.scala @@ -81,6 +81,13 @@ private[spark] object UI { .booleanConf .createWithDefault(true) + val UI_PROMETHEUS_ENABLED = ConfigBuilder("spark.ui.prometheus.enabled") + .internal() + .doc("Expose executor metrics at /metrics/executors/prometheus. " + + "For master/worker/driver metrics, you need to configure `conf/metrics.properties`.") + .booleanConf + .createWithDefault(false) + val UI_X_XSS_PROTECTION = ConfigBuilder("spark.ui.xXssProtection") .doc("Value for HTTP X-XSS-Protection response header") .stringConf @@ -143,6 +150,11 @@ private[spark] object UI { .stringConf .createWithDefault("org.apache.spark.security.ShellBasedGroupsMappingProvider") + val PROXY_REDIRECT_URI = ConfigBuilder("spark.ui.proxyRedirectUri") + .doc("Proxy address to use when responding with HTTP redirects.") + .stringConf + .createOptional + val CUSTOM_EXECUTOR_LOG_URL = ConfigBuilder("spark.ui.custom.executor.log.url") .doc("Specifies custom spark executor log url for supporting external log service instead of " + "using cluster managers' application log urls in the Spark UI. 
Spark will support " + diff --git a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala index f1eaae29f18df..2b175c1e14ee5 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/Worker.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/Worker.scala @@ -71,4 +71,9 @@ private[spark] object Worker { ConfigBuilder("spark.worker.ui.compressedLogFileLengthCacheSize") .intConf .createWithDefault(100) + + private[spark] val WORKER_DECOMMISSION_ENABLED = + ConfigBuilder("spark.worker.decommission.enabled") + .booleanConf + .createWithDefault(false) } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index b898413ac8d76..02acb6b530737 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -35,6 +35,7 @@ package object config { private[spark] val SPARK_DRIVER_PREFIX = "spark.driver" private[spark] val SPARK_EXECUTOR_PREFIX = "spark.executor" private[spark] val SPARK_TASK_PREFIX = "spark.task" + private[spark] val LISTENER_BUS_EVENT_QUEUE_PREFIX = "spark.scheduler.listenerbus.eventqueue" private[spark] val SPARK_RESOURCES_COORDINATE = ConfigBuilder("spark.resources.coordinate.enable") @@ -53,6 +54,18 @@ package object config { .stringConf .createOptional + private[spark] val RESOURCES_DISCOVERY_PLUGIN = + ConfigBuilder("spark.resources.discovery.plugin") + .doc("Comma-separated list of class names implementing" + + "org.apache.spark.api.resource.ResourceDiscoveryPlugin to load into the application." + + "This is for advanced users to replace the resource discovery class with a " + + "custom implementation. Spark will try each class specified until one of them " + + "returns the resource information for that resource. 
It tries the discovery " + + "script last if none of the plugins return information for that resource.") + .stringConf + .toSequence + .createWithDefault(Nil) + private[spark] val DRIVER_RESOURCES_FILE = ConfigBuilder("spark.driver.resourcesFile") .internal() @@ -106,6 +119,11 @@ package object config { .booleanConf .createWithDefault(false) + private[spark] val DRIVER_LOG_ALLOW_EC = + ConfigBuilder("spark.driver.log.allowErasureCoding") + .booleanConf + .createWithDefault(false) + private[spark] val EVENT_LOG_ENABLED = ConfigBuilder("spark.eventLog.enabled") .booleanConf .createWithDefault(false) @@ -125,7 +143,7 @@ package object config { .createWithDefault(false) private[spark] val EVENT_LOG_ALLOW_EC = - ConfigBuilder("spark.eventLog.allowErasureCoding") + ConfigBuilder("spark.eventLog.erasureCoding.enabled") .booleanConf .createWithDefault(false) @@ -142,11 +160,8 @@ package object config { private[spark] val EVENT_LOG_STAGE_EXECUTOR_METRICS = ConfigBuilder("spark.eventLog.logStageExecutorMetrics.enabled") - .booleanConf - .createWithDefault(false) - - private[spark] val EVENT_LOG_PROCESS_TREE_METRICS = - ConfigBuilder("spark.eventLog.logStageExecutorProcessTreeMetrics.enabled") + .doc("Whether to write per-stage peaks of executor metrics (for each executor) " + + "to the event log.") .booleanConf .createWithDefault(false) @@ -174,6 +189,21 @@ package object config { private[spark] val EVENT_LOG_CALLSITE_LONG_FORM = ConfigBuilder("spark.eventLog.longForm.enabled").booleanConf.createWithDefault(false) + private[spark] val EVENT_LOG_ENABLE_ROLLING = + ConfigBuilder("spark.eventLog.rolling.enabled") + .doc("Whether rolling over event log files is enabled. 
If set to true, it cuts down " + + "each event log file to the configured size.") + .booleanConf + .createWithDefault(false) + + private[spark] val EVENT_LOG_ROLLING_MAX_FILE_SIZE = + ConfigBuilder("spark.eventLog.rolling.maxFileSize") + .doc("The max size of event log file to be rolled over.") + .bytesConf(ByteUnit.BYTE) + .checkValue(_ >= ByteUnit.MiB.toBytes(10), "Max file size of event log should be " + + "configured to be at least 10 MiB.") + .createWithDefaultString("128m") + private[spark] val EXECUTOR_ID = ConfigBuilder("spark.executor.id").stringConf.createOptional @@ -194,8 +224,18 @@ package object config { private[spark] val EXECUTOR_HEARTBEAT_MAX_FAILURES = ConfigBuilder("spark.executor.heartbeat.maxFailures").internal().intConf.createWithDefault(60) + private[spark] val EXECUTOR_PROCESS_TREE_METRICS_ENABLED = + ConfigBuilder("spark.executor.processTreeMetrics.enabled") + .doc("Whether to collect process tree metrics (from the /proc filesystem) when collecting " + + "executor metrics.") + .booleanConf + .createWithDefault(false) + private[spark] val EXECUTOR_METRICS_POLLING_INTERVAL = ConfigBuilder("spark.executor.metrics.pollingInterval") + .doc("How often to collect executor metrics (in milliseconds). " + + "If 0, the polling is done on executor heartbeats. " + + "If positive, the polling is done at this interval.") .timeConf(TimeUnit.MILLISECONDS) .createWithDefaultString("0") @@ -243,7 +283,8 @@ package object config { .createWithDefault(false) private[spark] val MEMORY_OFFHEAP_SIZE = ConfigBuilder("spark.memory.offHeap.size") - .doc("The absolute amount of memory in bytes which can be used for off-heap allocation. " + + .doc("The absolute amount of memory which can be used for off-heap allocation, " + + " in bytes unless otherwise specified. " + "This setting has no impact on heap memory usage, so if your executors' total memory " + "consumption must fit within some hard limit then be sure to shrink your JVM heap size " + "accordingly. 
This must be set to a positive value when spark.memory.offHeap.enabled=true.") @@ -575,6 +616,10 @@ package object config { private[spark] val LISTENER_BUS_EVENT_QUEUE_CAPACITY = ConfigBuilder("spark.scheduler.listenerbus.eventqueue.capacity") + .doc("The default capacity for event queues. Spark will try to initialize " + + "an event queue using capacity specified by `spark.scheduler.listenerbus" + + ".eventqueue.queueName.capacity` first. If it's not configured, Spark will " + + "use the default capacity specified by this config.") .intConf .checkValue(_ > 0, "The capacity of listener bus event queue must be positive") .createWithDefault(10000) @@ -585,6 +630,23 @@ package object config { .intConf .createWithDefault(128) + private[spark] val LISTENER_BUS_LOG_SLOW_EVENT_ENABLED = + ConfigBuilder("spark.scheduler.listenerbus.logSlowEvent.enabled") + .internal() + .doc("When enabled, log the event that takes too much time to process. This helps us " + + "discover the event types that cause performance bottlenecks. The time threshold is " + + "controlled by spark.scheduler.listenerbus.logSlowEvent.threshold.") + .booleanConf + .createWithDefault(true) + + private[spark] val LISTENER_BUS_LOG_SLOW_EVENT_TIME_THRESHOLD = + ConfigBuilder("spark.scheduler.listenerbus.logSlowEvent.threshold") + .internal() + .doc("The time threshold of whether a event is considered to be taking too much time to " + + "process. 
Log the event if spark.scheduler.listenerbus.logSlowEvent.enabled is true.") + .timeConf(TimeUnit.NANOSECONDS) + .createWithDefaultString("1s") + // This property sets the root namespace for metrics reporting private[spark] val METRICS_NAMESPACE = ConfigBuilder("spark.metrics.namespace") .stringConf @@ -594,6 +656,18 @@ package object config { .stringConf .createOptional + private[spark] val METRICS_EXECUTORMETRICS_SOURCE_ENABLED = + ConfigBuilder("spark.metrics.executorMetricsSource.enabled") + .doc("Whether to register the ExecutorMetrics source with the metrics system.") + .booleanConf + .createWithDefault(true) + + private[spark] val METRICS_STATIC_SOURCES_ENABLED = + ConfigBuilder("spark.metrics.staticSources.enabled") + .doc("Whether to register static sources with the metrics system.") + .booleanConf + .createWithDefault(true) + private[spark] val PYSPARK_DRIVER_PYTHON = ConfigBuilder("spark.pyspark.driver.python") .stringConf .createOptional @@ -777,6 +851,17 @@ package object config { .booleanConf .createWithDefault(false) + private[spark] val CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME = + ConfigBuilder("spark.rdd.checkpoint.cachePreferredLocsExpireTime") + .internal() + .doc("Expire time in minutes for caching preferred locations of checkpointed RDD." + + "Caching preferred locations can relieve query loading to DFS and save the query " + + "time. The drawback is that the cached locations can be possibly outdated and " + + "lose data locality. 
If this config is not specified, it will not cache.") + .timeConf(TimeUnit.MINUTES) + .checkValue(_ > 0, "The expire time for caching preferred locations cannot be non-positive.") + .createOptional + private[spark] val SHUFFLE_ACCURATE_BLOCK_THRESHOLD = ConfigBuilder("spark.shuffle.accurateBlockThreshold") .doc("Threshold in bytes above which the size of shuffle blocks in " + @@ -810,7 +895,7 @@ package object config { .createWithDefault(Int.MaxValue) private[spark] val MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM = - ConfigBuilder("spark.maxRemoteBlockSizeFetchToMem") + ConfigBuilder("spark.network.maxRemoteBlockSizeFetchToMem") .doc("Remote block will be fetched to disk when size of the block is above this threshold " + "in bytes. This is to avoid a giant request takes too much memory. Note this " + "configuration will affect both shuffle fetch and block manager remote block fetch. " + @@ -961,6 +1046,15 @@ package object config { .booleanConf .createWithDefault(true) + private[spark] val MAP_STATUS_COMPRESSION_CODEC = + ConfigBuilder("spark.shuffle.mapStatus.compression.codec") + .internal() + .doc("The codec used to compress MapStatus, which is generated by ShuffleMapTask. " + + "By default, Spark provides four codecs: lz4, lzf, snappy, and zstd. You can also " + + "use fully qualified class names to specify the codec.") + .stringConf + .createWithDefault("zstd") + private[spark] val SHUFFLE_SPILL_INITIAL_MEM_THRESHOLD = ConfigBuilder("spark.shuffle.spill.initialMemoryThreshold") .internal() @@ -1020,13 +1114,23 @@ package object config { .booleanConf .createWithDefault(false) + private[spark] val STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE = + ConfigBuilder("spark.storage.localDiskByExecutors.cacheSize") + .doc("The max number of executors for which the local dirs are stored. This size is " + + "both applied for the driver and both for the executors side to avoid having an " + + "unbounded store. 
This cache will be used to avoid the network in case of fetching disk " + + "persisted RDD blocks or shuffle blocks (when `spark.shuffle.readHostLocalDisk.enabled` " + + "is set) from the same host.") + .intConf + .createWithDefault(1000) + private[spark] val SHUFFLE_SYNC = ConfigBuilder("spark.shuffle.sync") .doc("Whether to force outstanding writes to disk.") .booleanConf .createWithDefault(false) - private[spark] val SHUFFLE_UNDAFE_FAST_MERGE_ENABLE = + private[spark] val SHUFFLE_UNSAFE_FAST_MERGE_ENABLE = ConfigBuilder("spark.shuffle.unsafe.fastMergeEnabled") .doc("Whether to perform a fast spill merge.") .booleanConf @@ -1047,6 +1151,22 @@ package object config { .checkValue(v => v > 0, "The value should be a positive integer.") .createWithDefault(2000) + private[spark] val SHUFFLE_USE_OLD_FETCH_PROTOCOL = + ConfigBuilder("spark.shuffle.useOldFetchProtocol") + .doc("Whether to use the old protocol while doing the shuffle block fetching. " + + "It is only enabled while we need the compatibility in the scenario of new Spark " + + "version job fetching shuffle blocks from old version external shuffle service.") + .booleanConf + .createWithDefault(false) + + private[spark] val SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED = + ConfigBuilder("spark.shuffle.readHostLocalDisk.enabled") + .doc(s"If enabled (and `${SHUFFLE_USE_OLD_FETCH_PROTOCOL.key}` is disabled), shuffle " + + "blocks requested from those block managers which are running on the same host are read " + + "from the disk directly instead of being fetched as remote blocks over the network.") + .booleanConf + .createWithDefault(true) + private[spark] val MEMORY_MAP_LIMIT_FOR_TESTS = ConfigBuilder("spark.storage.memoryMapLimitForTests") .internal() @@ -1119,12 +1239,13 @@ package object config { s"The value must be in allowed range [1,048,576, ${MAX_BUFFER_SIZE_BYTES}].") .createWithDefault(1024 * 1024) - private[spark] val EXECUTOR_PLUGINS = - ConfigBuilder("spark.executor.plugins") - .doc("Comma-separated list 
of class names for \"plugins\" implementing " + - "org.apache.spark.ExecutorPlugin. Plugins have the same privileges as any task " + - "in a Spark executor. They can also interfere with task execution and fail in " + - "unexpected ways. So be sure to only use this for trusted plugins.") + private[spark] val DEFAULT_PLUGINS_LIST = "spark.plugins.defaultList" + + private[spark] val PLUGINS = + ConfigBuilder("spark.plugins") + .withPrepended(DEFAULT_PLUGINS_LIST, separator = ",") + .doc("Comma-separated list of class names implementing " + + "org.apache.spark.api.plugin.SparkPlugin to load into the application.") .stringConf .toSequence .createWithDefault(Nil) @@ -1229,9 +1350,9 @@ package object config { private[spark] val IO_WARNING_LARGEFILETHRESHOLD = ConfigBuilder("spark.io.warning.largeFileThreshold") .internal() - .doc("When spark loading one single large file, if file size exceed this " + - "threshold, then log warning with possible reasons.") - .longConf + .doc("If the size in bytes of a file loaded by Spark exceeds this threshold, " + + "a warning is logged with the possible reasons.") + .bytesConf(ByteUnit.BYTE) .createWithDefault(1024 * 1024 * 1024) private[spark] val EVENT_LOG_COMPRESSION_CODEC = @@ -1397,6 +1518,19 @@ package object config { .doubleConf .createWithDefault(0.75) + private[spark] val SPECULATION_TASK_DURATION_THRESHOLD = + ConfigBuilder("spark.speculation.task.duration.threshold") + .doc("Task duration after which scheduler would try to speculative run the task. If " + + "provided, tasks would be speculatively run if current stage contains less tasks " + + "than or equal to the number of slots on a single executor and the task is taking " + + "longer time than the threshold. This config helps speculate stage with very few " + + "tasks. Regular speculation configs may also apply if the executor slots are " + + "large enough. E.g. 
tasks might be re-launched if there are enough successful runs " + + "even though the threshold hasn't been reached. The number of slots is computed based " + + "on the conf values of spark.executor.cores and spark.task.cpus minimum 1.") + .timeConf(TimeUnit.MILLISECONDS) + .createOptional + private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir") .doc("Staging directory used while submitting applications.") .stringConf diff --git a/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala new file mode 100644 index 0000000000000..4eda4767094ad --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContainer.scala @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal.plugin + +import scala.collection.JavaConverters._ +import scala.util.{Either, Left, Right} + +import org.apache.spark.{SparkContext, SparkEnv} +import org.apache.spark.api.plugin._ +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ +import org.apache.spark.resource.ResourceInformation +import org.apache.spark.util.Utils + +sealed abstract class PluginContainer { + + def shutdown(): Unit + def registerMetrics(appId: String): Unit + +} + +private class DriverPluginContainer( + sc: SparkContext, + resources: java.util.Map[String, ResourceInformation], + plugins: Seq[SparkPlugin]) + extends PluginContainer with Logging { + + private val driverPlugins: Seq[(String, DriverPlugin, PluginContextImpl)] = plugins.flatMap { p => + val driverPlugin = p.driverPlugin() + if (driverPlugin != null) { + val name = p.getClass().getName() + val ctx = new PluginContextImpl(name, sc.env.rpcEnv, sc.env.metricsSystem, sc.conf, + sc.env.executorId, resources) + + val extraConf = driverPlugin.init(sc, ctx) + if (extraConf != null) { + extraConf.asScala.foreach { case (k, v) => + sc.conf.set(s"${PluginContainer.EXTRA_CONF_PREFIX}$name.$k", v) + } + } + logInfo(s"Initialized driver component for plugin $name.") + Some((p.getClass().getName(), driverPlugin, ctx)) + } else { + None + } + } + + if (driverPlugins.nonEmpty) { + val pluginsByName = driverPlugins.map { case (name, plugin, _) => (name, plugin) }.toMap + sc.env.rpcEnv.setupEndpoint(classOf[PluginEndpoint].getName(), + new PluginEndpoint(pluginsByName, sc.env.rpcEnv)) + } + + override def registerMetrics(appId: String): Unit = { + driverPlugins.foreach { case (_, plugin, ctx) => + plugin.registerMetrics(appId, ctx) + ctx.registerMetrics() + } + } + + override def shutdown(): Unit = { + driverPlugins.foreach { case (name, plugin, _) => + try { + logDebug(s"Stopping plugin $name.") + plugin.shutdown() + } catch { + case t: Throwable => + logInfo(s"Exception 
while shutting down plugin $name.", t) + } + } + } + +} + +private class ExecutorPluginContainer( + env: SparkEnv, + resources: java.util.Map[String, ResourceInformation], + plugins: Seq[SparkPlugin]) + extends PluginContainer with Logging { + + private val executorPlugins: Seq[(String, ExecutorPlugin)] = { + val allExtraConf = env.conf.getAllWithPrefix(PluginContainer.EXTRA_CONF_PREFIX) + + plugins.flatMap { p => + val executorPlugin = p.executorPlugin() + if (executorPlugin != null) { + val name = p.getClass().getName() + val prefix = name + "." + val extraConf = allExtraConf + .filter { case (k, v) => k.startsWith(prefix) } + .map { case (k, v) => k.substring(prefix.length()) -> v } + .toMap + .asJava + val ctx = new PluginContextImpl(name, env.rpcEnv, env.metricsSystem, env.conf, + env.executorId, resources) + executorPlugin.init(ctx, extraConf) + ctx.registerMetrics() + + logInfo(s"Initialized executor component for plugin $name.") + Some(p.getClass().getName() -> executorPlugin) + } else { + None + } + } + } + + override def registerMetrics(appId: String): Unit = { + throw new IllegalStateException("Should not be called for the executor container.") + } + + override def shutdown(): Unit = { + executorPlugins.foreach { case (name, plugin) => + try { + logDebug(s"Stopping plugin $name.") + plugin.shutdown() + } catch { + case t: Throwable => + logInfo(s"Exception while shutting down plugin $name.", t) + } + } + } +} + +object PluginContainer { + + val EXTRA_CONF_PREFIX = "spark.plugins.internal.conf." 
+ + def apply( + sc: SparkContext, + resources: java.util.Map[String, ResourceInformation]): Option[PluginContainer] = { + PluginContainer(Left(sc), resources) + } + + def apply( + env: SparkEnv, + resources: java.util.Map[String, ResourceInformation]): Option[PluginContainer] = { + PluginContainer(Right(env), resources) + } + + + private def apply( + ctx: Either[SparkContext, SparkEnv], + resources: java.util.Map[String, ResourceInformation]): Option[PluginContainer] = { + val conf = ctx.fold(_.conf, _.conf) + val plugins = Utils.loadExtensions(classOf[SparkPlugin], conf.get(PLUGINS).distinct, conf) + if (plugins.nonEmpty) { + ctx match { + case Left(sc) => Some(new DriverPluginContainer(sc, resources, plugins)) + case Right(env) => Some(new ExecutorPluginContainer(env, resources, plugins)) + } + } else { + None + } + } +} diff --git a/core/src/main/scala/org/apache/spark/internal/plugin/PluginContextImpl.scala b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContextImpl.scala new file mode 100644 index 0000000000000..ca9119409d4b9 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/plugin/PluginContextImpl.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal.plugin + +import java.util + +import com.codahale.metrics.MetricRegistry + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.api.plugin.PluginContext +import org.apache.spark.internal.Logging +import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.metrics.source.Source +import org.apache.spark.resource.ResourceInformation +import org.apache.spark.rpc.RpcEnv +import org.apache.spark.util.RpcUtils + +private class PluginContextImpl( + pluginName: String, + rpcEnv: RpcEnv, + metricsSystem: MetricsSystem, + override val conf: SparkConf, + override val executorID: String, + override val resources: util.Map[String, ResourceInformation]) + extends PluginContext with Logging { + + override def hostname(): String = rpcEnv.address.hostPort.split(":")(0) + + private val registry = new MetricRegistry() + + private lazy val driverEndpoint = try { + RpcUtils.makeDriverRef(classOf[PluginEndpoint].getName(), conf, rpcEnv) + } catch { + case e: Exception => + logWarning(s"Failed to create driver plugin endpoint ref.", e) + null + } + + override def metricRegistry(): MetricRegistry = registry + + override def send(message: AnyRef): Unit = { + if (driverEndpoint == null) { + throw new IllegalStateException("Driver endpoint is not known.") + } + driverEndpoint.send(PluginMessage(pluginName, message)) + } + + override def ask(message: AnyRef): AnyRef = { + try { + if (driverEndpoint != null) { + driverEndpoint.askSync[AnyRef](PluginMessage(pluginName, message)) + } else { + throw new IllegalStateException("Driver endpoint is not known.") + } + } catch { + case e: SparkException if e.getCause() != null => + throw e.getCause() + } + } + + def registerMetrics(): Unit = { + if (!registry.getMetrics().isEmpty()) { + val src = new PluginMetricsSource(s"plugin.$pluginName", registry) + metricsSystem.registerSource(src) + } + } + + class PluginMetricsSource( + override val sourceName: String, + override val 
metricRegistry: MetricRegistry) + extends Source + +} diff --git a/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala b/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala new file mode 100644 index 0000000000000..9a59b6bf678f9 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/internal/plugin/PluginEndpoint.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.internal.plugin + +import org.apache.spark.api.plugin.DriverPlugin +import org.apache.spark.internal.Logging +import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcCallContext, RpcEnv} + +case class PluginMessage(pluginName: String, message: AnyRef) + +private class PluginEndpoint( + plugins: Map[String, DriverPlugin], + override val rpcEnv: RpcEnv) + extends IsolatedRpcEndpoint with Logging { + + override def receive: PartialFunction[Any, Unit] = { + case PluginMessage(pluginName, message) => + plugins.get(pluginName) match { + case Some(plugin) => + try { + val reply = plugin.receive(message) + if (reply != null) { + logInfo( + s"Plugin $pluginName returned reply for one-way message of type " + + s"${message.getClass().getName()}.") + } + } catch { + case e: Exception => + logWarning(s"Error in plugin $pluginName when handling message of type " + + s"${message.getClass().getName()}.", e) + } + + case None => + throw new IllegalArgumentException(s"Received message for unknown plugin $pluginName.") + } + } + + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case PluginMessage(pluginName, message) => + plugins.get(pluginName) match { + case Some(plugin) => + context.reply(plugin.receive(message)) + + case None => + throw new IllegalArgumentException(s"Received message for unknown plugin $pluginName.") + } + } + +} diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala index adbd59c9f03b4..5205a2d568ac3 100644 --- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala +++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala @@ -44,6 +44,10 @@ trait CompressionCodec { def compressedOutputStream(s: OutputStream): OutputStream + private[spark] def compressedContinuousOutputStream(s: OutputStream): OutputStream = { + compressedOutputStream(s) + } + def compressedInputStream(s: InputStream): 
InputStream private[spark] def compressedContinuousInputStream(s: InputStream): InputStream = { @@ -220,6 +224,12 @@ class ZStdCompressionCodec(conf: SparkConf) extends CompressionCodec { new BufferedOutputStream(new ZstdOutputStream(s, level), bufferSize) } + override private[spark] def compressedContinuousOutputStream(s: OutputStream) = { + // SPARK-29322: Set "closeFrameOnFlush" to 'true' to let continuous input stream not being + // stuck on reading open frame. + new BufferedOutputStream(new ZstdOutputStream(s, level).setCloseFrameOnFlush(true), bufferSize) + } + override def compressedInputStream(s: InputStream): InputStream = { // Wrap the zstd input stream in a buffered input stream so that we can // avoid overhead excessive of JNI call while trying to uncompress small amount of data. diff --git a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala index 50055dcd2954a..4cffbb2a5701c 100644 --- a/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala +++ b/core/src/main/scala/org/apache/spark/memory/ExecutionMemoryPool.scala @@ -91,7 +91,7 @@ private[memory] class ExecutionMemoryPool( private[memory] def acquireMemory( numBytes: Long, taskAttemptId: Long, - maybeGrowPool: Long => Unit = (additionalSpaceNeeded: Long) => Unit, + maybeGrowPool: Long => Unit = (additionalSpaceNeeded: Long) => (), computeMaxPoolSize: () => Long = () => poolSize): Long = lock.synchronized { assert(numBytes > 0, s"invalid number of bytes requested: $numBytes") diff --git a/core/src/main/scala/org/apache/spark/memory/package.scala b/core/src/main/scala/org/apache/spark/memory/package.scala index 7f782193f246f..5909cb20b8e9c 100644 --- a/core/src/main/scala/org/apache/spark/memory/package.scala +++ b/core/src/main/scala/org/apache/spark/memory/package.scala @@ -41,23 +41,26 @@ package org.apache.spark * Diagrammatically: * * {{{ - * +-------------+ - * | MemConsumer |----+ 
+------------------------+ - * +-------------+ | +-------------------+ | MemoryManager | - * +--->| TaskMemoryManager |----+ | | - * +-------------+ | +-------------------+ | | +------------------+ | - * | MemConsumer |----+ | | | StorageMemPool | | - * +-------------+ +-------------------+ | | +------------------+ | - * | TaskMemoryManager |----+ | | - * +-------------------+ | | +------------------+ | - * +---->| |OnHeapExecMemPool | | - * * | | +------------------+ | - * * | | | - * +-------------+ * | | +------------------+ | - * | MemConsumer |----+ | | |OffHeapExecMemPool| | - * +-------------+ | +-------------------+ | | +------------------+ | - * +--->| TaskMemoryManager |----+ | | - * +-------------------+ +------------------------+ + * +---------------------------+ + * +-------------+ | MemoryManager | + * | MemConsumer |----+ | | + * +-------------+ | +-------------------+ | +---------------------+ | + * +--->| TaskMemoryManager |----+ | |OnHeapStorageMemPool | | + * +-------------+ | +-------------------+ | | +---------------------+ | + * | MemConsumer |----+ | | | + * +-------------+ +-------------------+ | | +---------------------+ | + * | TaskMemoryManager |----+ | |OffHeapStorageMemPool| | + * +-------------------+ | | +---------------------+ | + * +---->| | + * * | | +---------------------+ | + * * | | |OnHeapExecMemPool | | + * +-------------+ * | | +---------------------+ | + * | MemConsumer |----+ | | | + * +-------------+ | +-------------------+ | | +---------------------+ | + * +--->| TaskMemoryManager |----+ | |OffHeapExecMemPool | | + * +-------------------+ | +---------------------+ | + * | | + * +---------------------------+ * }}} * * diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala index b6be8aaefd351..d98d5e3b81aa0 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala +++ 
b/core/src/main/scala/org/apache/spark/metrics/MetricsConfig.scala @@ -38,7 +38,7 @@ private[spark] class MetricsConfig(conf: SparkConf) extends Logging { private[metrics] val properties = new Properties() private[metrics] var perInstanceSubProperties: mutable.HashMap[String, Properties] = null - private def setDefaultProperties(prop: Properties) { + private def setDefaultProperties(prop: Properties): Unit = { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") prop.setProperty("*.sink.servlet.path", "/metrics/json") prop.setProperty("master.sink.servlet.path", "/metrics/master/json") @@ -49,7 +49,7 @@ private[spark] class MetricsConfig(conf: SparkConf) extends Logging { * Load properties from various places, based on precedence * If the same property is set again latter on in the method, it overwrites the previous value */ - def initialize() { + def initialize(): Unit = { // Add default properties in case there's no properties file setDefaultProperties(properties) diff --git a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala index c96640a6fab3f..57dcbe501c6dd 100644 --- a/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala +++ b/core/src/main/scala/org/apache/spark/metrics/MetricsSystem.scala @@ -28,7 +28,7 @@ import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ -import org.apache.spark.metrics.sink.{MetricsServlet, Sink} +import org.apache.spark.metrics.sink.{MetricsServlet, PrometheusServlet, Sink} import org.apache.spark.metrics.source.{Source, StaticSources} import org.apache.spark.util.Utils @@ -83,18 +83,20 @@ private[spark] class MetricsSystem private ( // Treat MetricsServlet as a special sink as it should be exposed to add handlers to web ui private var metricsServlet: 
Option[MetricsServlet] = None + private var prometheusServlet: Option[PrometheusServlet] = None /** * Get any UI handlers used by this metrics system; can only be called after start(). */ def getServletHandlers: Array[ServletContextHandler] = { require(running, "Can only call getServletHandlers on a running MetricsSystem") - metricsServlet.map(_.getHandlers(conf)).getOrElse(Array()) + metricsServlet.map(_.getHandlers(conf)).getOrElse(Array()) ++ + prometheusServlet.map(_.getHandlers(conf)).getOrElse(Array()) } metricsConfig.initialize() - def start(registerStaticSources: Boolean = true) { + def start(registerStaticSources: Boolean = true): Unit = { require(!running, "Attempting to start a MetricsSystem that is already running") running = true if (registerStaticSources) { @@ -105,16 +107,17 @@ private[spark] class MetricsSystem private ( sinks.foreach(_.start) } - def stop() { + def stop(): Unit = { if (running) { sinks.foreach(_.stop) + registry.removeMatching((_: String, _: Metric) => true) } else { logWarning("Stopping a MetricsSystem that is not running") } running = false } - def report() { + def report(): Unit = { sinks.foreach(_.report()) } @@ -124,7 +127,7 @@ private[spark] class MetricsSystem private ( * If either ID is not available, this defaults to just using . * * @param source Metric source to be named by this method. - * @return An unique metric name for each combination of + * @return A unique metric name for each combination of * application, executor/driver and metric source. 
*/ private[spark] def buildRegistryName(source: Source): String = { @@ -155,7 +158,7 @@ private[spark] class MetricsSystem private ( def getSourcesByName(sourceName: String): Seq[Source] = sources.filter(_.sourceName == sourceName) - def registerSource(source: Source) { + def registerSource(source: Source): Unit = { sources += source try { val regName = buildRegistryName(source) @@ -165,13 +168,13 @@ private[spark] class MetricsSystem private ( } } - def removeSource(source: Source) { + def removeSource(source: Source): Unit = { sources -= source val regName = buildRegistryName(source) registry.removeMatching((name: String, _: Metric) => name.startsWith(regName)) } - private def registerSources() { + private def registerSources(): Unit = { val instConfig = metricsConfig.getInstance(instance) val sourceConfigs = metricsConfig.subProperties(instConfig, MetricsSystem.SOURCE_REGEX) @@ -187,7 +190,7 @@ private[spark] class MetricsSystem private ( } } - private def registerSinks() { + private def registerSinks(): Unit = { val instConfig = metricsConfig.getInstance(instance) val sinkConfigs = metricsConfig.subProperties(instConfig, MetricsSystem.SINK_REGEX) @@ -201,6 +204,12 @@ private[spark] class MetricsSystem private ( classOf[Properties], classOf[MetricRegistry], classOf[SecurityManager]) .newInstance(kv._2, registry, securityMgr) metricsServlet = Some(servlet) + } else if (kv._1 == "prometheusServlet") { + val servlet = Utils.classForName[PrometheusServlet](classPath) + .getConstructor( + classOf[Properties], classOf[MetricRegistry], classOf[SecurityManager]) + .newInstance(kv._2, registry, securityMgr) + prometheusServlet = Some(servlet) } else { val sink = Utils.classForName[Sink](classPath) .getConstructor( @@ -225,7 +234,7 @@ private[spark] object MetricsSystem { private[this] val MINIMAL_POLL_UNIT = TimeUnit.SECONDS private[this] val MINIMAL_POLL_PERIOD = 1 - def checkMinimalPollingPeriod(pollUnit: TimeUnit, pollPeriod: Int) { + def 
checkMinimalPollingPeriod(pollUnit: TimeUnit, pollPeriod: Int): Unit = { val period = MINIMAL_POLL_UNIT.convert(pollPeriod, pollUnit) if (period < MINIMAL_POLL_PERIOD) { throw new IllegalArgumentException("Polling period " + pollPeriod + " " + pollUnit + diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/ConsoleSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/ConsoleSink.scala index fce556fd0382c..bfd23168e4003 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/ConsoleSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/ConsoleSink.scala @@ -50,15 +50,15 @@ private[spark] class ConsoleSink(val property: Properties, val registry: MetricR .convertRatesTo(TimeUnit.SECONDS) .build() - override def start() { + override def start(): Unit = { reporter.start(pollPeriod, pollUnit) } - override def stop() { + override def stop(): Unit = { reporter.stop() } - override def report() { + override def report(): Unit = { reporter.report() } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/CsvSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/CsvSink.scala index 88bba2fdbd1c6..579b8e0c0e984 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/CsvSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/CsvSink.scala @@ -59,15 +59,15 @@ private[spark] class CsvSink(val property: Properties, val registry: MetricRegis .convertRatesTo(TimeUnit.SECONDS) .build(new File(pollDir)) - override def start() { + override def start(): Unit = { reporter.start(pollPeriod, pollUnit) } - override def stop() { + override def stop(): Unit = { reporter.stop() } - override def report() { + override def report(): Unit = { reporter.report() } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala index 05d553ed30ff0..6ce64cd3543fe 100644 --- 
a/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/GraphiteSink.scala @@ -89,15 +89,15 @@ private[spark] class GraphiteSink(val property: Properties, val registry: Metric .filter(filter) .build(graphite) - override def start() { + override def start(): Unit = { reporter.start(pollPeriod, pollUnit) } - override def stop() { + override def stop(): Unit = { reporter.stop() } - override def report() { + override def report(): Unit = { reporter.report() } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/JmxSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/JmxSink.scala index 1992b42ac7f6b..a7b7b5573cfe8 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/JmxSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/JmxSink.scala @@ -19,7 +19,8 @@ package org.apache.spark.metrics.sink import java.util.Properties -import com.codahale.metrics.{JmxReporter, MetricRegistry} +import com.codahale.metrics.MetricRegistry +import com.codahale.metrics.jmx.JmxReporter import org.apache.spark.SecurityManager @@ -28,14 +29,14 @@ private[spark] class JmxSink(val property: Properties, val registry: MetricRegis val reporter: JmxReporter = JmxReporter.forRegistry(registry).build() - override def start() { + override def start(): Unit = { reporter.start() } - override def stop() { + override def stop(): Unit = { reporter.stop() } - override def report() { } + override def report(): Unit = { } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala index bea24ca7807e4..7dd27d4fb9bf3 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/MetricsServlet.scala @@ -59,9 +59,9 @@ private[spark] class MetricsServlet( mapper.writeValueAsString(registry) } - override def start() { } + 
override def start(): Unit = { } - override def stop() { } + override def stop(): Unit = { } - override def report() { } + override def report(): Unit = { } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala new file mode 100644 index 0000000000000..7c33bce78378d --- /dev/null +++ b/core/src/main/scala/org/apache/spark/metrics/sink/PrometheusServlet.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.metrics.sink + +import java.util.Properties +import javax.servlet.http.HttpServletRequest + +import com.codahale.metrics.MetricRegistry +import org.eclipse.jetty.servlet.ServletContextHandler + +import org.apache.spark.{SecurityManager, SparkConf} +import org.apache.spark.ui.JettyUtils._ + +/** + * This exposes the metrics of the given registry with Prometheus format. + * + * The output is consistent with /metrics/json result in terms of item ordering + * and with the previous result of Spark JMX Sink + Prometheus JMX Converter combination + * in terms of key string format. 
+ */ +private[spark] class PrometheusServlet( + val property: Properties, + val registry: MetricRegistry, + securityMgr: SecurityManager) + extends Sink { + + val SERVLET_KEY_PATH = "path" + + val servletPath = property.getProperty(SERVLET_KEY_PATH) + + def getHandlers(conf: SparkConf): Array[ServletContextHandler] = { + Array[ServletContextHandler]( + createServletHandler(servletPath, + new ServletParams(request => getMetricsSnapshot(request), "text/plain"), conf) + ) + } + + def getMetricsSnapshot(request: HttpServletRequest): String = { + import scala.collection.JavaConverters._ + + val sb = new StringBuilder() + registry.getGauges.asScala.foreach { case (k, v) => + if (!v.getValue.isInstanceOf[String]) { + sb.append(s"${normalizeKey(k)}Value ${v.getValue}\n") + } + } + registry.getCounters.asScala.foreach { case (k, v) => + sb.append(s"${normalizeKey(k)}Count ${v.getCount}\n") + } + registry.getHistograms.asScala.foreach { case (k, h) => + val snapshot = h.getSnapshot + val prefix = normalizeKey(k) + sb.append(s"${prefix}Count ${h.getCount}\n") + sb.append(s"${prefix}Max ${snapshot.getMax}\n") + sb.append(s"${prefix}Mean ${snapshot.getMean}\n") + sb.append(s"${prefix}Min ${snapshot.getMin}\n") + sb.append(s"${prefix}50thPercentile ${snapshot.getMedian}\n") + sb.append(s"${prefix}75thPercentile ${snapshot.get75thPercentile}\n") + sb.append(s"${prefix}95thPercentile ${snapshot.get95thPercentile}\n") + sb.append(s"${prefix}98thPercentile ${snapshot.get98thPercentile}\n") + sb.append(s"${prefix}99thPercentile ${snapshot.get99thPercentile}\n") + sb.append(s"${prefix}999thPercentile ${snapshot.get999thPercentile}\n") + sb.append(s"${prefix}StdDev ${snapshot.getStdDev}\n") + } + registry.getMeters.entrySet.iterator.asScala.foreach { kv => + val prefix = normalizeKey(kv.getKey) + val meter = kv.getValue + sb.append(s"${prefix}Count ${meter.getCount}\n") + sb.append(s"${prefix}MeanRate ${meter.getMeanRate}\n") + sb.append(s"${prefix}OneMinuteRate 
${meter.getOneMinuteRate}\n") + sb.append(s"${prefix}FiveMinuteRate ${meter.getFiveMinuteRate}\n") + sb.append(s"${prefix}FifteenMinuteRate ${meter.getFifteenMinuteRate}\n") + } + registry.getTimers.entrySet.iterator.asScala.foreach { kv => + val prefix = normalizeKey(kv.getKey) + val timer = kv.getValue + val snapshot = timer.getSnapshot + sb.append(s"${prefix}Count ${timer.getCount}\n") + sb.append(s"${prefix}Max ${snapshot.getMax}\n") + sb.append(s"${prefix}Mean ${snapshot.getMean}\n") + sb.append(s"${prefix}Min ${snapshot.getMin}\n") + sb.append(s"${prefix}50thPercentile ${snapshot.getMedian}\n") + sb.append(s"${prefix}75thPercentile ${snapshot.get75thPercentile}\n") + sb.append(s"${prefix}95thPercentile ${snapshot.get95thPercentile}\n") + sb.append(s"${prefix}98thPercentile ${snapshot.get98thPercentile}\n") + sb.append(s"${prefix}99thPercentile ${snapshot.get99thPercentile}\n") + sb.append(s"${prefix}999thPercentile ${snapshot.get999thPercentile}\n") + sb.append(s"${prefix}StdDev ${snapshot.getStdDev}\n") + sb.append(s"${prefix}FifteenMinuteRate ${timer.getFifteenMinuteRate}\n") + sb.append(s"${prefix}FiveMinuteRate ${timer.getFiveMinuteRate}\n") + sb.append(s"${prefix}OneMinuteRate ${timer.getOneMinuteRate}\n") + sb.append(s"${prefix}MeanRate ${timer.getMeanRate}\n") + } + sb.toString() + } + + private def normalizeKey(key: String): String = { + s"metrics_${key.replaceAll("[^a-zA-Z0-9]", "_")}_" + } + + override def start(): Unit = { } + + override def stop(): Unit = { } + + override def report(): Unit = { } +} diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala index 7fa4ba7622980..968d5ca809e72 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/Slf4jSink.scala @@ -53,15 +53,15 @@ private[spark] class Slf4jSink( .convertRatesTo(TimeUnit.SECONDS) .build() - override def start() { + 
override def start(): Unit = { reporter.start(pollPeriod, pollUnit) } - override def stop() { + override def stop(): Unit = { reporter.stop() } - override def report() { + override def report(): Unit = { reporter.report() } } diff --git a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala index 4993519aa3843..0bd5774b632bf 100644 --- a/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala +++ b/core/src/main/scala/org/apache/spark/network/BlockDataManager.scala @@ -22,16 +22,22 @@ import scala.reflect.ClassTag import org.apache.spark.TaskContext import org.apache.spark.network.buffer.ManagedBuffer import org.apache.spark.network.client.StreamCallbackWithID -import org.apache.spark.storage.{BlockId, StorageLevel} +import org.apache.spark.storage.{BlockId, ShuffleBlockId, StorageLevel} private[spark] trait BlockDataManager { + /** + * Interface to get host-local shuffle block data. Throws an exception if the block cannot be + * found or cannot be read successfully. + */ + def getHostLocalShuffleData(blockId: BlockId, dirs: Array[String]): ManagedBuffer + /** * Interface to get local block data. Throws an exception if the block cannot be found or * cannot be read successfully. */ - def getBlockData(blockId: BlockId): ManagedBuffer + def getLocalBlockData(blockId: BlockId): ManagedBuffer /** * Put the block locally, using the given storage level. @@ -57,7 +63,7 @@ trait BlockDataManager { classTag: ClassTag[_]): StreamCallbackWithID /** - * Release locks acquired by [[putBlockData()]] and [[getBlockData()]]. + * Release locks acquired by [[putBlockData()]] and [[getLocalBlockData()]]. 
*/ def releaseLock(blockId: BlockId, taskContext: Option[TaskContext]): Unit } diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index b2ab31488e4c1..b3904f3362e8e 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -29,7 +29,7 @@ import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithI import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol._ import org.apache.spark.serializer.Serializer -import org.apache.spark.storage.{BlockId, ShuffleBlockId, StorageLevel} +import org.apache.spark.storage.{BlockId, ShuffleBlockBatchId, ShuffleBlockId, StorageLevel} /** * Serves requests to open blocks by simply registering one chunk per block requested. @@ -56,8 +56,12 @@ class NettyBlockRpcServer( message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length - val blocks = for (i <- (0 until blocksNum).view) - yield blockManager.getBlockData(BlockId.apply(openBlocks.blockIds(i))) + val blocks = (0 until blocksNum).map { i => + val blockId = BlockId.apply(openBlocks.blockIds(i)) + assert(!blockId.isInstanceOf[ShuffleBlockBatchId], + "Continuous shuffle block fetching only works for new fetch protocol.") + blockManager.getLocalBlockData(blockId) + } val streamId = streamManager.registerStream(appId, blocks.iterator.asJava, client.getChannel) logTrace(s"Registered streamId $streamId with $blocksNum buffers") @@ -65,12 +69,29 @@ class NettyBlockRpcServer( case fetchShuffleBlocks: FetchShuffleBlocks => val blocks = fetchShuffleBlocks.mapIds.zipWithIndex.flatMap { case (mapId, index) => - fetchShuffleBlocks.reduceIds.apply(index).map { reduceId => - blockManager.getBlockData( - 
ShuffleBlockId(fetchShuffleBlocks.shuffleId, mapId, reduceId)) + if (!fetchShuffleBlocks.batchFetchEnabled) { + fetchShuffleBlocks.reduceIds(index).map { reduceId => + blockManager.getLocalBlockData( + ShuffleBlockId(fetchShuffleBlocks.shuffleId, mapId, reduceId)) + } + } else { + val startAndEndId = fetchShuffleBlocks.reduceIds(index) + if (startAndEndId.length != 2) { + throw new IllegalStateException(s"Invalid shuffle fetch request when batch mode " + + s"is enabled: $fetchShuffleBlocks") + } + Array(blockManager.getLocalBlockData( + ShuffleBlockBatchId( + fetchShuffleBlocks.shuffleId, mapId, startAndEndId(0), startAndEndId(1)))) } } - val numBlockIds = fetchShuffleBlocks.reduceIds.map(_.length).sum + + val numBlockIds = if (fetchShuffleBlocks.batchFetchEnabled) { + fetchShuffleBlocks.mapIds.length + } else { + fetchShuffleBlocks.reduceIds.map(_.length).sum + } + val streamId = streamManager.registerStream(appId, blocks.iterator.asJava, client.getChannel) logTrace(s"Registered streamId $streamId with $numBlockIds buffers") diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index 1d27fe7db193f..ffb696029a033 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -116,7 +116,8 @@ private[spark] class NettyBlockTransferService( logTrace(s"Fetch blocks from $host:$port (executor id $execId)") try { val blockFetchStarter = new RetryingBlockFetcher.BlockFetchStarter { - override def createAndStart(blockIds: Array[String], listener: BlockFetchingListener) { + override def createAndStart(blockIds: Array[String], + listener: BlockFetchingListener): Unit = { try { val client = clientFactory.createClient(host, port) new OneForOneBlockFetcher(client, appId, execId, blockIds, listener, diff --git 
a/core/src/main/scala/org/apache/spark/partial/ApproximateActionListener.scala b/core/src/main/scala/org/apache/spark/partial/ApproximateActionListener.scala index b089bbd7e972e..34c04f4025a96 100644 --- a/core/src/main/scala/org/apache/spark/partial/ApproximateActionListener.scala +++ b/core/src/main/scala/org/apache/spark/partial/ApproximateActionListener.scala @@ -43,7 +43,7 @@ private[spark] class ApproximateActionListener[T, U, R]( var failure: Option[Exception] = None // Set if the job has failed (permanently) var resultObject: Option[PartialResult[R]] = None // Set if we've already returned a PartialResult - override def taskSucceeded(index: Int, result: Any) { + override def taskSucceeded(index: Int, result: Any): Unit = { synchronized { evaluator.merge(index, result.asInstanceOf[U]) finishedTasks += 1 @@ -56,7 +56,7 @@ private[spark] class ApproximateActionListener[T, U, R]( } } - override def jobFailed(exception: Exception) { + override def jobFailed(exception: Exception): Unit = { synchronized { failure = Some(exception) this.notifyAll() diff --git a/core/src/main/scala/org/apache/spark/partial/PartialResult.scala b/core/src/main/scala/org/apache/spark/partial/PartialResult.scala index 25cb7490aa9c9..012d4769617f6 100644 --- a/core/src/main/scala/org/apache/spark/partial/PartialResult.scala +++ b/core/src/main/scala/org/apache/spark/partial/PartialResult.scala @@ -61,7 +61,7 @@ class PartialResult[R](initialVal: R, isFinal: Boolean) { * Set a handler to be called if this PartialResult's job fails. Only one failure handler * is supported per PartialResult. 
*/ - def onFail(handler: Exception => Unit) { + def onFail(handler: Exception => Unit): Unit = { synchronized { if (failureHandler.isDefined) { throw new UnsupportedOperationException("onFail cannot be called twice") @@ -85,7 +85,7 @@ class PartialResult[R](initialVal: R, isFinal: Boolean) { override def onComplete(handler: T => Unit): PartialResult[T] = synchronized { PartialResult.this.onComplete(handler.compose(f)).map(f) } - override def onFail(handler: Exception => Unit) { + override def onFail(handler: Exception => Unit): Unit = { synchronized { PartialResult.this.onFail(handler) } @@ -100,7 +100,7 @@ class PartialResult[R](initialVal: R, isFinal: Boolean) { } } - private[spark] def setFinalValue(value: R) { + private[spark] def setFinalValue(value: R): Unit = { synchronized { if (finalValue.isDefined) { throw new UnsupportedOperationException("setFinalValue called twice on a PartialResult") @@ -115,7 +115,7 @@ class PartialResult[R](initialVal: R, isFinal: Boolean) { private def getFinalValueInternal() = finalValue - private[spark] def setFailure(exception: Exception) { + private[spark] def setFailure(exception: Exception): Unit = { synchronized { if (failure.isDefined) { throw new UnsupportedOperationException("setFailure called twice on a PartialResult") diff --git a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala index ba9dae4ad48ec..d6379156ccf72 100644 --- a/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/AsyncRDDActions.scala @@ -109,7 +109,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi (it: Iterator[T]) => it.take(left).toArray, p, (index: Int, data: Array[T]) => buf(index) = data, - Unit) + ()) job.flatMap { _ => buf.foreach(results ++= _.take(num - results.size)) continue(partsScanned + p.size) @@ -125,7 +125,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends 
Serializable with Loggi def foreachAsync(f: T => Unit): FutureAction[Unit] = self.withScope { val cleanF = self.context.clean(f) self.context.submitJob[T, Unit, Unit](self, _.foreach(cleanF), Range(0, self.partitions.length), - (index, data) => Unit, Unit) + (index, data) => (), ()) } /** @@ -133,7 +133,7 @@ class AsyncRDDActions[T: ClassTag](self: RDD[T]) extends Serializable with Loggi */ def foreachPartitionAsync(f: Iterator[T] => Unit): FutureAction[Unit] = self.withScope { self.context.submitJob[T, Unit, Unit](self, f, Range(0, self.partitions.length), - (index, data) => Unit, Unit) + (index, data) => (), ()) } } diff --git a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala index 23cf19d55b4ae..a5c3e2a2dfe2a 100644 --- a/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/BlockRDD.scala @@ -61,7 +61,7 @@ class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[Blo * irreversible operation, as the data in the blocks cannot be recovered back * once removed. Use it with caution. */ - private[spark] def removeBlocks() { + private[spark] def removeBlocks(): Unit = { blockIds.foreach { blockId => sparkContext.env.blockManager.master.removeBlock(blockId) } @@ -77,7 +77,7 @@ class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[Blo } /** Check if this BlockRDD is valid. If not valid, exception is thrown. 
*/ - private[spark] def assertValid() { + private[spark] def assertValid(): Unit = { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) diff --git a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala index 57108dcedcf0c..fddd35b657479 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CartesianRDD.scala @@ -85,7 +85,7 @@ class CartesianRDD[T: ClassTag, U: ClassTag]( } ) - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdd1 = null rdd2 = null diff --git a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala index 909f58512153b..500d306f336ac 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoGroupedRDD.scala @@ -187,7 +187,7 @@ class CoGroupedRDD[K: ClassTag]( createCombiner, mergeValue, mergeCombiners) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdds = null } diff --git a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala index 55c141c2b8a0a..58a0c0c400e09 100644 --- a/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/CoalescedRDD.scala @@ -107,7 +107,7 @@ private[spark] class CoalescedRDD[T: ClassTag]( }) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() prev = null } @@ -239,7 +239,7 @@ private class DefaultPartitionCoalescer(val balanceSlack: Double = 0.10) * locations (2 * n log(n)) * @param targetLen The number of desired partition groups */ - def setupGroups(targetLen: Int, partitionLocs: PartitionLocations) { + 
def setupGroups(targetLen: Int, partitionLocs: PartitionLocations): Unit = { // deal with empty case, just create targetLen partition groups with no preferred location if (partitionLocs.partsWithLocs.isEmpty) { (1 to targetLen).foreach(_ => groupArr += new PartitionGroup()) @@ -328,7 +328,7 @@ private class DefaultPartitionCoalescer(val balanceSlack: Double = 0.10) def throwBalls( maxPartitions: Int, prev: RDD[_], - balanceSlack: Double, partitionLocs: PartitionLocations) { + balanceSlack: Double, partitionLocs: PartitionLocations): Unit = { if (noLocality) { // no preferredLocations in parent RDD, no randomization needed if (maxPartitions > groupArr.size) { // just return prev.partitions for ((p, i) <- prev.partitions.zipWithIndex) { diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index f3f9be3562922..9742d12cfe01e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -375,7 +375,7 @@ class HadoopRDD[K, V]( locs.getOrElse(hsplit.getLocations.filter(_ != "localhost")) } - override def checkpoint() { + override def checkpoint(): Unit = { // Do nothing. Hadoop RDD should not be checkpointed. } @@ -405,14 +405,14 @@ private[spark] object HadoopRDD extends Logging { * The three methods below are helpers for accessing the local map, a property of the SparkEnv of * the local process. */ - def getCachedMetadata(key: String): Any = SparkEnv.get.hadoopJobMetadata.get(key) + def getCachedMetadata(key: String): AnyRef = SparkEnv.get.hadoopJobMetadata.get(key) - private def putCachedMetadata(key: String, value: Any): Unit = + private def putCachedMetadata(key: String, value: AnyRef): Unit = SparkEnv.get.hadoopJobMetadata.put(key, value) /** Add Hadoop configuration specific to a single partition and attempt. 
*/ def addLocalConfiguration(jobTrackerId: String, jobId: Int, splitId: Int, attemptId: Int, - conf: JobConf) { + conf: JobConf): Unit = { val jobID = new JobID(jobTrackerId, jobId) val taId = new TaskAttemptID(new TaskID(jobID, TaskType.MAP, splitId), attemptId) diff --git a/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala b/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala index bfe8152d4dee2..1beb085db27d9 100644 --- a/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala +++ b/core/src/main/scala/org/apache/spark/rdd/InputFileBlockHolder.scala @@ -76,7 +76,7 @@ private[spark] object InputFileBlockHolder { def set(filePath: String, startOffset: Long, length: Long): Unit = { require(filePath != null, "filePath cannot be null") require(startOffset >= 0, s"startOffset ($startOffset) cannot be negative") - require(length >= 0, s"length ($length) cannot be negative") + require(length >= -1, s"length ($length) cannot be smaller than -1") inputBlock.get().set(new FileBlock(UTF8String.fromString(filePath), startOffset, length)) } diff --git a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala index 56ef3e107a980..fccabcdd169c6 100644 --- a/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/JdbcRDD.scala @@ -109,7 +109,7 @@ class JdbcRDD[T: ClassTag]( } } - override def close() { + override def close(): Unit = { try { if (null != rs) { rs.close() diff --git a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala index aa61997122cf4..39520a9734b06 100644 --- a/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/MapPartitionsRDD.scala @@ -51,7 +51,7 @@ private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag]( override def compute(split: Partition, context: TaskContext): 
Iterator[U] = f(context, split.index, firstParent[T].iterator(split, context)) - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() prev = null } diff --git a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala index e23133682360f..1e39e10856877 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala @@ -261,7 +261,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } else { StratifiedSamplingUtils.getBernoulliSamplingFunction(self, fractions, false, seed) } - self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true) + self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true, isOrderSensitive = true) } /** @@ -291,7 +291,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)]) } else { StratifiedSamplingUtils.getBernoulliSamplingFunction(self, fractions, true, seed) } - self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true) + self.mapPartitionsWithIndex(samplingFunc, preservesPartitioning = true, isOrderSensitive = true) } /** diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala index d744d67592545..965618ee827d1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionerAwareUnionRDD.scala @@ -101,7 +101,7 @@ class PartitionerAwareUnionRDD[T: ClassTag]( } } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdds = null } diff --git a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala index 15691a8fc8eaa..c8cdaa60e4335 100644 --- 
a/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/PartitionwiseSampledRDD.scala @@ -67,4 +67,12 @@ private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } + + override protected def getOutputDeterministicLevel = { + if (prev.outputDeterministicLevel == DeterministicLevel.UNORDERED) { + DeterministicLevel.INDETERMINATE + } else { + super.getOutputDeterministicLevel + } + } } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index eafe3b17c2136..a26b5791fa08b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -225,10 +225,24 @@ abstract class RDD[T: ClassTag]( /** Get the RDD's current storage level, or StorageLevel.NONE if none is set. */ def getStorageLevel: StorageLevel = storageLevel + /** + * Lock for all mutable state of this RDD (persistence, partitions, dependencies, etc.). We do + * not use `this` because RDDs are user-visible, so users might have added their own locking on + * RDDs; sharing that could lead to a deadlock. + * + * One thread might hold the lock on many of these, for a chain of RDD dependencies; but + * because DAGs are acyclic, and we only ever hold locks for one path in that DAG, there is no + * chance of deadlock. + * + * Executors may reference the shared fields (though they should never mutate them, + * that only happens on the driver). 
+ */ + private val stateLock = new Serializable {} + // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed - private var dependencies_ : Seq[Dependency[_]] = _ - @transient private var partitions_ : Array[Partition] = _ + @volatile private var dependencies_ : Seq[Dependency[_]] = _ + @volatile @transient private var partitions_ : Array[Partition] = _ /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD) @@ -240,7 +254,11 @@ abstract class RDD[T: ClassTag]( final def dependencies: Seq[Dependency[_]] = { checkpointRDD.map(r => List(new OneToOneDependency(r))).getOrElse { if (dependencies_ == null) { - dependencies_ = getDependencies + stateLock.synchronized { + if (dependencies_ == null) { + dependencies_ = getDependencies + } + } } dependencies_ } @@ -253,10 +271,14 @@ abstract class RDD[T: ClassTag]( final def partitions: Array[Partition] = { checkpointRDD.map(_.partitions).getOrElse { if (partitions_ == null) { - partitions_ = getPartitions - partitions_.zipWithIndex.foreach { case (partition, index) => - require(partition.index == index, - s"partitions($index).partition == ${partition.index}, but it should equal $index") + stateLock.synchronized { + if (partitions_ == null) { + partitions_ = getPartitions + partitions_.zipWithIndex.foreach { case (partition, index) => + require(partition.index == index, + s"partitions($index).partition == ${partition.index}, but it should equal $index") + } + } } } partitions_ @@ -339,6 +361,7 @@ abstract class RDD[T: ClassTag]( readCachedBlock = false computeOrReadCheckpoint(partition, context) }) match { + // Block hit. 
case Left(blockResult) => if (readCachedBlock) { val existingMetrics = context.taskMetrics().inputMetrics @@ -352,6 +375,7 @@ abstract class RDD[T: ClassTag]( } else { new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]]) } + // Need to compute the block. case Right(iter) => new InterruptibleIterator(context, iter.asInstanceOf[Iterator[T]]) } @@ -430,8 +454,6 @@ abstract class RDD[T: ClassTag]( * * If you are decreasing the number of partitions in this RDD, consider using `coalesce`, * which can avoid performing a shuffle. - * - * TODO Fix the Shuffle+Repartition data loss issue described in SPARK-23207. */ def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope { coalesce(numPartitions, shuffle = true) @@ -557,7 +579,7 @@ abstract class RDD[T: ClassTag]( val sampler = new BernoulliCellSampler[T](lb, ub) sampler.setSeed(seed + index) sampler.sample(partition) - }, preservesPartitioning = true) + }, isOrderSensitive = true, preservesPartitioning = true) } /** @@ -870,6 +892,29 @@ abstract class RDD[T: ClassTag]( preservesPartitioning) } + /** + * Return a new RDD by applying a function to each partition of this RDD, while tracking the index + * of the original partition. + * + * `preservesPartitioning` indicates whether the input function preserves the partitioner, which + * should be `false` unless this is a pair RDD and the input function doesn't modify the keys. + * + * `isOrderSensitive` indicates whether the function is order-sensitive. If it is order + * sensitive, it may return totally different result when the input order + * is changed. Mostly stateful functions are order-sensitive. 
+ */ + private[spark] def mapPartitionsWithIndex[U: ClassTag]( + f: (Int, Iterator[T]) => Iterator[U], + preservesPartitioning: Boolean, + isOrderSensitive: Boolean): RDD[U] = withScope { + val cleanedF = sc.clean(f) + new MapPartitionsRDD( + this, + (_: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter), + preservesPartitioning, + isOrderSensitive = isOrderSensitive) + } + /** * Zips this RDD with another one, returning key-value pairs with the first element in each RDD, * second element in each RDD, etc. Assumes that the two RDDs have the *same number of @@ -1767,7 +1812,7 @@ abstract class RDD[T: ClassTag]( * Changes the dependencies of this RDD from its original parents to a new RDD (`newRDD`) * created from the checkpoint file, and forget its old dependencies and partitions. */ - private[spark] def markCheckpointed(): Unit = { + private[spark] def markCheckpointed(): Unit = stateLock.synchronized { clearDependencies() partitions_ = null deps = null // Forget the constructor argument for dependencies too @@ -1779,7 +1824,7 @@ abstract class RDD[T: ClassTag]( * collected. Subclasses of RDD may override this method for implementing their own cleaning * logic. See [[org.apache.spark.rdd.UnionRDD]] for an example. 
*/ - protected def clearDependencies(): Unit = { + protected def clearDependencies(): Unit = stateLock.synchronized { dependencies_ = null } @@ -1938,6 +1983,7 @@ abstract class RDD[T: ClassTag]( deterministicLevelCandidates.maxBy(_.id) } } + } diff --git a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala index 42802f7113a19..b70ea0073c9a0 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDDBarrier.scala @@ -54,5 +54,27 @@ class RDDBarrier[T: ClassTag] private[spark] (rdd: RDD[T]) { ) } + /** + * :: Experimental :: + * Returns a new RDD by applying a function to each partition of the wrapped RDD, while tracking + * the index of the original partition. And all tasks are launched together in a barrier stage. + * The interface is the same as [[org.apache.spark.rdd.RDD#mapPartitionsWithIndex]]. + * Please see the API doc there. + * @see [[org.apache.spark.BarrierTaskContext]] + */ + @Experimental + @Since("3.0.0") + def mapPartitionsWithIndex[S: ClassTag]( + f: (Int, Iterator[T]) => Iterator[S], + preservesPartitioning: Boolean = false): RDD[S] = rdd.withScope { + val cleanedF = rdd.sparkContext.clean(f) + new MapPartitionsRDD( + rdd, + (_: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(index, iter), + preservesPartitioning, + isFromBarrier = true + ) + } + // TODO: [SPARK-25247] add extra conf to RDDBarrier, e.g., timeout. 
} diff --git a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala index d165610291f1d..a5c07c07e8f2b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ReliableCheckpointRDD.scala @@ -20,15 +20,17 @@ package org.apache.spark.rdd import java.io.{FileNotFoundException, IOException} import java.util.concurrent.TimeUnit +import scala.collection.mutable import scala.reflect.ClassTag import scala.util.control.NonFatal +import com.google.common.cache.{CacheBuilder, CacheLoader} import org.apache.hadoop.fs.Path import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.{BUFFER_SIZE, CHECKPOINT_COMPRESS} +import org.apache.spark.internal.config.{BUFFER_SIZE, CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME, CHECKPOINT_COMPRESS} import org.apache.spark.io.CompressionCodec import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -82,16 +84,40 @@ private[spark] class ReliableCheckpointRDD[T: ClassTag]( Array.tabulate(inputFiles.length)(i => new CheckpointRDDPartition(i)) } - /** - * Return the locations of the checkpoint file associated with the given partition. - */ - protected override def getPreferredLocations(split: Partition): Seq[String] = { + // Cache of preferred locations of checkpointed files. + @transient private[spark] lazy val cachedPreferredLocations = CacheBuilder.newBuilder() + .expireAfterWrite( + SparkEnv.get.conf.get(CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME).get, + TimeUnit.MINUTES) + .build( + new CacheLoader[Partition, Seq[String]]() { + override def load(split: Partition): Seq[String] = { + getPartitionBlockLocations(split) + } + }) + + // Returns the block locations of given partition on file system. 
+ private def getPartitionBlockLocations(split: Partition): Seq[String] = { val status = fs.getFileStatus( new Path(checkpointPath, ReliableCheckpointRDD.checkpointFileName(split.index))) val locations = fs.getFileBlockLocations(status, 0, status.getLen) locations.headOption.toList.flatMap(_.getHosts).filter(_ != "localhost") } + private lazy val cachedExpireTime = + SparkEnv.get.conf.get(CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME) + + /** + * Return the locations of the checkpoint file associated with the given partition. + */ + protected override def getPreferredLocations(split: Partition): Seq[String] = { + if (cachedExpireTime.isDefined && cachedExpireTime.get > 0) { + cachedPreferredLocations.get(split) + } else { + getPartitionBlockLocations(split) + } + } + /** * Read the content of the checkpoint file associated with the given partition. */ @@ -166,7 +192,7 @@ private[spark] object ReliableCheckpointRDD extends Logging { def writePartitionToCheckpointFile[T: ClassTag]( path: String, broadcastedConf: Broadcast[SerializableConfiguration], - blockSize: Int = -1)(ctx: TaskContext, iterator: Iterator[T]) { + blockSize: Int = -1)(ctx: TaskContext, iterator: Iterator[T]): Unit = { val env = SparkEnv.get val outputDir = new Path(path) val fs = outputDir.getFileSystem(broadcastedConf.value.value) diff --git a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala index 5ec99b7f4f3ab..0930a5c9cfb96 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ShuffledRDD.scala @@ -108,7 +108,7 @@ class ShuffledRDD[K: ClassTag, V: ClassTag, C: ClassTag]( .asInstanceOf[Iterator[(K, C)]] } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() prev = null } diff --git a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala index 
42d190377f104..d5a811d4dc3fd 100644 --- a/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/SubtractedRDD.scala @@ -127,7 +127,7 @@ private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdd1 = null rdd2 = null diff --git a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala index 36589e93a1c5e..63fa3c2487c33 100644 --- a/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala @@ -21,6 +21,7 @@ import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ForkJoinTaskSupport +import scala.collection.parallel.immutable.ParVector import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} @@ -75,13 +76,13 @@ class UnionRDD[T: ClassTag]( override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { - val parArray = rdds.par + val parArray = new ParVector(rdds.toVector) parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } - val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) + val array = new Array[Partition](parRDDs.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) @@ -108,7 +109,7 @@ class UnionRDD[T: ClassTag]( override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdds = null } diff --git 
a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala index 3cb1231bd3477..678a48948a3c1 100644 --- a/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/ZippedPartitionsRDD.scala @@ -70,7 +70,7 @@ private[spark] abstract class ZippedPartitionsBaseRDD[V: ClassTag]( s.asInstanceOf[ZippedPartitionsPartition].preferredLocations } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdds = null } @@ -89,7 +89,7 @@ private[spark] class ZippedPartitionsRDD2[A: ClassTag, B: ClassTag, V: ClassTag] f(rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context)) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdd1 = null rdd2 = null @@ -114,7 +114,7 @@ private[spark] class ZippedPartitionsRDD3 rdd3.iterator(partitions(2), context)) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdd1 = null rdd2 = null @@ -142,7 +142,7 @@ private[spark] class ZippedPartitionsRDD4 rdd4.iterator(partitions(3), context)) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() rdd1 = null rdd2 = null diff --git a/core/src/main/scala/org/apache/spark/rdd/util/PeriodicRDDCheckpointer.scala b/core/src/main/scala/org/apache/spark/rdd/util/PeriodicRDDCheckpointer.scala index 4a6106984a495..e460542f0319e 100644 --- a/core/src/main/scala/org/apache/spark/rdd/util/PeriodicRDDCheckpointer.scala +++ b/core/src/main/scala/org/apache/spark/rdd/util/PeriodicRDDCheckpointer.scala @@ -76,8 +76,13 @@ import org.apache.spark.util.PeriodicCheckpointer */ private[spark] class PeriodicRDDCheckpointer[T]( checkpointInterval: Int, - sc: SparkContext) + sc: SparkContext, + storageLevel: StorageLevel) extends 
PeriodicCheckpointer[RDD[T]](checkpointInterval, sc) { + require(storageLevel != StorageLevel.NONE) + + def this(checkpointInterval: Int, sc: SparkContext) = + this(checkpointInterval, sc, StorageLevel.MEMORY_ONLY) override protected def checkpoint(data: RDD[T]): Unit = data.checkpoint() @@ -85,7 +90,7 @@ private[spark] class PeriodicRDDCheckpointer[T]( override protected def persist(data: RDD[T]): Unit = { if (data.getStorageLevel == StorageLevel.NONE) { - data.persist() + data.persist(storageLevel) } } diff --git a/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequest.scala b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequest.scala new file mode 100644 index 0000000000000..9a920914ed674 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequest.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +/** + * An Executor resource request. This is used in conjunction with the ResourceProfile to + * programmatically specify the resources needed for an RDD that will be applied at the + * stage level. 
+ * + * This is used to specify what the resource requirements are for an Executor and how + * Spark can find out specific details about those resources. Not all the parameters are + * required for every resource type. Resources like GPUs are supported and have the same limitations + * as using the global spark configs spark.executor.resource.gpu.*. The amount, discoveryScript, + * and vendor parameters for resources are all the same parameters a user would specify through the + * configs: spark.executor.resource.{resourceName}.{amount, discoveryScript, vendor}. + * + * For instance, a user wants to allocate an Executor with GPU resources on YARN. The user has + * to specify the resource name (gpu), the amount or number of GPUs per Executor, + * the discovery script would be specified so that when the Executor starts up it can + * discover what GPU addresses are available for it to use because YARN doesn't tell + * Spark that, then vendor would not be used because it's specific to Kubernetes. + * + * See the configuration and cluster specific docs for more details. + * + * Use ExecutorResourceRequests class as a convenience API. + * + * @param resourceName Name of the resource + * @param amount Amount requesting + * @param discoveryScript Optional script used to discover the resources. This is required on some + * cluster managers that don't tell Spark the addresses of the resources + * allocated. The script runs on Executors startup to discover the addresses + * of the resources available. + * @param vendor Optional vendor, required for some cluster managers + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. 
+ */ +private[spark] class ExecutorResourceRequest( + val resourceName: String, + val amount: Long, + val discoveryScript: String = "", + val vendor: String = "") extends Serializable { + + override def equals(obj: Any): Boolean = { + obj match { + case that: ExecutorResourceRequest => + that.getClass == this.getClass && + that.resourceName == resourceName && that.amount == amount && + that.discoveryScript == discoveryScript && that.vendor == vendor + case _ => + false + } + } + + override def hashCode(): Int = + Seq(resourceName, amount, discoveryScript, vendor).hashCode() + + override def toString(): String = { + s"name: $resourceName, amount: $amount, script: $discoveryScript, vendor: $vendor" + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala new file mode 100644 index 0000000000000..d4c29f9a70c44 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ExecutorResourceRequests.scala @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.resource + +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ + +import org.apache.spark.network.util.JavaUtils +import org.apache.spark.resource.ResourceProfile._ + +/** + * A set of Executor resource requests. This is used in conjunction with the ResourceProfile to + * programmatically specify the resources needed for an RDD that will be applied at the + * stage level. + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. + */ +private[spark] class ExecutorResourceRequests() extends Serializable { + + private val _executorResources = new ConcurrentHashMap[String, ExecutorResourceRequest]() + + def requests: Map[String, ExecutorResourceRequest] = _executorResources.asScala.toMap + + /** + * Specify heap memory. The value specified will be converted to MiB. + * + * @param amount Amount of memory. In the same format as JVM memory strings (e.g. 512m, 2g). + * Default unit is MiB if not specified. + */ + def memory(amount: String): this.type = { + val amountMiB = JavaUtils.byteStringAsMb(amount) + val req = new ExecutorResourceRequest(MEMORY, amountMiB) + _executorResources.put(MEMORY, req) + this + } + + /** + * Specify overhead memory. The value specified will be converted to MiB. + * + * @param amount Amount of memory. In the same format as JVM memory strings (e.g. 512m, 2g). + * Default unit is MiB if not specified. + */ + def memoryOverhead(amount: String): this.type = { + val amountMiB = JavaUtils.byteStringAsMb(amount) + val req = new ExecutorResourceRequest(OVERHEAD_MEM, amountMiB) + _executorResources.put(OVERHEAD_MEM, req) + this + } + + /** + * Specify pyspark memory. The value specified will be converted to MiB. + * + * @param amount Amount of memory. In the same format as JVM memory strings (e.g. 512m, 2g). + * Default unit is MiB if not specified. 
+ */ + def pysparkMemory(amount: String): this.type = { + val amountMiB = JavaUtils.byteStringAsMb(amount) + val req = new ExecutorResourceRequest(PYSPARK_MEM, amountMiB) + _executorResources.put(PYSPARK_MEM, req) + this + } + + /** + * Specify number of cores per Executor. + * + * @param amount Number of cores to allocate per Executor. + */ + def cores(amount: Int): this.type = { + val req = new ExecutorResourceRequest(CORES, amount) + _executorResources.put(CORES, req) + this + } + + /** + * Amount of a particular custom resource(GPU, FPGA, etc) to use. The resource names supported + * correspond to the regular Spark configs with the prefix removed. For instance, resources + * like GPUs are gpu (spark configs spark.executor.resource.gpu.*). If you pass in a resource + * that the cluster manager doesn't support the result is undefined, it may error or may just + * be ignored. + * + * @param resourceName Name of the resource. + * @param amount amount of that resource per executor to use. + * @param discoveryScript Optional script used to discover the resources. This is required on + * some cluster managers that don't tell Spark the addresses of + * the resources allocated. The script runs on Executors startup to + * discover the addresses of the resources available. 
+ * @param vendor Optional vendor, required for some cluster managers + */ + def resource( + resourceName: String, + amount: Long, + discoveryScript: String = "", + vendor: String = ""): this.type = { + // a bit weird but for Java api use empty string as meaning None because empty + // string is otherwise invalid for those parameters anyway + val req = new ExecutorResourceRequest(resourceName, amount, discoveryScript, vendor) + _executorResources.put(resourceName, req) + this + } + + override def toString: String = { + s"Executor resource requests: ${_executorResources}" + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala b/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala index e64fadc113149..22272a0f98a6c 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceAllocator.scala @@ -30,27 +30,44 @@ trait ResourceAllocator { protected def resourceName: String protected def resourceAddresses: Seq[String] + protected def slotsPerAddress: Int /** - * Map from an address to its availability, the value `true` means the address is available, - * while value `false` means the address is assigned. + * Map from an address to its availability, a value > 0 means the address is available, + * while value of 0 means the address is fully assigned. + * + * For task resources ([[org.apache.spark.scheduler.ExecutorResourceInfo]]), this value + * can be a multiple, such that each address can be allocated up to [[slotsPerAddress]] + * times. + * * TODO Use [[OpenHashMap]] instead to gain better performance. */ - private lazy val addressAvailabilityMap = mutable.HashMap(resourceAddresses.map(_ -> true): _*) + private lazy val addressAvailabilityMap = { + mutable.HashMap(resourceAddresses.map(_ -> slotsPerAddress): _*) + } /** * Sequence of currently available resource addresses. 
+ * + * With [[slotsPerAddress]] greater than 1, [[availableAddrs]] can contain duplicate addresses + * e.g. with [[slotsPerAddress]] == 2, availableAddrs for addresses 0 and 1 can look like + * Seq("0", "0", "1"), where address 0 has two assignments available, and 1 has one. */ - def availableAddrs: Seq[String] = addressAvailabilityMap.flatMap { case (addr, available) => - if (available) Some(addr) else None - }.toSeq + def availableAddrs: Seq[String] = addressAvailabilityMap + .flatMap { case (addr, available) => + (0 until available).map(_ => addr) + }.toSeq /** * Sequence of currently assigned resource addresses. + * + * With [[slotsPerAddress]] greater than 1, [[assignedAddrs]] can contain duplicate addresses + * e.g. with [[slotsPerAddress]] == 2, assignedAddrs for addresses 0 and 1 can look like + * Seq("0", "1", "1"), where address 0 was assigned once, and 1 was assigned twice. */ private[spark] def assignedAddrs: Seq[String] = addressAvailabilityMap .flatMap { case (addr, available) => - if (!available) Some(addr) else None + (0 until slotsPerAddress - available).map(_ => addr) }.toSeq /** @@ -65,8 +82,8 @@ trait ResourceAllocator { s"address $address doesn't exist.") } val isAvailable = addressAvailabilityMap(address) - if (isAvailable) { - addressAvailabilityMap(address) = false + if (isAvailable > 0) { + addressAvailabilityMap(address) = addressAvailabilityMap(address) - 1 } else { throw new SparkException("Try to acquire an address that is not available. " + s"$resourceName address $address is not available.") @@ -86,8 +103,8 @@ trait ResourceAllocator { s"address $address doesn't exist.") } val isAvailable = addressAvailabilityMap(address) - if (!isAvailable) { - addressAvailabilityMap(address) = true + if (isAvailable < slotsPerAddress) { + addressAvailabilityMap(address) = addressAvailabilityMap(address) + 1 } else { throw new SparkException(s"Try to release an address that is not assigned. 
$resourceName " + s"address $address is not assigned.") diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala new file mode 100644 index 0000000000000..2ac6d3c500f9d --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ResourceDiscoveryScriptPlugin.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.io.File +import java.util.Optional + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.api.resource.ResourceDiscoveryPlugin +import org.apache.spark.internal.Logging +import org.apache.spark.util.Utils.executeAndGetOutput + +/** + * The default plugin that is loaded into a Spark application to control how custom + * resources are discovered. This executes the discovery script specified by the user + * and gets the json output back and constructs ResourceInformation objects from that. + * If the user specifies custom plugins, this is the last one to be executed and + * throws if the resource isn't discovered. 
+ */ +class ResourceDiscoveryScriptPlugin extends ResourceDiscoveryPlugin with Logging { + override def discoverResource( + request: ResourceRequest, + sparkConf: SparkConf): Optional[ResourceInformation] = { + val script = request.discoveryScript + val resourceName = request.id.resourceName + val result = if (script.isPresent) { + val scriptFile = new File(script.get) + logInfo(s"Discovering resources for $resourceName with script: $scriptFile") + // check that script exists and try to execute + if (scriptFile.exists()) { + val output = executeAndGetOutput(Seq(script.get), new File(".")) + ResourceInformation.parseJson(output) + } else { + throw new SparkException(s"Resource script: $scriptFile to discover $resourceName " + + "doesn't exist!") + } + } else { + throw new SparkException(s"User is expecting to use resource: $resourceName, but " + + "didn't specify a discovery script!") + } + if (!result.name.equals(resourceName)) { + throw new SparkException(s"Error running the resource discovery script ${script.get}: " + + s"script returned resource name ${result.name} and we were expecting $resourceName.") + } + Optional.of(result) + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala new file mode 100644 index 0000000000000..03dcf5e317798 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfile.scala @@ -0,0 +1,321 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.util.{Map => JMap} +import java.util.concurrent.atomic.AtomicInteger +import javax.annotation.concurrent.GuardedBy + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.annotation.Evolving +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY +import org.apache.spark.util.Utils + +/** + * Resource profile to associate with an RDD. A ResourceProfile allows the user to + * specify executor and task requirements for an RDD that will get applied during a + * stage. This allows the user to change the resource requirements between stages. + * This is meant to be immutable so user can't change it after building. + */ +@Evolving +class ResourceProfile( + val executorResources: Map[String, ExecutorResourceRequest], + val taskResources: Map[String, TaskResourceRequest]) extends Serializable with Logging { + + // _id is only a var for testing purposes + private var _id = ResourceProfile.getNextProfileId + // This is used for any resources that use fractional amounts, the key is the resource name + // and the value is the number of tasks that can share a resource address. For example, + // if the user says task gpu amount is 0.5, that results in 2 tasks per resource address. 
+ private var _executorResourceSlotsPerAddr: Option[Map[String, Int]] = None + private var _limitingResource: Option[String] = None + private var _maxTasksPerExecutor: Option[Int] = None + private var _coresLimitKnown: Boolean = false + + def id: Int = _id + + /** + * (Java-specific) gets a Java Map of resources to TaskResourceRequest + */ + def taskResourcesJMap: JMap[String, TaskResourceRequest] = taskResources.asJava + + /** + * (Java-specific) gets a Java Map of resources to ExecutorResourceRequest + */ + def executorResourcesJMap: JMap[String, ExecutorResourceRequest] = { + executorResources.asJava + } + + // Note that some cluster managers don't set the executor cores explicitly so + // be sure to check the Option as required + private[spark] def getExecutorCores: Option[Int] = { + executorResources.get(ResourceProfile.CORES).map(_.amount.toInt) + } + + private[spark] def getTaskCpus: Option[Int] = { + taskResources.get(ResourceProfile.CPUS).map(_.amount.toInt) + } + + private[spark] def getNumSlotsPerAddress(resource: String, sparkConf: SparkConf): Int = { + _executorResourceSlotsPerAddr.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + } + _executorResourceSlotsPerAddr.get.getOrElse(resource, + throw new SparkException(s"Resource $resource doesn't exist in profile id: $id")) + } + + // Maximum tasks you could put on an executor with this profile based on the limiting resource. + // If the executor cores config is not present this value is based on the other resources + // available or 1 if no other resources. You need to check the isCoresLimitKnown to + // calculate proper value. + private[spark] def maxTasksPerExecutor(sparkConf: SparkConf): Int = { + _maxTasksPerExecutor.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + _maxTasksPerExecutor.get + } + } + + // Returns whether the executor cores was available to use to calculate the max tasks + // per executor and limiting resource. 
Some cluster managers (like standalone and coarse + // grained mesos) don't use the cores config by default so we can't use it to calculate slots. + private[spark] def isCoresLimitKnown: Boolean = _coresLimitKnown + + // The resource that has the least amount of slots per executor. Its possible multiple or all + // resources result in same number of slots and this could be any of those. + // If the executor cores config is not present this value is based on the other resources + // available or empty string if no other resources. You need to check the isCoresLimitKnown to + // calculate proper value. + private[spark] def limitingResource(sparkConf: SparkConf): String = { + _limitingResource.getOrElse { + calculateTasksAndLimitingResource(sparkConf) + _limitingResource.get + } + } + + // executor cores config is not set for some masters by default and the default value + // only applies to yarn/k8s + private def shouldCheckExecutorCores(sparkConf: SparkConf): Boolean = { + val master = sparkConf.getOption("spark.master") + sparkConf.contains(EXECUTOR_CORES) || + (master.isDefined && (master.get.equalsIgnoreCase("yarn") || master.get.startsWith("k8s"))) + } + + /** + * Utility function to calculate the number of tasks you can run on a single Executor based + * on the task and executor resource requests in the ResourceProfile. This will be based + * off the resource that is most restrictive. For instance, if the executor + * request is for 4 cpus and 2 gpus and your task request is for 1 cpu and 1 gpu each, the + * limiting resource is gpu and the number of tasks you can run on a single executor is 2. + * This function also sets the limiting resource, isCoresLimitKnown and number of slots per + * resource address. 
+ */ + private def calculateTasksAndLimitingResource(sparkConf: SparkConf): Unit = synchronized { + val shouldCheckExecCores = shouldCheckExecutorCores(sparkConf) + var (taskLimit, limitingResource) = if (shouldCheckExecCores) { + val cpusPerTask = taskResources.get(ResourceProfile.CPUS) + .map(_.amount).getOrElse(sparkConf.get(CPUS_PER_TASK).toDouble).toInt + assert(cpusPerTask > 0, "CPUs per task configuration has to be > 0") + val coresPerExecutor = getExecutorCores.getOrElse(sparkConf.get(EXECUTOR_CORES)) + _coresLimitKnown = true + ResourceUtils.validateTaskCpusLargeEnough(coresPerExecutor, cpusPerTask) + val tasksBasedOnCores = coresPerExecutor / cpusPerTask + // Note that if the cores per executor aren't set properly this calculation could be off, + // we default it to just be 1 in order to allow checking of the rest of the custom + // resources. We set the limit based on the other resources available. + (tasksBasedOnCores, ResourceProfile.CPUS) + } else { + (-1, "") + } + val numPartsPerResourceMap = new mutable.HashMap[String, Int] + numPartsPerResourceMap(ResourceProfile.CORES) = 1 + val taskResourcesToCheck = new mutable.HashMap[String, TaskResourceRequest] + taskResourcesToCheck ++= ResourceProfile.getCustomTaskResources(this) + val execResourceToCheck = ResourceProfile.getCustomExecutorResources(this) + execResourceToCheck.foreach { case (rName, execReq) => + val taskReq = taskResources.get(rName).map(_.amount).getOrElse(0.0) + numPartsPerResourceMap(rName) = 1 + if (taskReq > 0.0) { + if (taskReq > execReq.amount) { + throw new SparkException(s"The executor resource: $rName, amount: ${execReq.amount} " + + s"needs to be >= the task resource request amount of $taskReq") + } + val (numPerTask, parts) = ResourceUtils.calculateAmountAndPartsForFraction(taskReq) + numPartsPerResourceMap(rName) = parts + val numTasks = ((execReq.amount * parts) / numPerTask).toInt + if (taskLimit == -1 || numTasks < taskLimit) { + if (shouldCheckExecCores) { + // TODO - 
until resource profiles fully implemented we need to error if cores not + // limiting resource because the scheduler code uses that for slots + throw new IllegalArgumentException("The number of slots on an executor has to be " + + "limited by the number of cores, otherwise you waste resources and " + + "dynamic allocation doesn't work properly. Your configuration has " + + s"core/task cpu slots = ${taskLimit} and " + + s"${execReq.resourceName} = ${numTasks}. " + + "Please adjust your configuration so that all resources require same number " + + "of executor slots.") + } + limitingResource = rName + taskLimit = numTasks + } + taskResourcesToCheck -= rName + } else { + logWarning(s"The executor resource config for resource: $rName was specified but " + + "no corresponding task resource request was specified.") + } + } + if(!shouldCheckExecCores) { + // if we can't rely on the executor cores config throw a warning for user + logWarning("Please ensure that the number of slots available on your " + + "executors is limited by the number of cores to task cpus and not another " + + "custom resource. 
If cores is not the limiting resource then dynamic " + + "allocation will not work properly!") + } + if (taskResourcesToCheck.nonEmpty) { + throw new SparkException("No executor resource configs were not specified for the " + + s"following task configs: ${taskResourcesToCheck.keys.mkString(",")}") + } + logInfo(s"Limiting resource is $limitingResource at $taskLimit tasks per executor") + _executorResourceSlotsPerAddr = Some(numPartsPerResourceMap.toMap) + _maxTasksPerExecutor = if (taskLimit == -1) Some(1) else Some(taskLimit) + _limitingResource = Some(limitingResource) + if (shouldCheckExecCores) { + ResourceUtils.warnOnWastedResources(this, sparkConf) + } + } + + // to be used only by history server for reconstruction from events + private[spark] def setResourceProfileId(id: Int): Unit = { + _id = id + } + + // testing only + private[spark] def setToDefaultProfile(): Unit = { + _id = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + } + + override def equals(obj: Any): Boolean = { + obj match { + case that: ResourceProfile => + that.getClass == this.getClass && that.id == _id && + that.taskResources == taskResources && that.executorResources == executorResources + case _ => + false + } + } + + override def hashCode(): Int = Seq(taskResources, executorResources).hashCode() + + override def toString(): String = { + s"Profile: id = ${_id}, executor resources: ${executorResources.mkString(",")}, " + + s"task resources: ${taskResources.mkString(",")}" + } +} + +object ResourceProfile extends Logging { + // task resources + val CPUS = "cpus" + // Executor resources + val CORES = "cores" + val MEMORY = "memory" + val OVERHEAD_MEM = "memoryOverhead" + val PYSPARK_MEM = "pyspark.memory" + + // all supported spark executor resources (minus the custom resources like GPUs/FPGAs) + val allSupportedExecutorResources = Seq(CORES, MEMORY, OVERHEAD_MEM, PYSPARK_MEM) + + val UNKNOWN_RESOURCE_PROFILE_ID = -1 + val DEFAULT_RESOURCE_PROFILE_ID = 0 + + private lazy val nextProfileId = 
new AtomicInteger(0) + private val DEFAULT_PROFILE_LOCK = new Object() + + // The default resource profile uses the application level configs. + // var so that it can be reset for testing purposes. + @GuardedBy("DEFAULT_PROFILE_LOCK") + private var defaultProfile: Option[ResourceProfile] = None + + private[spark] def getNextProfileId: Int = nextProfileId.getAndIncrement() + + private[spark] def getOrCreateDefaultProfile(conf: SparkConf): ResourceProfile = { + DEFAULT_PROFILE_LOCK.synchronized { + defaultProfile match { + case Some(prof) => prof + case None => + val taskResources = getDefaultTaskResources(conf) + val executorResources = getDefaultExecutorResources(conf) + val defProf = new ResourceProfile(executorResources, taskResources) + defProf.setToDefaultProfile() + defaultProfile = Some(defProf) + logInfo("Default ResourceProfile created, executor resources: " + + s"${defProf.executorResources}, task resources: " + + s"${defProf.taskResources}") + defProf + } + } + } + + private def getDefaultTaskResources(conf: SparkConf): Map[String, TaskResourceRequest] = { + val cpusPerTask = conf.get(CPUS_PER_TASK) + val treqs = new TaskResourceRequests().cpus(cpusPerTask) + ResourceUtils.addTaskResourceRequests(conf, treqs) + treqs.requests + } + + private def getDefaultExecutorResources(conf: SparkConf): Map[String, ExecutorResourceRequest] = { + val ereqs = new ExecutorResourceRequests() + ereqs.cores(conf.get(EXECUTOR_CORES)) + ereqs.memory(conf.get(EXECUTOR_MEMORY).toString) + conf.get(EXECUTOR_MEMORY_OVERHEAD).map(mem => ereqs.memoryOverhead(mem.toString)) + conf.get(PYSPARK_EXECUTOR_MEMORY).map(mem => ereqs.pysparkMemory(mem.toString)) + val execReq = ResourceUtils.parseAllResourceRequests(conf, SPARK_EXECUTOR_PREFIX) + execReq.foreach { req => + val name = req.id.resourceName + ereqs.resource(name, req.amount, req.discoveryScript.orElse(""), + req.vendor.orElse("")) + } + ereqs.requests + } + + // for testing only + private[spark] def reInitDefaultProfile(conf: 
SparkConf): Unit = { + clearDefaultProfile() + // force recreate it after clearing + getOrCreateDefaultProfile(conf) + } + + private[spark] def clearDefaultProfile(): Unit = { + DEFAULT_PROFILE_LOCK.synchronized { + defaultProfile = None + } + } + + private[spark] def getCustomTaskResources( + rp: ResourceProfile): Map[String, TaskResourceRequest] = { + rp.taskResources.filterKeys(k => !k.equals(ResourceProfile.CPUS)) + } + + private[spark] def getCustomExecutorResources( + rp: ResourceProfile): Map[String, ExecutorResourceRequest] = { + rp.executorResources.filterKeys(k => !ResourceProfile.allSupportedExecutorResources.contains(k)) + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala new file mode 100644 index 0000000000000..26f23f4bf0476 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileBuilder.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.resource + +import java.util.{Map => JMap} +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Evolving + +/** + * Resource profile builder to build a Resource profile to associate with an RDD. + * A ResourceProfile allows the user to specify executor and task requirements for an RDD + * that will get applied during a stage. This allows the user to change the resource + * requirements between stages. + */ +@Evolving +private[spark] class ResourceProfileBuilder() { + + private val _taskResources = new ConcurrentHashMap[String, TaskResourceRequest]() + private val _executorResources = new ConcurrentHashMap[String, ExecutorResourceRequest]() + + def taskResources: Map[String, TaskResourceRequest] = _taskResources.asScala.toMap + def executorResources: Map[String, ExecutorResourceRequest] = _executorResources.asScala.toMap + + /** + * (Java-specific) gets a Java Map of resources to TaskResourceRequest + */ + def taskResourcesJMap: JMap[String, TaskResourceRequest] = _taskResources.asScala.asJava + + /** + * (Java-specific) gets a Java Map of resources to ExecutorResourceRequest + */ + def executorResourcesJMap: JMap[String, ExecutorResourceRequest] = { + _executorResources.asScala.asJava + } + + def require(requests: ExecutorResourceRequests): this.type = { + _executorResources.putAll(requests.requests.asJava) + this + } + + def require(requests: TaskResourceRequests): this.type = { + _taskResources.putAll(requests.requests.asJava) + this + } + + def clearExecutorResourceRequests(): this.type = { + _executorResources.clear() + this + } + + def clearTaskResourceRequests(): this.type = { + _taskResources.clear() + this + } + + override def toString(): String = { + "Profile executor resources: " + + s"${_executorResources.asScala.map(pair => s"${pair._1}=${pair._2.toString()}")}, " + + s"task resources: ${_taskResources.asScala.map(pair => 
s"${pair._1}=${pair._2.toString()}")}" + } + + def build: ResourceProfile = { + new ResourceProfile(executorResources, taskResources) + } +} + diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala new file mode 100644 index 0000000000000..06db9468c451e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/ResourceProfileManager.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.util.concurrent.ConcurrentHashMap + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.annotation.Evolving +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.Tests._ +import org.apache.spark.util.Utils +import org.apache.spark.util.Utils.isTesting + +/** + * Manager of resource profiles. The manager allows one place to keep the actual ResourceProfiles + * and everywhere else we can use the ResourceProfile Id to save on space. + * Note we never remove a resource profile at this point. Its expected this number if small + * so this shouldn't be much overhead. 
+ */ +@Evolving +private[spark] class ResourceProfileManager(sparkConf: SparkConf) extends Logging { + private val resourceProfileIdToResourceProfile = new ConcurrentHashMap[Int, ResourceProfile]() + + private val defaultProfile = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + addResourceProfile(defaultProfile) + + def defaultResourceProfile: ResourceProfile = defaultProfile + + private val taskCpusDefaultProfile = defaultProfile.getTaskCpus.get + private val dynamicEnabled = Utils.isDynamicAllocationEnabled(sparkConf) + private val master = sparkConf.getOption("spark.master") + private val isNotYarn = master.isDefined && !master.get.equals("yarn") + private val errorForTesting = !isTesting || sparkConf.get(RESOURCE_PROFILE_MANAGER_TESTING) + + // If we use anything except the default profile, its only supported on YARN right now. + // Throw an exception if not supported. + private[spark] def isSupported(rp: ResourceProfile): Boolean = { + val isNotDefaultProfile = rp.id != ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + val notYarnAndNotDefaultProfile = isNotDefaultProfile && isNotYarn + val YarnNotDynAllocAndNotDefaultProfile = isNotDefaultProfile && !isNotYarn && !dynamicEnabled + if (errorForTesting && (notYarnAndNotDefaultProfile || YarnNotDynAllocAndNotDefaultProfile)) { + throw new SparkException("ResourceProfiles are only supported on YARN with dynamic " + + "allocation enabled.") + } + true + } + + def addResourceProfile(rp: ResourceProfile): Unit = { + isSupported(rp) + // force the computation of maxTasks and limitingResource now so we don't have cost later + rp.limitingResource(sparkConf) + logInfo(s"Adding ResourceProfile id: ${rp.id}") + resourceProfileIdToResourceProfile.putIfAbsent(rp.id, rp) + } + + /* + * Gets the ResourceProfile associated with the id, if a profile doesn't exist + * it returns the default ResourceProfile created from the application level configs. 
+ */ + def resourceProfileFromId(rpId: Int): ResourceProfile = { + val rp = resourceProfileIdToResourceProfile.get(rpId) + if (rp == null) { + throw new SparkException(s"ResourceProfileId $rpId not found!") + } + rp + } + + def taskCpusForProfileId(rpId: Int): Int = { + resourceProfileFromId(rpId).getTaskCpus.getOrElse(taskCpusDefaultProfile) + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 150ba09f77dd9..cdb761c7566e7 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -17,8 +17,8 @@ package org.apache.spark.resource -import java.io.File import java.nio.file.{Files, Paths} +import java.util.Optional import scala.util.control.NonFatal @@ -26,28 +26,97 @@ import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.api.resource.ResourceDiscoveryPlugin import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils.executeAndGetOutput +import org.apache.spark.internal.config.{CPUS_PER_TASK, EXECUTOR_CORES, RESOURCES_DISCOVERY_PLUGIN, SPARK_TASK_PREFIX} +import org.apache.spark.internal.config.Tests.{RESOURCES_WARNING_TESTING} +import org.apache.spark.util.Utils /** * Resource identifier. * @param componentName spark.driver / spark.executor / spark.task * @param resourceName gpu, fpga, etc + * + * @since 3.0.0 */ -private[spark] case class ResourceID(componentName: String, resourceName: String) { - def confPrefix: String = s"$componentName.resource.$resourceName." 
// with ending dot - def amountConf: String = s"$confPrefix${ResourceUtils.AMOUNT}" - def discoveryScriptConf: String = s"$confPrefix${ResourceUtils.DISCOVERY_SCRIPT}" - def vendorConf: String = s"$confPrefix${ResourceUtils.VENDOR}" +@DeveloperApi +class ResourceID(val componentName: String, val resourceName: String) { + private[spark] def confPrefix: String = { + s"$componentName.${ResourceUtils.RESOURCE_PREFIX}.$resourceName." + } + private[spark] def amountConf: String = s"$confPrefix${ResourceUtils.AMOUNT}" + private[spark] def discoveryScriptConf: String = s"$confPrefix${ResourceUtils.DISCOVERY_SCRIPT}" + private[spark] def vendorConf: String = s"$confPrefix${ResourceUtils.VENDOR}" + + override def equals(obj: Any): Boolean = { + obj match { + case that: ResourceID => + that.getClass == this.getClass && + that.componentName == componentName && that.resourceName == resourceName + case _ => + false + } + } + + override def hashCode(): Int = Seq(componentName, resourceName).hashCode() } -private[spark] case class ResourceRequest( - id: ResourceID, - amount: Int, - discoveryScript: Option[String], - vendor: Option[String]) +/** + * Class that represents a resource request. + * + * The class used when discovering resources (using the discovery script), + * or via the context as it is parsing configuration for the ResourceID. + * + * @param id object identifying the resource + * @param amount integer amount for the resource. Note that for a request (executor level), + * fractional resources does not make sense, so amount is an integer. 
+ * @param discoveryScript optional discovery script file name + * @param vendor optional vendor name + * + * @since 3.0.0 + */ +@DeveloperApi +class ResourceRequest( + val id: ResourceID, + val amount: Long, + val discoveryScript: Optional[String], + val vendor: Optional[String]) { -private[spark] case class ResourceRequirement(resourceName: String, amount: Int) + override def equals(obj: Any): Boolean = { + obj match { + case that: ResourceRequest => + that.getClass == this.getClass && + that.id == id && that.amount == amount && discoveryScript == discoveryScript && + vendor == vendor + case _ => + false + } + } + + override def hashCode(): Int = Seq(id, amount, discoveryScript, vendor).hashCode() +} + +/** + * Case class that represents resource requirements for a component in a + * an application (components are driver, executor or task). + * + * A configuration of spark.task.resource.[resourceName].amount = 4, equates to: + * amount = 4, and numParts = 1. + * + * A configuration of spark.task.resource.[resourceName].amount = 0.25, equates to: + * amount = 1, and numParts = 4. + * + * @param resourceName gpu, fpga, etc. + * @param amount whole units of the resource we expect (e.g. 1 gpus, 2 fpgas) + * @param numParts if not 1, the number of ways a whole resource is subdivided. + * This is always an integer greater than or equal to 1, + * where 1 is whole resource, 2 is divide a resource in two, and so on. + */ +private[spark] case class ResourceRequirement( + resourceName: String, + amount: Int, + numParts: Int = 1) /** * Case class representing allocated resource addresses for a specific resource. 
@@ -73,29 +142,78 @@ private[spark] object ResourceUtils extends Logging { val amount = settings.getOrElse(AMOUNT, throw new SparkException(s"You must specify an amount for ${resourceId.resourceName}") ).toInt - val discoveryScript = settings.get(DISCOVERY_SCRIPT) - val vendor = settings.get(VENDOR) - ResourceRequest(resourceId, amount, discoveryScript, vendor) + val discoveryScript = Optional.ofNullable(settings.get(DISCOVERY_SCRIPT).orNull) + val vendor = Optional.ofNullable(settings.get(VENDOR).orNull) + new ResourceRequest(resourceId, amount, discoveryScript, vendor) } def listResourceIds(sparkConf: SparkConf, componentName: String): Seq[ResourceID] = { - sparkConf.getAllWithPrefix(s"$componentName.resource.").map { case (key, _) => + sparkConf.getAllWithPrefix(s"$componentName.$RESOURCE_PREFIX.").map { case (key, _) => key.substring(0, key.indexOf('.')) - }.toSet.toSeq.map(name => ResourceID(componentName, name)) + }.toSet.toSeq.map(name => new ResourceID(componentName, name)) } def parseAllResourceRequests( sparkConf: SparkConf, componentName: String): Seq[ResourceRequest] = { - listResourceIds(sparkConf, componentName).map { id => - parseResourceRequest(sparkConf, id) + listResourceIds(sparkConf, componentName) + .map(id => parseResourceRequest(sparkConf, id)) + .filter(_.amount > 0) + } + + // Used to take a fraction amount from a task resource requirement and split into a real + // integer amount and the number of slots per address. For instance, if the amount is 0.5, + // the we get (1, 2) back out. This indicates that for each 1 address, it has 2 slots per + // address, which allows you to put 2 tasks on that address. Note if amount is greater + // than 1, then the number of slots per address has to be 1. This would indicate that a + // would have multiple addresses assigned per task. This can be used for calculating + // the number of tasks per executor -> (executorAmount * numParts) / (integer amount). 
+ // Returns tuple of (integer amount, numParts) + def calculateAmountAndPartsForFraction(doubleAmount: Double): (Int, Int) = { + val parts = if (doubleAmount <= 0.5) { + Math.floor(1.0 / doubleAmount).toInt + } else if (doubleAmount % 1 != 0) { + throw new SparkException( + s"The resource amount ${doubleAmount} must be either <= 0.5, or a whole number.") + } else { + 1 + } + (Math.ceil(doubleAmount).toInt, parts) + } + + // Add any task resource requests from the spark conf to the TaskResourceRequests passed in + def addTaskResourceRequests( + sparkConf: SparkConf, + treqs: TaskResourceRequests): Unit = { + listResourceIds(sparkConf, SPARK_TASK_PREFIX).map { resourceId => + val settings = sparkConf.getAllWithPrefix(resourceId.confPrefix).toMap + val amountDouble = settings.getOrElse(AMOUNT, + throw new SparkException(s"You must specify an amount for ${resourceId.resourceName}") + ).toDouble + treqs.resource(resourceId.resourceName, amountDouble) } } def parseResourceRequirements(sparkConf: SparkConf, componentName: String) : Seq[ResourceRequirement] = { - parseAllResourceRequests(sparkConf, componentName).map { request => - ResourceRequirement(request.id.resourceName, request.amount) + val resourceIds = listResourceIds(sparkConf, componentName) + val rnamesAndAmounts = resourceIds.map { resourceId => + val settings = sparkConf.getAllWithPrefix(resourceId.confPrefix).toMap + val amountDouble = settings.getOrElse(AMOUNT, + throw new SparkException(s"You must specify an amount for ${resourceId.resourceName}") + ).toDouble + (resourceId.resourceName, amountDouble) + } + rnamesAndAmounts.filter { case (_, amount) => amount > 0 }.map { case (rName, amountDouble) => + val (amount, parts) = if (componentName.equalsIgnoreCase(SPARK_TASK_PREFIX)) { + calculateAmountAndPartsForFraction(amountDouble) + } else if (amountDouble % 1 != 0) { + throw new SparkException( + s"Only tasks support fractional resources, please check your $componentName settings") + } else { + 
(amountDouble.toInt, 1) + } + ResourceRequirement(rName, amount, parts) } } @@ -125,17 +243,28 @@ private[spark] object ResourceUtils extends Logging { } } + def parseAllocated( + resourcesFileOpt: Option[String], + componentName: String): Seq[ResourceAllocation] = { + resourcesFileOpt.toSeq.flatMap(parseAllocatedFromJsonFile) + .filter(_.id.componentName == componentName) + } + private def parseAllocatedOrDiscoverResources( sparkConf: SparkConf, componentName: String, resourcesFileOpt: Option[String]): Seq[ResourceAllocation] = { - val allocated = resourcesFileOpt.toSeq.flatMap(parseAllocatedFromJsonFile) - .filter(_.id.componentName == componentName) + val allocated = parseAllocated(resourcesFileOpt, componentName) val otherResourceIds = listResourceIds(sparkConf, componentName).diff(allocated.map(_.id)) - allocated ++ otherResourceIds.map { id => + val otherResources = otherResourceIds.flatMap { id => val request = parseResourceRequest(sparkConf, id) - ResourceAllocation(id, discoverResource(request).addresses) + if (request.amount > 0) { + Some(ResourceAllocation(id, discoverResource(sparkConf, request).addresses)) + } else { + None + } } + allocated ++ otherResources } private def assertResourceAllocationMeetsRequest( @@ -154,9 +283,24 @@ private[spark] object ResourceUtils extends Logging { requests.foreach(r => assertResourceAllocationMeetsRequest(allocated(r.id), r)) } + private def assertAllResourceAllocationsMatchResourceProfile( + allocations: Map[String, ResourceInformation], + execReqs: Map[String, ExecutorResourceRequest]): Unit = { + execReqs.foreach { case (rName, req) => + require(allocations.contains(rName) && allocations(rName).addresses.size >= req.amount, + s"Resource: ${rName}, with addresses: " + + s"${allocations(rName).addresses.mkString(",")} " + + s"is less than what the user requested: ${req.amount})") + } + } + /** * Gets all allocated resource information for the input component from input resources file and - * discover the remaining 
via discovery scripts. + * the application level Spark configs. It first looks to see if resource were explicitly + * specified in the resources file (this would include specified address assignments and it only + * specified in certain cluster managers) and then it looks at the Spark configs to get any + * others not specified in the file. The resources not explicitly set in the file require a + * discovery script for it to run to get the addresses of the resource. * It also verifies the resource allocation meets required amount for each resource. * @return a map from resource name to resource info */ @@ -171,6 +315,51 @@ private[spark] object ResourceUtils extends Logging { resourceInfoMap } + // create an empty Optional if the string is empty + private def emptyStringToOptional(optStr: String): Optional[String] = { + if (optStr.isEmpty) { + Optional.empty[String] + } else { + Optional.of(optStr) + } + } + + /** + * This function is similar to getOrDiscoverallResources, except for it uses the ResourceProfile + * information instead of the application level configs. + * + * It first looks to see if resource were explicitly specified in the resources file + * (this would include specified address assignments and it only specified in certain + * cluster managers) and then it looks at the ResourceProfile to get + * any others not specified in the file. The resources not explicitly set in the file require a + * discovery script for it to run to get the addresses of the resource. + * It also verifies the resource allocation meets required amount for each resource. 
+ * + * @return a map from resource name to resource info + */ + def getOrDiscoverAllResourcesForResourceProfile( + resourcesFileOpt: Option[String], + componentName: String, + resourceProfile: ResourceProfile, + sparkConf: SparkConf): Map[String, ResourceInformation] = { + val fileAllocated = parseAllocated(resourcesFileOpt, componentName) + val fileAllocResMap = fileAllocated.map(a => (a.id.resourceName, a.toResourceInformation)).toMap + // only want to look at the ResourceProfile for resources not in the resources file + val execReq = ResourceProfile.getCustomExecutorResources(resourceProfile) + val filteredExecreq = execReq.filterNot { case (rname, _) => fileAllocResMap.contains(rname) } + val rpAllocations = filteredExecreq.map { case (rName, execRequest) => + val resourceId = new ResourceID(componentName, rName) + val scriptOpt = emptyStringToOptional(execRequest.discoveryScript) + val vendorOpt = emptyStringToOptional(execRequest.vendor) + val resourceReq = new ResourceRequest(resourceId, execRequest.amount, scriptOpt, vendorOpt) + val addrs = discoverResource(sparkConf, resourceReq).addresses + (rName, new ResourceInformation(rName, addrs)) + } + val allAllocations = fileAllocResMap ++ rpAllocations + assertAllResourceAllocationsMatchResourceProfile(allAllocations, execReq) + allAllocations + } + def logResourceInfo(componentName: String, resources: Map[String, ResourceInformation]) : Unit = { logInfo("==============================================================") @@ -178,32 +367,113 @@ private[spark] object ResourceUtils extends Logging { logInfo("==============================================================") } - // visible for test - private[spark] def discoverResource(resourceRequest: ResourceRequest): ResourceInformation = { - val resourceName = resourceRequest.id.resourceName - val script = resourceRequest.discoveryScript - val result = if (script.nonEmpty) { - val scriptFile = new File(script.get) - // check that script exists and try to execute - 
if (scriptFile.exists()) { - val output = executeAndGetOutput(Seq(script.get), new File(".")) - ResourceInformation.parseJson(output) - } else { - throw new SparkException(s"Resource script: $scriptFile to discover $resourceName " + - "doesn't exist!") + private[spark] def discoverResource( + sparkConf: SparkConf, + resourceRequest: ResourceRequest): ResourceInformation = { + // always put the discovery script plugin as last plugin + val discoveryScriptPlugin = "org.apache.spark.resource.ResourceDiscoveryScriptPlugin" + val pluginClasses = sparkConf.get(RESOURCES_DISCOVERY_PLUGIN) :+ discoveryScriptPlugin + val resourcePlugins = Utils.loadExtensions(classOf[ResourceDiscoveryPlugin], pluginClasses, + sparkConf) + // apply each plugin until one of them returns the information for this resource + var riOption: Optional[ResourceInformation] = Optional.empty() + resourcePlugins.foreach { plugin => + val riOption = plugin.discoverResource(resourceRequest, sparkConf) + if (riOption.isPresent()) { + return riOption.get() } + } + throw new SparkException(s"None of the discovery plugins returned ResourceInformation for " + + s"${resourceRequest.id.resourceName}") + } + + def validateTaskCpusLargeEnough(execCores: Int, taskCpus: Int): Boolean = { + // Number of cores per executor must meet at least one task requirement. + if (execCores < taskCpus) { + throw new SparkException(s"The number of cores per executor (=$execCores) has to be >= " + + s"the number of cpus per task = $taskCpus.") + } + true + } + + // the option executor cores parameter is by the different local modes since it not configured + // via the config + def warnOnWastedResources( + rp: ResourceProfile, + sparkConf: SparkConf, + execCores: Option[Int] = None): Unit = { + // There have been checks on the ResourceProfile to make sure the executor resources were + // specified and are large enough if any task resources were specified. 
+ // Now just do some sanity test and log warnings when it looks like the user will + // waste some resources. + val coresKnown = rp.isCoresLimitKnown + var limitingResource = rp.limitingResource(sparkConf) + var maxTaskPerExec = rp.maxTasksPerExecutor(sparkConf) + val taskCpus = rp.getTaskCpus.getOrElse(sparkConf.get(CPUS_PER_TASK)) + val cores = if (execCores.isDefined) { + execCores.get + } else if (coresKnown) { + rp.getExecutorCores.getOrElse(sparkConf.get(EXECUTOR_CORES)) } else { - throw new SparkException(s"User is expecting to use resource: $resourceName, but " + - "didn't specify a discovery script!") + // can't calculate cores limit + return } - if (!result.name.equals(resourceName)) { - throw new SparkException(s"Error running the resource discovery script ${script.get}: " + - s"script returned resource name ${result.name} and we were expecting $resourceName.") + // when executor cores config isn't set, we can't calculate the real limiting resource + // and number of tasks per executor ahead of time, so calculate it now. + if (!coresKnown) { + val numTasksPerExecCores = cores / taskCpus + val numTasksPerExecCustomResource = rp.maxTasksPerExecutor(sparkConf) + if (limitingResource.isEmpty || + (limitingResource.nonEmpty && numTasksPerExecCores < numTasksPerExecCustomResource)) { + limitingResource = ResourceProfile.CPUS + maxTaskPerExec = numTasksPerExecCores + } + } + val taskReq = ResourceProfile.getCustomTaskResources(rp) + val execReq = ResourceProfile.getCustomExecutorResources(rp) + + if (limitingResource.nonEmpty && !limitingResource.equals(ResourceProfile.CPUS)) { + if ((taskCpus * maxTaskPerExec) < cores) { + val resourceNumSlots = Math.floor(cores/taskCpus).toInt + val message = s"The configuration of cores (exec = ${cores} " + + s"task = ${taskCpus}, runnable tasks = ${resourceNumSlots}) will " + + s"result in wasted resources due to resource ${limitingResource} limiting the " + + s"number of runnable tasks per executor to: ${maxTaskPerExec}. 
Please adjust " + + "your configuration." + if (sparkConf.get(RESOURCES_WARNING_TESTING)) { + throw new SparkException(message) + } else { + logWarning(message) + } + } + } + + taskReq.foreach { case (rName, treq) => + val execAmount = execReq(rName).amount + val numParts = rp.getNumSlotsPerAddress(rName, sparkConf) + // handle fractional + val taskAmount = if (numParts > 1) 1 else treq.amount + if (maxTaskPerExec < (execAmount * numParts / taskAmount)) { + val taskReqStr = s"${taskAmount}/${numParts}" + val resourceNumSlots = Math.floor(execAmount * numParts / taskAmount).toInt + val message = s"The configuration of resource: ${treq.resourceName} " + + s"(exec = ${execAmount}, task = ${taskReqStr}, " + + s"runnable tasks = ${resourceNumSlots}) will " + + s"result in wasted resources due to resource ${limitingResource} limiting the " + + s"number of runnable tasks per executor to: ${maxTaskPerExec}. Please adjust " + + "your configuration." + if (sparkConf.get(RESOURCES_WARNING_TESTING)) { + throw new SparkException(message) + } else { + logWarning(message) + } + } } - result } // known types of resources final val GPU: String = "gpu" final val FPGA: String = "fpga" + + final val RESOURCE_PREFIX: String = "resource" } diff --git a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala new file mode 100644 index 0000000000000..bffb0a2f523b1 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequest.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +/** + * A task resource request. This is used in conjuntion with the ResourceProfile to + * programmatically specify the resources needed for an RDD that will be applied at the + * stage level. + * + * Use TaskResourceRequests class as a convenience API. + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. + */ +private[spark] class TaskResourceRequest(val resourceName: String, val amount: Double) + extends Serializable { + + assert(amount <= 0.5 || amount % 1 == 0, + s"The resource amount ${amount} must be either <= 0.5, or a whole number.") + + override def equals(obj: Any): Boolean = { + obj match { + case that: TaskResourceRequest => + that.getClass == this.getClass && + that.resourceName == resourceName && that.amount == amount + case _ => + false + } + } + + override def hashCode(): Int = Seq(resourceName, amount).hashCode() + + override def toString(): String = { + s"name: $resourceName, amount: $amount" + } +} diff --git a/core/src/main/scala/org/apache/spark/resource/TaskResourceRequests.scala b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequests.scala new file mode 100644 index 0000000000000..9624b51dd158e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/resource/TaskResourceRequests.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ + +import org.apache.spark.resource.ResourceProfile._ + +/** + * A set of task resource requests. This is used in conjuntion with the ResourceProfile to + * programmatically specify the resources needed for an RDD that will be applied at the + * stage level. + * + * This api is currently private until the rest of the pieces are in place and then it + * will become public. + */ +private[spark] class TaskResourceRequests() extends Serializable { + + private val _taskResources = new ConcurrentHashMap[String, TaskResourceRequest]() + + def requests: Map[String, TaskResourceRequest] = _taskResources.asScala.toMap + + /** + * Specify number of cpus per Task. + * + * @param amount Number of cpus to allocate per Task. + */ + def cpus(amount: Int): this.type = { + val treq = new TaskResourceRequest(CPUS, amount) + _taskResources.put(CPUS, treq) + this + } + + /** + * Amount of a particular custom resource(GPU, FPGA, etc) to use. + * + * @param resourceName Name of the resource. + * @param amount Amount requesting as a Double to support fractional resource requests. + * Valid values are less than or equal to 0.5 or whole numbers. 
This essentially + * lets you configure X number of tasks to run on a single resource, + * ie amount equals 0.5 translates into 2 tasks per resource address. + */ + def resource(resourceName: String, amount: Double): this.type = { + val treq = new TaskResourceRequest(resourceName, amount) + _taskResources.put(resourceName, treq) + this + } + + def addRequest(treq: TaskResourceRequest): this.type = { + _taskResources.put(treq.resourceName, treq) + this + } + + override def toString: String = { + s"Task resource requests: ${_taskResources}" + } +} diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala index 97eed540b8f59..4728759e7fb0d 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala @@ -146,3 +146,19 @@ private[spark] trait RpcEndpoint { * [[ThreadSafeRpcEndpoint]] for different messages. */ private[spark] trait ThreadSafeRpcEndpoint extends RpcEndpoint + +/** + * An endpoint that uses a dedicated thread pool for delivering messages. + */ +private[spark] trait IsolatedRpcEndpoint extends RpcEndpoint { + + /** + * How many threads to use for delivering messages. By default, use a single thread. + * + * Note that requesting more than one thread means that the endpoint should be able to handle + * messages arriving from many threads at once, and all the things that entails (including + * messages being delivered to the endpoint out of order). 
+ */ + def threadCount(): Int = 1 + +} diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala index 2f923d7902b05..41d6d146a86d7 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala @@ -17,20 +17,17 @@ package org.apache.spark.rpc.netty -import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap, LinkedBlockingQueue, ThreadPoolExecutor, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap, CountDownLatch} import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ import scala.concurrent.Promise import scala.util.control.NonFatal -import org.apache.spark.{SparkConf, SparkContext, SparkException} +import org.apache.spark.SparkException import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.EXECUTOR_ID -import org.apache.spark.internal.config.Network.RPC_NETTY_DISPATCHER_NUM_THREADS import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc._ -import org.apache.spark.util.ThreadUtils /** * A message dispatcher, responsible for routing RPC messages to the appropriate endpoint(s). @@ -40,20 +37,13 @@ import org.apache.spark.util.ThreadUtils */ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) extends Logging { - private class EndpointData( - val name: String, - val endpoint: RpcEndpoint, - val ref: NettyRpcEndpointRef) { - val inbox = new Inbox(ref, endpoint) - } - - private val endpoints: ConcurrentMap[String, EndpointData] = - new ConcurrentHashMap[String, EndpointData] + private val endpoints: ConcurrentMap[String, MessageLoop] = + new ConcurrentHashMap[String, MessageLoop] private val endpointRefs: ConcurrentMap[RpcEndpoint, RpcEndpointRef] = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef] - // Track the receivers whose inboxes may contain messages. 
- private val receivers = new LinkedBlockingQueue[EndpointData] + private val shutdownLatch = new CountDownLatch(1) + private lazy val sharedLoop = new SharedMessageLoop(nettyEnv.conf, this, numUsableCores) /** * True if the dispatcher has been stopped. Once stopped, all messages posted will be bounced @@ -69,12 +59,30 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte if (stopped) { throw new IllegalStateException("RpcEnv has been stopped") } - if (endpoints.putIfAbsent(name, new EndpointData(name, endpoint, endpointRef)) != null) { + if (endpoints.containsKey(name)) { throw new IllegalArgumentException(s"There is already an RpcEndpoint called $name") } - val data = endpoints.get(name) - endpointRefs.put(data.endpoint, data.ref) - receivers.offer(data) // for the OnStart message + + // This must be done before assigning RpcEndpoint to MessageLoop, as MessageLoop sets Inbox be + // active when registering, and endpointRef must be put into endpointRefs before onStart is + // called. + endpointRefs.put(endpoint, endpointRef) + + var messageLoop: MessageLoop = null + try { + messageLoop = endpoint match { + case e: IsolatedRpcEndpoint => + new DedicatedMessageLoop(name, e, this) + case _ => + sharedLoop.register(name, endpoint) + sharedLoop + } + endpoints.put(name, messageLoop) + } catch { + case NonFatal(e) => + endpointRefs.remove(endpoint) + throw e + } } endpointRef } @@ -85,10 +93,9 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte // Should be idempotent private def unregisterRpcEndpoint(name: String): Unit = { - val data = endpoints.remove(name) - if (data != null) { - data.inbox.stop() - receivers.offer(data) // for the OnStop message + val loop = endpoints.remove(name) + if (loop != null) { + loop.unregister(name) } // Don't clean `endpointRefs` here because it's possible that some messages are being processed // now and they can use `getRpcEndpointRef`. 
So `endpointRefs` will be cleaned in Inbox via @@ -155,14 +162,13 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte message: InboxMessage, callbackIfStopped: (Exception) => Unit): Unit = { val error = synchronized { - val data = endpoints.get(endpointName) + val loop = endpoints.get(endpointName) if (stopped) { Some(new RpcEnvStoppedException()) - } else if (data == null) { + } else if (loop == null) { Some(new SparkException(s"Could not find $endpointName.")) } else { - data.inbox.post(message) - receivers.offer(data) + loop.post(endpointName, message) None } } @@ -177,15 +183,23 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte } stopped = true } - // Stop all endpoints. This will queue all endpoints for processing by the message loops. - endpoints.keySet().asScala.foreach(unregisterRpcEndpoint) - // Enqueue a message that tells the message loops to stop. - receivers.offer(PoisonPill) - threadpool.shutdown() + var stopSharedLoop = false + endpoints.asScala.foreach { case (name, loop) => + unregisterRpcEndpoint(name) + if (!loop.isInstanceOf[SharedMessageLoop]) { + loop.stop() + } else { + stopSharedLoop = true + } + } + if (stopSharedLoop) { + sharedLoop.stop() + } + shutdownLatch.countDown() } def awaitTermination(): Unit = { - threadpool.awaitTermination(Long.MaxValue, TimeUnit.MILLISECONDS) + shutdownLatch.await() } /** @@ -194,61 +208,4 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv, numUsableCores: Int) exte def verify(name: String): Boolean = { endpoints.containsKey(name) } - - private def getNumOfThreads(conf: SparkConf): Int = { - val availableCores = - if (numUsableCores > 0) numUsableCores else Runtime.getRuntime.availableProcessors() - - val modNumThreads = conf.get(RPC_NETTY_DISPATCHER_NUM_THREADS) - .getOrElse(math.max(2, availableCores)) - - conf.get(EXECUTOR_ID).map { id => - val role = if (id == SparkContext.DRIVER_IDENTIFIER) "driver" else "executor" - 
conf.getInt(s"spark.$role.rpc.netty.dispatcher.numThreads", modNumThreads) - }.getOrElse(modNumThreads) - } - - /** Thread pool used for dispatching messages. */ - private val threadpool: ThreadPoolExecutor = { - val numThreads = getNumOfThreads(nettyEnv.conf) - val pool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "dispatcher-event-loop") - for (i <- 0 until numThreads) { - pool.execute(new MessageLoop) - } - pool - } - - /** Message loop used for dispatching messages. */ - private class MessageLoop extends Runnable { - override def run(): Unit = { - try { - while (true) { - try { - val data = receivers.take() - if (data == PoisonPill) { - // Put PoisonPill back so that other MessageLoops can see it. - receivers.offer(PoisonPill) - return - } - data.inbox.process(Dispatcher.this) - } catch { - case NonFatal(e) => logError(e.getMessage, e) - } - } - } catch { - case _: InterruptedException => // exit - case t: Throwable => - try { - // Re-submit a MessageLoop so that Dispatcher will still work if - // UncaughtExceptionHandler decides to not kill JVM. - threadpool.execute(new MessageLoop) - } finally { - throw t - } - } - } - } - - /** A poison endpoint that indicates MessageLoop should exit its message loop. */ - private val PoisonPill = new EndpointData(null, null, null) } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala index 44d2622a42f58..2ed03f7430c32 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala @@ -54,9 +54,7 @@ private[netty] case class RemoteProcessConnectionError(cause: Throwable, remoteA /** * An inbox that stores messages for an [[RpcEndpoint]] and posts messages to it thread-safely. 
*/ -private[netty] class Inbox( - val endpointRef: NettyRpcEndpointRef, - val endpoint: RpcEndpoint) +private[netty] class Inbox(val endpointName: String, val endpoint: RpcEndpoint) extends Logging { inbox => // Give this an alias so we can use it more clearly in closures. @@ -195,7 +193,7 @@ private[netty] class Inbox( * Exposed for testing. */ protected def onDrop(message: InboxMessage): Unit = { - logWarning(s"Drop $message because $endpointRef is stopped") + logWarning(s"Drop $message because endpoint $endpointName is stopped") } /** diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala b/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala new file mode 100644 index 0000000000000..c985c72f2adce --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rpc/netty/MessageLoop.scala @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.rpc.netty + +import java.util.concurrent._ + +import scala.util.control.NonFatal + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.EXECUTOR_ID +import org.apache.spark.internal.config.Network._ +import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcEndpoint} +import org.apache.spark.util.ThreadUtils + +/** + * A message loop used by [[Dispatcher]] to deliver messages to endpoints. + */ +private sealed abstract class MessageLoop(dispatcher: Dispatcher) extends Logging { + + // List of inboxes with pending messages, to be processed by the message loop. + private val active = new LinkedBlockingQueue[Inbox]() + + // Message loop task; should be run in all threads of the message loop's pool. + protected val receiveLoopRunnable = new Runnable() { + override def run(): Unit = receiveLoop() + } + + protected val threadpool: ExecutorService + + private var stopped = false + + def post(endpointName: String, message: InboxMessage): Unit + + def unregister(name: String): Unit + + def stop(): Unit = { + synchronized { + if (!stopped) { + setActive(MessageLoop.PoisonPill) + threadpool.shutdown() + stopped = true + } + } + threadpool.awaitTermination(Long.MaxValue, TimeUnit.MILLISECONDS) + } + + protected final def setActive(inbox: Inbox): Unit = active.offer(inbox) + + private def receiveLoop(): Unit = { + try { + while (true) { + try { + val inbox = active.take() + if (inbox == MessageLoop.PoisonPill) { + // Put PoisonPill back so that other threads can see it. + setActive(MessageLoop.PoisonPill) + return + } + inbox.process(dispatcher) + } catch { + case NonFatal(e) => logError(e.getMessage, e) + } + } + } catch { + case _: InterruptedException => // exit + case t: Throwable => + try { + // Re-submit a receive task so that message delivery will still work if + // UncaughtExceptionHandler decides to not kill JVM. 
+ threadpool.execute(receiveLoopRunnable) + } finally { + throw t + } + } + } +} + +private object MessageLoop { + /** A poison inbox that indicates the message loop should stop processing messages. */ + val PoisonPill = new Inbox(null, null) +} + +/** + * A message loop that serves multiple RPC endpoints, using a shared thread pool. + */ +private class SharedMessageLoop( + conf: SparkConf, + dispatcher: Dispatcher, + numUsableCores: Int) + extends MessageLoop(dispatcher) { + + private val endpoints = new ConcurrentHashMap[String, Inbox]() + + private def getNumOfThreads(conf: SparkConf): Int = { + val availableCores = + if (numUsableCores > 0) numUsableCores else Runtime.getRuntime.availableProcessors() + + val modNumThreads = conf.get(RPC_NETTY_DISPATCHER_NUM_THREADS) + .getOrElse(math.max(2, availableCores)) + + conf.get(EXECUTOR_ID).map { id => + val role = if (id == SparkContext.DRIVER_IDENTIFIER) "driver" else "executor" + conf.getInt(s"spark.$role.rpc.netty.dispatcher.numThreads", modNumThreads) + }.getOrElse(modNumThreads) + } + + /** Thread pool used for dispatching messages. */ + override protected val threadpool: ThreadPoolExecutor = { + val numThreads = getNumOfThreads(conf) + val pool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "dispatcher-event-loop") + for (i <- 0 until numThreads) { + pool.execute(receiveLoopRunnable) + } + pool + } + + override def post(endpointName: String, message: InboxMessage): Unit = { + val inbox = endpoints.get(endpointName) + inbox.post(message) + setActive(inbox) + } + + override def unregister(name: String): Unit = { + val inbox = endpoints.remove(name) + if (inbox != null) { + inbox.stop() + // Mark active to handle the OnStop message. + setActive(inbox) + } + } + + def register(name: String, endpoint: RpcEndpoint): Unit = { + val inbox = new Inbox(name, endpoint) + endpoints.put(name, inbox) + // Mark active to handle the OnStart message. 
+ setActive(inbox) + } +} + +/** + * A message loop that is dedicated to a single RPC endpoint. + */ +private class DedicatedMessageLoop( + name: String, + endpoint: IsolatedRpcEndpoint, + dispatcher: Dispatcher) + extends MessageLoop(dispatcher) { + + private val inbox = new Inbox(name, endpoint) + + override protected val threadpool = if (endpoint.threadCount() > 1) { + ThreadUtils.newDaemonCachedThreadPool(s"dispatcher-$name", endpoint.threadCount()) + } else { + ThreadUtils.newDaemonSingleThreadExecutor(s"dispatcher-$name") + } + + (1 to endpoint.threadCount()).foreach { _ => + threadpool.submit(receiveLoopRunnable) + } + + // Mark active to handle the OnStart message. + setActive(inbox) + + override def post(endpointName: String, message: InboxMessage): Unit = { + require(endpointName == name) + inbox.post(message) + setActive(inbox) + } + + override def unregister(endpointName: String): Unit = synchronized { + require(endpointName == name) + inbox.stop() + // Mark active to handle the OnStop message. 
+ setActive(inbox) + setActive(MessageLoop.PoisonPill) + threadpool.shutdown() + } +} diff --git a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala index 11e2c475d9b45..1bcddaceb3576 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AsyncEventQueue.scala @@ -50,11 +50,11 @@ private class AsyncEventQueue( // if no such conf is specified, use the value specified in // LISTENER_BUS_EVENT_QUEUE_CAPACITY private[scheduler] def capacity: Int = { - val queuesize = conf.getInt(s"spark.scheduler.listenerbus.eventqueue.${name}.capacity", - conf.get(LISTENER_BUS_EVENT_QUEUE_CAPACITY)) - assert(queuesize > 0, s"capacity for event queue $name must be greater than 0, " + - s"but $queuesize is configured.") - queuesize + val queueSize = conf.getInt(s"$LISTENER_BUS_EVENT_QUEUE_PREFIX.$name.capacity", + conf.get(LISTENER_BUS_EVENT_QUEUE_CAPACITY)) + assert(queueSize > 0, s"capacity for event queue $name must be greater than 0, " + + s"but $queueSize is configured.") + queueSize } private val eventQueue = new LinkedBlockingQueue[SparkListenerEvent](capacity) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index b08483267c141..fd5c3e0827bf9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -29,8 +29,6 @@ import scala.collection.mutable.{HashMap, HashSet, ListBuffer} import scala.concurrent.duration._ import scala.util.control.NonFatal -import org.apache.commons.lang3.SerializationUtils - import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} @@ -39,7 +37,8 @@ import org.apache.spark.internal.config import 
org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY import org.apache.spark.network.util.JavaUtils import org.apache.spark.partial.{ApproximateActionListener, ApproximateEvaluator, PartialResult} -import org.apache.spark.rdd.{DeterministicLevel, RDD, RDDCheckpointData} +import org.apache.spark.rdd.{RDD, RDDCheckpointData} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.RpcTimeout import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.BlockManagerHeartbeat @@ -229,7 +228,7 @@ private[spark] class DAGScheduler( /** * Called by the TaskSetManager to report task's starting. */ - def taskStarted(task: Task[_], taskInfo: TaskInfo) { + def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = { eventProcessLoop.post(BeginEvent(task, taskInfo)) } @@ -237,7 +236,7 @@ private[spark] class DAGScheduler( * Called by the TaskSetManager to report that a task has completed * and results are being fetched remotely. */ - def taskGettingResult(taskInfo: TaskInfo) { + def taskGettingResult(taskInfo: TaskInfo): Unit = { eventProcessLoop.post(GettingResultEvent(taskInfo)) } @@ -269,7 +268,7 @@ private[spark] class DAGScheduler( executorUpdates: mutable.Map[(Int, Int), ExecutorMetrics]): Boolean = { listenerBus.post(SparkListenerExecutorMetricsUpdate(execId, accumUpdates, executorUpdates)) - blockManagerMaster.driverEndpoint.askSync[Boolean]( + blockManagerMaster.driverHeartbeatEndPoint.askSync[Boolean]( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(10.minutes, "BlockManagerHeartbeat")) } @@ -393,7 +392,8 @@ private[spark] class DAGScheduler( val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() val stage = new ShuffleMapStage( - id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker) + id, rdd, numTasks, parents, jobId, rdd.creationSite, shuffleDep, mapOutputTracker, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageIdToStage(id) = stage 
shuffleIdToMapStage(shuffleDep.shuffleId) = stage @@ -402,7 +402,8 @@ private[spark] class DAGScheduler( if (!mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) { // Kind of ugly: need to register RDDs with the cache and map output tracker here // since we can't do it in the RDD constructor because # of partitions is unknown - logInfo("Registering RDD " + rdd.id + " (" + rdd.getCreationSite + ")") + logInfo(s"Registering RDD ${rdd.id} (${rdd.getCreationSite}) as input to " + + s"shuffle ${shuffleDep.shuffleId}") mapOutputTracker.registerShuffle(shuffleDep.shuffleId, rdd.partitions.length) } stage @@ -454,7 +455,8 @@ private[spark] class DAGScheduler( checkBarrierStageWithRDDChainPattern(rdd, partitions.toSet.size) val parents = getOrCreateParentStages(rdd, jobId) val id = nextStageId.getAndIncrement() - val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite) + val stage = new ResultStage(id, rdd, func, partitions, parents, jobId, callSite, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageIdToStage(id) = stage updateJobIdStageIdMaps(jobId, stage) stage @@ -560,7 +562,7 @@ private[spark] class DAGScheduler( // caused by recursively visiting val waitingForVisit = new ListBuffer[RDD[_]] waitingForVisit += stage.rdd - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]): Unit = { if (!visited(rdd)) { visited += rdd val rddHasUncachedPartitions = getCacheLocs(rdd).contains(Nil) @@ -591,7 +593,7 @@ private[spark] class DAGScheduler( */ private def updateJobIdStageIdMaps(jobId: Int, stage: Stage): Unit = { @tailrec - def updateJobIdStageIdMapsList(stages: List[Stage]) { + def updateJobIdStageIdMapsList(stages: List[Stage]): Unit = { if (stages.nonEmpty) { val s = stages.head s.jobIds += jobId @@ -622,7 +624,7 @@ private[spark] class DAGScheduler( "Job %d not registered for stage %d even though that stage was registered for the job" .format(job.jobId, stageId)) } else { - def removeStage(stageId: Int) { + def removeStage(stageId: Int): Unit = { 
// data structures based on Stage for (stage <- stageIdToStage.get(stageId)) { if (runningStages.contains(stage)) { @@ -696,9 +698,13 @@ private[spark] class DAGScheduler( val jobId = nextJobId.getAndIncrement() if (partitions.isEmpty) { + val clonedProperties = Utils.cloneProperties(properties) + if (sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION) == null) { + clonedProperties.setProperty(SparkContext.SPARK_JOB_DESCRIPTION, callSite.shortForm) + } val time = clock.getTimeMillis() listenerBus.post( - SparkListenerJobStart(jobId, time, Seq[StageInfo](), properties)) + SparkListenerJobStart(jobId, time, Seq.empty, clonedProperties)) listenerBus.post( SparkListenerJobEnd(jobId, time, JobSucceeded)) // Return immediately if the job is running 0 tasks @@ -710,7 +716,7 @@ private[spark] class DAGScheduler( val waiter = new JobWaiter[U](this, jobId, partitions.size, resultHandler) eventProcessLoop.post(JobSubmitted( jobId, rdd, func2, partitions.toArray, callSite, waiter, - SerializationUtils.clone(properties))) + Utils.cloneProperties(properties))) waiter } @@ -782,7 +788,7 @@ private[spark] class DAGScheduler( val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _] eventProcessLoop.post(JobSubmitted( jobId, rdd, func2, rdd.partitions.indices.toArray, callSite, listener, - SerializationUtils.clone(properties))) + Utils.cloneProperties(properties))) listener.awaitResult() // Will throw an exception if the job fails } @@ -819,7 +825,7 @@ private[spark] class DAGScheduler( this, jobId, 1, (_: Int, r: MapOutputStatistics) => callback(r)) eventProcessLoop.post(MapStageSubmitted( - jobId, dependency, callSite, waiter, SerializationUtils.clone(properties))) + jobId, dependency, callSite, waiter, Utils.cloneProperties(properties))) waiter } @@ -846,7 +852,7 @@ private[spark] class DAGScheduler( eventProcessLoop.post(AllJobsCancelled) } - private[scheduler] def doCancelAllJobs() { + private[scheduler] def doCancelAllJobs(): Unit = { // Cancel all running jobs. 
runningStages.map(_.firstJobId).foreach(handleJobCancellation(_, Option("as part of cancellation of all jobs"))) @@ -857,7 +863,7 @@ private[spark] class DAGScheduler( /** * Cancel all jobs associated with a running or scheduled stage. */ - def cancelStage(stageId: Int, reason: Option[String]) { + def cancelStage(stageId: Int, reason: Option[String]): Unit = { eventProcessLoop.post(StageCancelled(stageId, reason)) } @@ -874,7 +880,7 @@ private[spark] class DAGScheduler( * Resubmit any failed stages. Ordinarily called after a small amount of time has passed since * the last fetch failure. */ - private[scheduler] def resubmitFailedStages() { + private[scheduler] def resubmitFailedStages(): Unit = { if (failedStages.nonEmpty) { // Failed stages may be removed by job cancellation, so failed might be empty even if // the ResubmitFailedStages event has been scheduled. @@ -893,7 +899,7 @@ private[spark] class DAGScheduler( * Submits stages that depend on the given parent stage. Called when the parent stage completes * successfully. */ - private def submitWaitingChildStages(parent: Stage) { + private def submitWaitingChildStages(parent: Stage): Unit = { logTrace(s"Checking if any dependencies of $parent are now runnable") logTrace("running: " + runningStages) logTrace("waiting: " + waitingStages) @@ -915,7 +921,7 @@ private[spark] class DAGScheduler( jobsThatUseStage.find(jobIdToActiveJob.contains) } - private[scheduler] def handleJobGroupCancelled(groupId: String) { + private[scheduler] def handleJobGroupCancelled(groupId: String): Unit = { // Cancel all jobs belonging to this job group. // First finds all active jobs with this group id, and then kill stages for them. 
val activeInGroup = activeJobs.filter { activeJob => @@ -928,7 +934,7 @@ private[spark] class DAGScheduler( Option("part of cancelled job group %s".format(groupId)))) } - private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) { + private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo): Unit = { // Note that there is a chance that this task is launched after the stage is cancelled. // In that case, we wouldn't have the stage anymore in stageIdToStage. val stageAttemptId = @@ -947,7 +953,7 @@ private[spark] class DAGScheduler( stageIdToStage.get(taskSet.stageId).foreach { abortStage(_, reason, exception) } } - private[scheduler] def cleanUpAfterSchedulerStop() { + private[scheduler] def cleanUpAfterSchedulerStop(): Unit = { for (job <- activeJobs) { val error = new SparkException(s"Job ${job.jobId} cancelled because SparkContext was shut down") @@ -965,7 +971,7 @@ private[spark] class DAGScheduler( } } - private[scheduler] def handleGetTaskResult(taskInfo: TaskInfo) { + private[scheduler] def handleGetTaskResult(taskInfo: TaskInfo): Unit = { listenerBus.post(SparkListenerTaskGettingResult(taskInfo)) } @@ -975,7 +981,7 @@ private[spark] class DAGScheduler( partitions: Array[Int], callSite: CallSite, listener: JobListener, - properties: Properties) { + properties: Properties): Unit = { var finalStage: ResultStage = null try { // New stage creation may throw an exception if, for example, jobs are run on a @@ -1039,7 +1045,7 @@ private[spark] class DAGScheduler( dependency: ShuffleDependency[_, _, _], callSite: CallSite, listener: JobListener, - properties: Properties) { + properties: Properties): Unit = { // Submitting this map stage might still require the creation of some parent stages, so make // sure that happens. var finalStage: ShuffleMapStage = null @@ -1079,10 +1085,11 @@ private[spark] class DAGScheduler( } /** Submits stage, but first recursively submits any missing parents. 
*/ - private def submitStage(stage: Stage) { + private def submitStage(stage: Stage): Unit = { val jobId = activeJobForStage(stage) if (jobId.isDefined) { - logDebug("submitStage(" + stage + ")") + logDebug(s"submitStage($stage (name=${stage.name};" + + s"jobs=${stage.jobIds.toSeq.sorted.mkString(",")}))") if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) { val missing = getMissingParentStages(stage).sortBy(_.id) logDebug("missing: " + missing) @@ -1102,10 +1109,19 @@ private[spark] class DAGScheduler( } /** Called when stage's parents are available and we can now do its task. */ - private def submitMissingTasks(stage: Stage, jobId: Int) { + private def submitMissingTasks(stage: Stage, jobId: Int): Unit = { logDebug("submitMissingTasks(" + stage + ")") - // First figure out the indexes of partition ids to compute. + // Before find missing partition, do the intermediate state clean work first. + // The operation here can make sure for the partially completed intermediate stage, + // `findMissingPartitions()` returns all partitions every time. + stage match { + case sms: ShuffleMapStage if stage.isIndeterminate && !sms.isAvailable => + mapOutputTracker.unregisterAllMapOutput(sms.shuffleDep.shuffleId) + case _ => + } + + // Figure out the indexes of partition ids to compute. val partitionsToCompute: Seq[Int] = stage.findMissingPartitions() // Use the scheduling pool, job group, description, etc. from an ActiveJob associated @@ -1346,7 +1362,7 @@ private[spark] class DAGScheduler( * Responds to a task finishing. This is called inside the event loop so it assumes that it can * modify the scheduler's internal state. Use taskEnded() to post a task end event from outside. 
*/ - private[scheduler] def handleTaskCompletion(event: CompletionEvent) { + private[scheduler] def handleTaskCompletion(event: CompletionEvent): Unit = { val task = event.task val stageId = task.stageId @@ -1500,7 +1516,7 @@ private[spark] class DAGScheduler( } } - case FetchFailed(bmAddress, shuffleId, mapId, _, failureMessage) => + case FetchFailed(bmAddress, shuffleId, _, mapIndex, _, failureMessage) => val failedStage = stageIdToStage(task.stageId) val mapStage = shuffleIdToMapStage(shuffleId) @@ -1523,17 +1539,17 @@ private[spark] class DAGScheduler( markStageAsFinished(failedStage, errorMessage = Some(failureMessage), willRetry = !shouldAbortStage) } else { - logDebug(s"Received fetch failure from $task, but its from $failedStage which is no " + - s"longer running") + logDebug(s"Received fetch failure from $task, but it's from $failedStage which is no " + + "longer running") } if (mapStage.rdd.isBarrier()) { // Mark all the map as broken in the map stage, to ensure retry all the tasks on // resubmitted stage attempt. mapOutputTracker.unregisterAllMapOutput(shuffleId) - } else if (mapId != -1) { + } else if (mapIndex != -1) { // Mark the map whose fetch failed as broken in the map stage - mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress) + mapOutputTracker.unregisterMapOutput(shuffleId, mapIndex, bmAddress) } if (failedStage.rdd.isBarrier()) { @@ -1575,7 +1591,7 @@ private[spark] class DAGScheduler( // Note that, if map stage is UNORDERED, we are fine. The shuffle partitioner is // guaranteed to be determinate, so the input data of the reducers will not change // even if the map tasks are re-tried. - if (mapStage.rdd.outputDeterministicLevel == DeterministicLevel.INDETERMINATE) { + if (mapStage.isIndeterminate) { // It's a little tricky to find all the succeeding stages of `mapStage`, because // each stage only know its parents not children. 
Here we traverse the stages from // the leaf nodes (the result stages of active jobs), and rollback all the stages @@ -1603,15 +1619,22 @@ private[spark] class DAGScheduler( activeJobs.foreach(job => collectStagesToRollback(job.finalStage :: Nil)) + // The stages will be rolled back after checking + val rollingBackStages = HashSet[Stage](mapStage) stagesToRollback.foreach { case mapStage: ShuffleMapStage => val numMissingPartitions = mapStage.findMissingPartitions().length if (numMissingPartitions < mapStage.numTasks) { - // TODO: support to rollback shuffle files. - // Currently the shuffle writing is "first write wins", so we can't re-run a - // shuffle map stage and overwrite existing shuffle files. We have to finish - // SPARK-8029 first. - abortStage(mapStage, generateErrorMessage(mapStage), None) + if (sc.getConf.get(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL)) { + val reason = "A shuffle map stage with indeterminate output was failed " + + "and retried. However, Spark can only do this while using the new " + + "shuffle block fetching protocol. Please check the config " + + "'spark.shuffle.useOldFetchProtocol', see more detail in " + + "SPARK-27665 and SPARK-25341." + abortStage(mapStage, reason, None) + } else { + rollingBackStages += mapStage + } } case resultStage: ResultStage if resultStage.activeJob.isDefined => @@ -1623,6 +1646,9 @@ private[spark] class DAGScheduler( case _ => } + logInfo(s"The shuffle map stage $mapStage with indeterminate output was failed, " + + s"we will roll back and rerun below stages which include itself and all its " + + s"indeterminate child stages: $rollingBackStages") } // We expect one executor failure to trigger many FetchFailures in rapid succession, @@ -1862,7 +1888,7 @@ private[spark] class DAGScheduler( clearCacheLocs() } - private[scheduler] def handleExecutorAdded(execId: String, host: String) { + private[scheduler] def handleExecutorAdded(execId: String, host: String): Unit = { // remove from failedEpoch(execId) ? 
if (failedEpoch.contains(execId)) { logInfo("Host added was in lost list earlier: " + host) @@ -1870,7 +1896,7 @@ private[spark] class DAGScheduler( } } - private[scheduler] def handleStageCancellation(stageId: Int, reason: Option[String]) { + private[scheduler] def handleStageCancellation(stageId: Int, reason: Option[String]): Unit = { stageIdToStage.get(stageId) match { case Some(stage) => val jobsThatUseStage: Array[Int] = stage.jobIds.toArray @@ -1888,7 +1914,7 @@ private[spark] class DAGScheduler( } } - private[scheduler] def handleJobCancellation(jobId: Int, reason: Option[String]) { + private[scheduler] def handleJobCancellation(jobId: Int, reason: Option[String]): Unit = { if (!jobIdToStageIds.contains(jobId)) { logDebug("Trying to cancel unregistered job " + jobId) } else { @@ -2010,7 +2036,7 @@ private[spark] class DAGScheduler( // caused by recursively visiting val waitingForVisit = new ListBuffer[RDD[_]] waitingForVisit += stage.rdd - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]): Unit = { if (!visitedRdds(rdd)) { visitedRdds += rdd for (dep <- rdd.dependencies) { @@ -2103,7 +2129,7 @@ private[spark] class DAGScheduler( listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobSucceeded)) } - def stop() { + def stop(): Unit = { messageScheduler.shutdownNow() eventProcessLoop.stop() taskScheduler.stop() diff --git a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala index 48eb2da3015f8..8c23388b37a3d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala @@ -17,24 +17,20 @@ package org.apache.spark.scheduler -import java.io._ import java.net.URI -import java.nio.charset.StandardCharsets -import scala.collection.mutable.{ArrayBuffer, Map} +import scala.collection.mutable import org.apache.hadoop.conf.Configuration -import 
org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} -import org.apache.hadoop.fs.permission.FsPermission import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods._ import org.apache.spark.{SPARK_VERSION, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogFileWriter import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ -import org.apache.spark.io.CompressionCodec import org.apache.spark.util.{JsonProtocol, Utils} /** @@ -42,13 +38,12 @@ import org.apache.spark.util.{JsonProtocol, Utils} * * Event logging is specified by the following configurable parameters: * spark.eventLog.enabled - Whether event logging is enabled. - * spark.eventLog.logBlockUpdates.enabled - Whether to log block updates - * spark.eventLog.compress - Whether to compress logged events - * spark.eventLog.compression.codec - The codec to compress logged events - * spark.eventLog.overwrite - Whether to overwrite any existing files. * spark.eventLog.dir - Path to the directory in which events are logged. - * spark.eventLog.buffer.kb - Buffer size to use when writing to output streams + * spark.eventLog.logBlockUpdates.enabled - Whether to log block updates * spark.eventLog.logStageExecutorMetrics.enabled - Whether to log stage executor metrics + * + * Event log file writer maintains its own parameters: refer the doc of [[EventLogFileWriter]] + * and its descendant for more details. 
*/ private[spark] class EventLoggingListener( appId: String, @@ -64,96 +59,43 @@ private[spark] class EventLoggingListener( this(appId, appAttemptId, logBaseDir, sparkConf, SparkHadoopUtil.get.newConfiguration(sparkConf)) - private val shouldCompress = sparkConf.get(EVENT_LOG_COMPRESS) - private val shouldOverwrite = sparkConf.get(EVENT_LOG_OVERWRITE) - private val shouldLogBlockUpdates = sparkConf.get(EVENT_LOG_BLOCK_UPDATES) - private val shouldAllowECLogs = sparkConf.get(EVENT_LOG_ALLOW_EC) - private val shouldLogStageExecutorMetrics = sparkConf.get(EVENT_LOG_STAGE_EXECUTOR_METRICS) - private val testing = sparkConf.get(EVENT_LOG_TESTING) - private val outputBufferSize = sparkConf.get(EVENT_LOG_OUTPUT_BUFFER_SIZE).toInt - private val fileSystem = Utils.getHadoopFileSystem(logBaseDir, hadoopConf) - private val compressionCodec = - if (shouldCompress) { - Some(CompressionCodec.createCodec(sparkConf, sparkConf.get(EVENT_LOG_COMPRESSION_CODEC))) - } else { - None - } - // Visible for tests only. - private[scheduler] val compressionCodecName = compressionCodec.map { c => - CompressionCodec.getShortName(c.getClass.getName) - } - - // Only defined if the file system scheme is not local - private var hadoopDataStream: Option[FSDataOutputStream] = None - - private var writer: Option[PrintWriter] = None + // For testing. + private[scheduler] val logWriter: EventLogFileWriter = + EventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) // For testing. Keep track of all JSON serialized events that have been logged. - private[scheduler] val loggedEvents = new ArrayBuffer[JValue] + private[scheduler] val loggedEvents = new mutable.ArrayBuffer[JValue] - // Visible for tests only. 
- private[scheduler] val logPath = getLogPath(logBaseDir, appId, appAttemptId, compressionCodecName) + private val shouldLogBlockUpdates = sparkConf.get(EVENT_LOG_BLOCK_UPDATES) + private val shouldLogStageExecutorMetrics = sparkConf.get(EVENT_LOG_STAGE_EXECUTOR_METRICS) + private val testing = sparkConf.get(EVENT_LOG_TESTING) // map of (stageId, stageAttempt) to executor metric peaks per executor/driver for the stage - private val liveStageExecutorMetrics = Map.empty[(Int, Int), Map[String, ExecutorMetrics]] + private val liveStageExecutorMetrics = + mutable.HashMap.empty[(Int, Int), mutable.HashMap[String, ExecutorMetrics]] /** * Creates the log file in the configured log directory. */ - def start() { - if (!fileSystem.getFileStatus(new Path(logBaseDir)).isDirectory) { - throw new IllegalArgumentException(s"Log directory $logBaseDir is not a directory.") - } - - val workingPath = logPath + IN_PROGRESS - val path = new Path(workingPath) - val uri = path.toUri - val defaultFs = FileSystem.getDefaultUri(hadoopConf).getScheme - val isDefaultLocal = defaultFs == null || defaultFs == "file" - - if (shouldOverwrite && fileSystem.delete(path, true)) { - logWarning(s"Event log $path already exists. Overwriting...") - } - - /* The Hadoop LocalFileSystem (r1.0.4) has known issues with syncing (HADOOP-7844). - * Therefore, for local files, use FileOutputStream instead. 
*/ - val dstream = - if ((isDefaultLocal && uri.getScheme == null) || uri.getScheme == "file") { - new FileOutputStream(uri.getPath) - } else { - hadoopDataStream = Some(if (shouldAllowECLogs) { - fileSystem.create(path) - } else { - SparkHadoopUtil.createNonECFile(fileSystem, path) - }) - hadoopDataStream.get - } + def start(): Unit = { + logWriter.start() + initEventLog() + } - try { - val cstream = compressionCodec.map(_.compressedOutputStream(dstream)).getOrElse(dstream) - val bstream = new BufferedOutputStream(cstream, outputBufferSize) - - EventLoggingListener.initEventLog(bstream, testing, loggedEvents) - fileSystem.setPermission(path, LOG_FILE_PERMISSIONS) - writer = Some(new PrintWriter(bstream)) - logInfo("Logging events to %s".format(logPath)) - } catch { - case e: Exception => - dstream.close() - throw e + private def initEventLog(): Unit = { + val metadata = SparkListenerLogStart(SPARK_VERSION) + val eventJson = JsonProtocol.logStartToJson(metadata) + val metadataJson = compact(eventJson) + logWriter.writeEvent(metadataJson, flushLogger = true) + if (testing && loggedEvents != null) { + loggedEvents += eventJson } } /** Log the event as JSON. 
*/ - private def logEvent(event: SparkListenerEvent, flushLogger: Boolean = false) { + private def logEvent(event: SparkListenerEvent, flushLogger: Boolean = false): Unit = { val eventJson = JsonProtocol.sparkEventToJson(event) - // scalastyle:off println - writer.foreach(_.println(compact(render(eventJson)))) - // scalastyle:on println - if (flushLogger) { - writer.foreach(_.flush()) - hadoopDataStream.foreach(_.hflush()) - } + logWriter.writeEvent(compact(render(eventJson)), flushLogger) if (testing) { loggedEvents += eventJson } @@ -165,7 +107,7 @@ private[spark] class EventLoggingListener( if (shouldLogStageExecutorMetrics) { // record the peak metrics for the new stage liveStageExecutorMetrics.put((event.stageInfo.stageId, event.stageInfo.attemptNumber()), - Map.empty[String, ExecutorMetrics]) + mutable.HashMap.empty[String, ExecutorMetrics]) } } @@ -299,32 +241,9 @@ private[spark] class EventLoggingListener( } } - /** - * Stop logging events. The event log file will be renamed so that it loses the - * ".inprogress" suffix. - */ + /** Stop logging events. */ def stop(): Unit = { - writer.foreach(_.close()) - - val target = new Path(logPath) - if (fileSystem.exists(target)) { - if (shouldOverwrite) { - logWarning(s"Event log $target already exists. Overwriting...") - if (!fileSystem.delete(target, true)) { - logWarning(s"Error deleting $target") - } - } else { - throw new IOException("Target log file already exists (%s)".format(logPath)) - } - } - fileSystem.rename(new Path(logPath + IN_PROGRESS), target) - // touch file to ensure modtime is current across those filesystems where rename() - // does not set it, -and which support setTimes(); it's a no-op on most object stores - try { - fileSystem.setTimes(target, System.currentTimeMillis(), -1) - } catch { - case e: Exception => logDebug(s"failed to set time of $target", e) - } + logWriter.stop() } private[spark] def redactEvent( @@ -336,8 +255,10 @@ private[spark] class EventLoggingListener( // ... 
// where jvmInformation, sparkProperties, etc. are sequence of tuples. // We go through the various of properties and redact sensitive information from them. - val redactedProps = event.environmentDetails.map{ case (name, props) => - name -> Utils.redact(sparkConf, props) + val noRedactProps = Seq("Classpath Entries") + val redactedProps = event.environmentDetails.map { + case (name, props) if noRedactProps.contains(name) => name -> props + case (name, props) => name -> Utils.redact(sparkConf, props) } SparkListenerEnvironmentUpdate(redactedProps) } @@ -345,93 +266,7 @@ private[spark] class EventLoggingListener( } private[spark] object EventLoggingListener extends Logging { - // Suffix applied to the names of files still being written by applications. - val IN_PROGRESS = ".inprogress" val DEFAULT_LOG_DIR = "/tmp/spark-events" // Dummy stage key used by driver in executor metrics updates val DRIVER_STAGE_KEY = (-1, -1) - - private val LOG_FILE_PERMISSIONS = new FsPermission(Integer.parseInt("770", 8).toShort) - - // A cache for compression codecs to avoid creating the same codec many times - private val codecMap = Map.empty[String, CompressionCodec] - - /** - * Write metadata about an event log to the given stream. - * The metadata is encoded in the first line of the event log as JSON. - * - * @param logStream Raw output stream to the event log file. - */ - def initEventLog( - logStream: OutputStream, - testing: Boolean, - loggedEvents: ArrayBuffer[JValue]): Unit = { - val metadata = SparkListenerLogStart(SPARK_VERSION) - val eventJson = JsonProtocol.logStartToJson(metadata) - val metadataJson = compact(eventJson) + "\n" - logStream.write(metadataJson.getBytes(StandardCharsets.UTF_8)) - if (testing && loggedEvents != null) { - loggedEvents += eventJson - } - } - - /** - * Return a file-system-safe path to the log file for the given application. 
- * - * Note that because we currently only create a single log file for each application, - * we must encode all the information needed to parse this event log in the file name - * instead of within the file itself. Otherwise, if the file is compressed, for instance, - * we won't know which codec to use to decompress the metadata needed to open the file in - * the first place. - * - * The log file name will identify the compression codec used for the contents, if any. - * For example, app_123 for an uncompressed log, app_123.lzf for an LZF-compressed log. - * - * @param logBaseDir Directory where the log file will be written. - * @param appId A unique app ID. - * @param appAttemptId A unique attempt id of appId. May be the empty string. - * @param compressionCodecName Name to identify the codec used to compress the contents - * of the log, or None if compression is not enabled. - * @return A path which consists of file-system-safe characters. - */ - def getLogPath( - logBaseDir: URI, - appId: String, - appAttemptId: Option[String], - compressionCodecName: Option[String] = None): String = { - val base = new Path(logBaseDir).toString.stripSuffix("/") + "/" + Utils.sanitizeDirName(appId) - val codec = compressionCodecName.map("." + _).getOrElse("") - if (appAttemptId.isDefined) { - base + "_" + Utils.sanitizeDirName(appAttemptId.get) + codec - } else { - base + codec - } - } - - /** - * Opens an event log file and returns an input stream that contains the event data. - * - * @return input stream that holds one JSON record per line. 
- */ - def openEventLog(log: Path, fs: FileSystem): InputStream = { - val in = new BufferedInputStream(fs.open(log)) - try { - val codec = codecName(log).map { c => - codecMap.getOrElseUpdate(c, CompressionCodec.createCodec(new SparkConf, c)) - } - codec.map(_.compressedContinuousInputStream(in)).getOrElse(in) - } catch { - case e: Throwable => - in.close() - throw e - } - } - - def codecName(log: Path): Option[String] = { - // Compression codec is encoded as an extension, e.g. app_123.lzf - // Since we sanitize the app ID to not include periods, it is safe to split on it - val logName = log.getName.stripSuffix(IN_PROGRESS) - logName.split("\\.").tail.lastOption - } - } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala index 46a35b6a2eaf9..ee31093ec0652 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorLossReason.scala @@ -58,3 +58,11 @@ private [spark] object LossReasonPending extends ExecutorLossReason("Pending los private[spark] case class SlaveLost(_message: String = "Slave lost", workerLost: Boolean = false) extends ExecutorLossReason(_message) + +/** + * A loss reason that means the executor is marked for decommissioning. + * + * This is used by the task scheduler to remove state associated with the executor, but + not yet to fail any tasks that were running in the executor before the executor is "fully" lost.
+ */ +private [spark] object ExecutorDecommission extends ExecutorLossReason("Executor decommission.") diff --git a/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala index 02047609edd96..fd04db8c09d76 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ExecutorResourceInfo.scala @@ -25,10 +25,15 @@ import org.apache.spark.resource.{ResourceAllocator, ResourceInformation} * information. * @param name Resource name * @param addresses Resource addresses provided by the executor + * @param numParts Number of ways each resource is subdivided when scheduling tasks */ -private[spark] class ExecutorResourceInfo(name: String, addresses: Seq[String]) +private[spark] class ExecutorResourceInfo( + name: String, + addresses: Seq[String], + numParts: Int) extends ResourceInformation(name, addresses.toArray) with ResourceAllocator { override protected def resourceName = this.name override protected def resourceAddresses = this.addresses + override protected def slotsPerAddress: Int = numParts } diff --git a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala index 66ab9a52b7781..2d26a314e7a62 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/InputFormatInfo.scala @@ -64,7 +64,7 @@ class InputFormatInfo(val configuration: Configuration, val inputFormatClazz: Cl case _ => false } - private def validate() { + private def validate(): Unit = { logDebug("validate InputFormatInfo : " + inputFormatClazz + ", path " + path) try { diff --git a/core/src/main/scala/org/apache/spark/scheduler/JobWaiter.scala b/core/src/main/scala/org/apache/spark/scheduler/JobWaiter.scala index 65d7184231e24..feed831620840 100644 --- 
a/core/src/main/scala/org/apache/spark/scheduler/JobWaiter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/JobWaiter.scala @@ -49,7 +49,7 @@ private[spark] class JobWaiter[T]( * asynchronously. After the low level scheduler cancels all the tasks belonging to this job, it * will fail this job with a SparkException. */ - def cancel() { + def cancel(): Unit = { dagScheduler.cancelJob(jobId, None) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 302ebd30da228..95b0096cade38 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -186,6 +186,17 @@ private[spark] class LiveListenerBus(conf: SparkConf) { metricsSystem.registerSource(metrics) } + /** + * For testing only. Wait until there are no more events in the queue, or until the default + * wait time has elapsed. Throw `TimeoutException` if the specified time elapsed before the queue + * emptied. + * Exposed for testing. + */ + @throws(classOf[TimeoutException]) + private[spark] def waitUntilEmpty(): Unit = { + waitUntilEmpty(TimeUnit.SECONDS.toMillis(10)) + } + /** * For testing only. Wait until there are no more events in the queue, or until the specified * time has elapsed. Throw `TimeoutException` if the specified time elapsed before the queue @@ -215,10 +226,8 @@ private[spark] class LiveListenerBus(conf: SparkConf) { return } - synchronized { - queues.asScala.foreach(_.stop()) - queues.clear() - } + queues.asScala.foreach(_.stop()) + queues.clear() } // For testing only. 
diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index 64f0a060a247c..7f8893ff3b9d8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -43,6 +43,12 @@ private[spark] sealed trait MapStatus { * necessary for correctness, since block fetchers are allowed to skip zero-size blocks. */ def getSizeForBlock(reduceId: Int): Long + + /** + * The unique ID of this shuffle map task. If spark.shuffle.useOldFetchProtocol is enabled, the + * partitionId of the task is used; otherwise taskContext.taskAttemptId is used. + */ + def mapId: Long } @@ -56,11 +62,14 @@ private[spark] object MapStatus { .map(_.conf.get(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS)) .getOrElse(config.SHUFFLE_MIN_NUM_PARTS_TO_HIGHLY_COMPRESS.defaultValue.get) - def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = { + def apply( + loc: BlockManagerId, + uncompressedSizes: Array[Long], + mapTaskId: Long): MapStatus = { if (uncompressedSizes.length > minPartitionsToUseHighlyCompressMapStatus) { - HighlyCompressedMapStatus(loc, uncompressedSizes) + HighlyCompressedMapStatus(loc, uncompressedSizes, mapTaskId) } else { - new CompressedMapStatus(loc, uncompressedSizes) + new CompressedMapStatus(loc, uncompressedSizes, mapTaskId) } } @@ -100,16 +109,19 @@ private[spark] object MapStatus { * * @param loc location where the task is being executed. * @param compressedSizes size of the blocks, indexed by reduce partition id.
+ * @param _mapTaskId unique task id for the task */ private[spark] class CompressedMapStatus( private[this] var loc: BlockManagerId, - private[this] var compressedSizes: Array[Byte]) + private[this] var compressedSizes: Array[Byte], + private[this] var _mapTaskId: Long) extends MapStatus with Externalizable { - protected def this() = this(null, null.asInstanceOf[Array[Byte]]) // For deserialization only + // For deserialization only + protected def this() = this(null, null.asInstanceOf[Array[Byte]], -1) - def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) { - this(loc, uncompressedSizes.map(MapStatus.compressSize)) + def this(loc: BlockManagerId, uncompressedSizes: Array[Long], mapTaskId: Long) { + this(loc, uncompressedSizes.map(MapStatus.compressSize), mapTaskId) } override def location: BlockManagerId = loc @@ -118,10 +130,13 @@ private[spark] class CompressedMapStatus( MapStatus.decompressSize(compressedSizes(reduceId)) } + override def mapId: Long = _mapTaskId + override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { loc.writeExternal(out) out.writeInt(compressedSizes.length) out.write(compressedSizes) + out.writeLong(_mapTaskId) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { @@ -129,6 +144,7 @@ private[spark] class CompressedMapStatus( val len = in.readInt() compressedSizes = new Array[Byte](len) in.readFully(compressedSizes) + _mapTaskId = in.readLong() } } @@ -142,20 +158,23 @@ private[spark] class CompressedMapStatus( * @param emptyBlocks a bitmap tracking which blocks are empty * @param avgSize average size of the non-empty and non-huge blocks * @param hugeBlockSizes sizes of huge blocks by their reduceId. 
+ * @param _mapTaskId unique task id for the task */ private[spark] class HighlyCompressedMapStatus private ( private[this] var loc: BlockManagerId, private[this] var numNonEmptyBlocks: Int, private[this] var emptyBlocks: RoaringBitmap, private[this] var avgSize: Long, - private[this] var hugeBlockSizes: scala.collection.Map[Int, Byte]) + private[this] var hugeBlockSizes: scala.collection.Map[Int, Byte], + private[this] var _mapTaskId: Long) extends MapStatus with Externalizable { // loc could be null when the default constructor is called during deserialization - require(loc == null || avgSize > 0 || hugeBlockSizes.size > 0 || numNonEmptyBlocks == 0, + require(loc == null || avgSize > 0 || hugeBlockSizes.size > 0 + || numNonEmptyBlocks == 0 || _mapTaskId > 0, "Average size can only be zero for map stages that produced no output") - protected def this() = this(null, -1, null, -1, null) // For deserialization only + protected def this() = this(null, -1, null, -1, null, -1) // For deserialization only override def location: BlockManagerId = loc @@ -171,6 +190,8 @@ private[spark] class HighlyCompressedMapStatus private ( } } + override def mapId: Long = _mapTaskId + override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { loc.writeExternal(out) emptyBlocks.writeExternal(out) @@ -180,6 +201,7 @@ private[spark] class HighlyCompressedMapStatus private ( out.writeInt(kv._1) out.writeByte(kv._2) } + out.writeLong(_mapTaskId) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { @@ -195,11 +217,15 @@ private[spark] class HighlyCompressedMapStatus private ( hugeBlockSizesImpl(block) = size } hugeBlockSizes = hugeBlockSizesImpl + _mapTaskId = in.readLong() } } private[spark] object HighlyCompressedMapStatus { - def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): HighlyCompressedMapStatus = { + def apply( + loc: BlockManagerId, + uncompressedSizes: Array[Long], + mapTaskId: Long): HighlyCompressedMapStatus = { // We 
must keep track of which blocks are empty so that we don't report a zero-sized // block as being non-empty (or vice-versa) when using the average block size. var i = 0 @@ -240,6 +266,6 @@ private[spark] object HighlyCompressedMapStatus { emptyBlocks.trim() emptyBlocks.runOptimize() new HighlyCompressedMapStatus(loc, numNonEmptyBlocks, emptyBlocks, avgSize, - hugeBlockSizes) + hugeBlockSizes, mapTaskId) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala index f4b0ab10155a2..2e2851eb9070b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Pool.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Pool.scala @@ -59,14 +59,14 @@ private[spark] class Pool( } } - override def addSchedulable(schedulable: Schedulable) { + override def addSchedulable(schedulable: Schedulable): Unit = { require(schedulable != null) schedulableQueue.add(schedulable) schedulableNameToSchedulable.put(schedulable.name, schedulable) schedulable.parent = this } - override def removeSchedulable(schedulable: Schedulable) { + override def removeSchedulable(schedulable: Schedulable): Unit = { schedulableQueue.remove(schedulable) schedulableNameToSchedulable.remove(schedulable.name) } @@ -84,10 +84,14 @@ private[spark] class Pool( null } - override def executorLost(executorId: String, host: String, reason: ExecutorLossReason) { + override def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit = { schedulableQueue.asScala.foreach(_.executorLost(executorId, host, reason)) } + override def executorDecommission(executorId: String): Unit = { + schedulableQueue.asScala.foreach(_.executorDecommission(executorId)) + } + override def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean = { var shouldRevive = false for (schedulable <- schedulableQueue.asScala) { @@ -106,14 +110,14 @@ private[spark] class Pool( sortedTaskSetQueue } - def increaseRunningTasks(taskNum: Int) { + def 
increaseRunningTasks(taskNum: Int): Unit = { runningTasks += taskNum if (parent != null) { parent.increaseRunningTasks(taskNum) } } - def decreaseRunningTasks(taskNum: Int) { + def decreaseRunningTasks(taskNum: Int): Unit = { runningTasks -= taskNum if (parent != null) { parent.decreaseRunningTasks(taskNum) diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 226c23733c870..60b6fe7a60915 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -19,7 +19,7 @@ package org.apache.spark.scheduler import java.io.{EOFException, InputStream, IOException} -import scala.io.Source +import scala.io.{Codec, Source} import com.fasterxml.jackson.core.JsonParseException import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException @@ -48,13 +48,15 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { * @param eventsFilter Filter function to select JSON event strings in the log data stream that * should be parsed and replayed. When not specified, all event strings in the log data * are parsed and replayed. + * @return whether it succeeded in replaying the log file entirely without error (including + * HaltReplayException); false otherwise.
*/ def replay( logData: InputStream, sourceName: String, maybeTruncated: Boolean = false, - eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { - val lines = Source.fromInputStream(logData).getLines() + eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Boolean = { + val lines = Source.fromInputStream(logData)(Codec.UTF8).getLines() replay(lines, sourceName, maybeTruncated, eventsFilter) } @@ -66,7 +68,7 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { lines: Iterator[String], sourceName: String, maybeTruncated: Boolean, - eventsFilter: ReplayEventsFilter): Unit = { + eventsFilter: ReplayEventsFilter): Boolean = { var currentLine: String = null var lineNumber: Int = 0 val unrecognizedEvents = new scala.collection.mutable.HashSet[String] @@ -114,15 +116,18 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { } } } + true } catch { case e: HaltReplayException => // Just stop replay. - case _: EOFException if maybeTruncated => + false + case _: EOFException if maybeTruncated => false case ioe: IOException => throw ioe case e: Exception => logError(s"Exception parsing Spark event log: $sourceName", e) logError(s"Malformed line #$lineNumber: $currentLine\n") + false } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala index d1687830ff7bf..7fdc3186e86bd 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ResultStage.scala @@ -34,8 +34,9 @@ private[spark] class ResultStage( val partitions: Array[Int], parents: List[Stage], firstJobId: Int, - callSite: CallSite) - extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite) { + callSite: CallSite, + resourceProfileId: Int) + extends Stage(id, rdd, partitions.length, parents, firstJobId, callSite, resourceProfileId) { /** * The active job for this result stage. 
Will be empty if the job has already finished diff --git a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala index b6f88ed0a93aa..8cc239c81d11a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Schedulable.scala @@ -43,6 +43,7 @@ private[spark] trait Schedulable { def removeSchedulable(schedulable: Schedulable): Unit def getSchedulableByName(name: String): Schedulable def executorLost(executorId: String, host: String, reason: ExecutorLossReason): Unit + def executorDecommission(executorId: String): Unit def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean def getSortedTaskSetQueue: ArrayBuffer[TaskSetManager] } diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala index c85c74f2fb973..8f6a22177a5b8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulableBuilder.scala @@ -45,11 +45,11 @@ private[spark] trait SchedulableBuilder { private[spark] class FIFOSchedulableBuilder(val rootPool: Pool) extends SchedulableBuilder with Logging { - override def buildPools() { + override def buildPools(): Unit = { // nothing } - override def addTaskSetManager(manager: Schedulable, properties: Properties) { + override def addTaskSetManager(manager: Schedulable, properties: Properties): Unit = { rootPool.addSchedulable(manager) } } @@ -70,7 +70,7 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) val DEFAULT_MINIMUM_SHARE = 0 val DEFAULT_WEIGHT = 1 - override def buildPools() { + override def buildPools(): Unit = { var fileData: Option[(InputStream, String)] = None try { fileData = schedulerAllocFile.map { f => @@ -106,7 +106,7 @@ private[spark] class FairSchedulableBuilder(val 
rootPool: Pool, conf: SparkConf) buildDefaultPool() } - private def buildDefaultPool() { + private def buildDefaultPool(): Unit = { if (rootPool.getSchedulableByName(DEFAULT_POOL_NAME) == null) { val pool = new Pool(DEFAULT_POOL_NAME, DEFAULT_SCHEDULING_MODE, DEFAULT_MINIMUM_SHARE, DEFAULT_WEIGHT) @@ -116,7 +116,7 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) } } - private def buildFairSchedulerPool(is: InputStream, fileName: String) { + private def buildFairSchedulerPool(is: InputStream, fileName: String): Unit = { val xml = XML.load(is) for (poolNode <- (xml \\ POOLS_PROPERTY)) { @@ -180,7 +180,7 @@ private[spark] class FairSchedulableBuilder(val rootPool: Pool, conf: SparkConf) } } - override def addTaskSetManager(manager: Schedulable, properties: Properties) { + override def addTaskSetManager(manager: Schedulable, properties: Properties): Unit = { val poolName = if (properties != null) { properties.getProperty(FAIR_SCHEDULER_PROPERTIES, DEFAULT_POOL_NAME) } else { diff --git a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala index 9159d2a0158d5..4752353046c19 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/SchedulerBackend.scala @@ -27,6 +27,9 @@ private[spark] trait SchedulerBackend { def start(): Unit def stop(): Unit + /** + * Update the current offers and schedule tasks + */ def reviveOffers(): Unit def defaultParallelism(): Int diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala index 1b44d0aee3195..be1984de9837f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapStage.scala @@ -42,8 +42,9 @@ private[spark] class ShuffleMapStage( firstJobId: Int, 
callSite: CallSite, val shuffleDep: ShuffleDependency[_, _, _], - mapOutputTrackerMaster: MapOutputTrackerMaster) - extends Stage(id, rdd, numTasks, parents, firstJobId, callSite) { + mapOutputTrackerMaster: MapOutputTrackerMaster, + resourceProfileId: Int) + extends Stage(id, rdd, numTasks, parents, firstJobId, callSite, resourceProfileId) { private[this] var _mapStageJobs: List[ActiveJob] = Nil diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala index 710f5eb211dde..4c0c30a3caf67 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala @@ -23,7 +23,7 @@ import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{config, Logging} import org.apache.spark.rdd.RDD /** @@ -91,7 +91,12 @@ private[spark] class ShuffleMapTask( val rdd = rddAndDep._1 val dep = rddAndDep._2 - dep.shuffleWriterProcessor.write(rdd, dep, partitionId, context, partition) + // While we use the old shuffle fetch protocol, we use partitionId as mapId in the + // ShuffleBlockId construction. 
+ val mapId = if (SparkEnv.get.conf.get(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL)) { + partitionId + } else context.taskAttemptId() + dep.shuffleWriterProcessor.write(rdd, dep, mapId, context, partition) } override def preferredLocations: Seq[TaskLocation] = preferredLocs diff --git a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala index 26cca334d3bd5..ae7924d66a301 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Stage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Stage.scala @@ -21,7 +21,7 @@ import scala.collection.mutable.HashSet import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.rdd.{DeterministicLevel, RDD} import org.apache.spark.util.CallSite /** @@ -59,7 +59,8 @@ private[scheduler] abstract class Stage( val numTasks: Int, val parents: List[Stage], val firstJobId: Int, - val callSite: CallSite) + val callSite: CallSite, + val resourceProfileId: Int) extends Logging { val numPartitions = rdd.partitions.length @@ -79,7 +80,8 @@ private[scheduler] abstract class Stage( * StageInfo to tell SparkListeners when a job starts (which happens before any stage attempts * have been created). */ - private var _latestInfo: StageInfo = StageInfo.fromStage(this, nextAttemptId) + private var _latestInfo: StageInfo = + StageInfo.fromStage(this, nextAttemptId, resourceProfileId = resourceProfileId) /** * Set of stage attempt IDs that have failed. 
We keep track of these failures in order to avoid @@ -100,7 +102,8 @@ private[scheduler] abstract class Stage( val metrics = new TaskMetrics metrics.register(rdd.sparkContext) _latestInfo = StageInfo.fromStage( - this, nextAttemptId, Some(numPartitionsToCompute), metrics, taskLocalityPreferences) + this, nextAttemptId, Some(numPartitionsToCompute), metrics, taskLocalityPreferences, + resourceProfileId = resourceProfileId) nextAttemptId += 1 } @@ -116,4 +119,8 @@ private[scheduler] abstract class Stage( /** Returns the sequence of partition ids that are missing (i.e. needs to be computed). */ def findMissingPartitions(): Seq[Int] + + def isIndeterminate: Boolean = { + rdd.outputDeterministicLevel == DeterministicLevel.INDETERMINATE + } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index e3216151462bd..556478d83cf39 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -38,7 +38,8 @@ class StageInfo( val details: String, val taskMetrics: TaskMetrics = null, private[spark] val taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, - private[spark] val shuffleDepId: Option[Int] = None) { + private[spark] val shuffleDepId: Option[Int] = None, + val resourceProfileId: Int) { /** When this stage was submitted from the DAGScheduler to a TaskScheduler. */ var submissionTime: Option[Long] = None /** Time when all tasks in the stage completed or when the stage was cancelled. 
*/ @@ -52,7 +53,7 @@ class StageInfo( */ val accumulables = HashMap[Long, AccumulableInfo]() - def stageFailed(reason: String) { + def stageFailed(reason: String): Unit = { failureReason = Some(reason) completionTime = Some(System.currentTimeMillis) } @@ -87,7 +88,8 @@ private[spark] object StageInfo { attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, - taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty + taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, + resourceProfileId: Int ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos @@ -105,6 +107,7 @@ private[spark] object StageInfo { stage.details, taskMetrics, taskLocalityPreferences, - shuffleDepId) + shuffleDepId, + resourceProfileId) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala index 3c7af4f6146fa..ca48775e77f27 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala @@ -36,7 +36,7 @@ class StatsReportListener extends SparkListener with Logging { private val taskInfoMetrics = mutable.Buffer[(TaskInfo, TaskMetrics)]() - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val info = taskEnd.taskInfo val metrics = taskEnd.taskMetrics if (info != null && metrics != null) { @@ -44,7 +44,7 @@ class StatsReportListener extends SparkListener with Logging { } } - override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) { + override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { implicit val sc = stageCompleted this.logInfo(s"Finished stage: ${getStatusDetail(stageCompleted.stageInfo)}") showMillisDistribution("task runtime:", 
(info, _) => info.duration, taskInfoMetrics) @@ -108,7 +108,7 @@ private[spark] object StatsReportListener extends Logging { (info, metric) => { getMetric(info, metric).toDouble }) } - def showDistribution(heading: String, d: Distribution, formatNumber: Double => String) { + def showDistribution(heading: String, d: Distribution, formatNumber: Double => String): Unit = { val stats = d.statCounter val quantiles = d.getQuantiles(probabilities).map(formatNumber) logInfo(heading + stats) @@ -119,11 +119,11 @@ private[spark] object StatsReportListener extends Logging { def showDistribution( heading: String, dOpt: Option[Distribution], - formatNumber: Double => String) { + formatNumber: Double => String): Unit = { dOpt.foreach { d => showDistribution(heading, d, formatNumber)} } - def showDistribution(heading: String, dOpt: Option[Distribution], format: String) { + def showDistribution(heading: String, dOpt: Option[Distribution], format: String): Unit = { def f(d: Double): String = format.format(d) showDistribution(heading, dOpt, f _) } @@ -132,26 +132,26 @@ private[spark] object StatsReportListener extends Logging { heading: String, format: String, getMetric: (TaskInfo, TaskMetrics) => Double, - taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]) { + taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]): Unit = { showDistribution(heading, extractDoubleDistribution(taskInfoMetrics, getMetric), format) } def showBytesDistribution( heading: String, getMetric: (TaskInfo, TaskMetrics) => Long, - taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]) { + taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]): Unit = { showBytesDistribution(heading, extractLongDistribution(taskInfoMetrics, getMetric)) } - def showBytesDistribution(heading: String, dOpt: Option[Distribution]) { + def showBytesDistribution(heading: String, dOpt: Option[Distribution]): Unit = { dOpt.foreach { dist => showBytesDistribution(heading, dist) } } - def showBytesDistribution(heading: String, dist: Distribution) { + def 
showBytesDistribution(heading: String, dist: Distribution): Unit = { showDistribution(heading, dist, (d => Utils.bytesToString(d.toLong)): Double => String) } - def showMillisDistribution(heading: String, dOpt: Option[Distribution]) { + def showMillisDistribution(heading: String, dOpt: Option[Distribution]): Unit = { showDistribution(heading, dOpt, (d => StatsReportListener.millisToString(d.toLong)): Double => String) } @@ -159,7 +159,7 @@ private[spark] object StatsReportListener extends Logging { def showMillisDistribution( heading: String, getMetric: (TaskInfo, TaskMetrics) => Long, - taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]) { + taskInfoMetrics: Seq[(TaskInfo, TaskMetrics)]): Unit = { showMillisDistribution(heading, extractLongDistribution(taskInfoMetrics, getMetric)) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 01828f860bd5e..ebc1c05435fee 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -225,7 +225,7 @@ private[spark] abstract class Task[T]( * be called multiple times. * If interruptThread is true, we will also call Thread.interrupt() on the Task's executor thread. 
*/ - def kill(interruptThread: Boolean, reason: String) { + def kill(interruptThread: Boolean, reason: String): Unit = { require(reason != null) _reasonIfKilled = reason if (context != null) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala index 9843eab4f1346..921562bd15dae 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskInfo.scala @@ -70,11 +70,11 @@ class TaskInfo( var killed = false - private[spark] def markGettingResult(time: Long) { + private[spark] def markGettingResult(time: Long): Unit = { gettingResultTime = time } - private[spark] def markFinished(state: TaskState, time: Long) { + private[spark] def markFinished(state: TaskState, time: Long): Unit = { // finishTime should be set larger than 0, otherwise "finished" below will return false. assert(time > 0) finishTime = time diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala index 9b7f901c55e00..6c3d2a4ee3125 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskResultGetter.scala @@ -64,6 +64,9 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul val (result, size) = serializer.get().deserialize[TaskResult[_]](serializedData) match { case directResult: DirectTaskResult[_] => if (!taskSetManager.canFetchMoreResults(serializedData.limit())) { + // kill the task so that it will not become zombie task + scheduler.handleFailedTask(taskSetManager, tid, TaskState.KILLED, TaskKilled( + "Tasks result size has exceeded maxResultSize")) return } // deserialize "value" without holding any lock so that it won't block other threads. 
@@ -75,6 +78,9 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul if (!taskSetManager.canFetchMoreResults(size)) { // dropped by executor if size is larger than maxResultSize sparkEnv.blockManager.master.removeBlock(blockId) + // kill the task so that it will not become zombie task + scheduler.handleFailedTask(taskSetManager, tid, TaskState.KILLED, TaskKilled( + "Tasks result size has exceeded maxResultSize")) return } logDebug("Fetching indirect task result for TID %s".format(tid)) @@ -125,7 +131,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul } def enqueueFailedTask(taskSetManager: TaskSetManager, tid: Long, taskState: TaskState, - serializedData: ByteBuffer) { + serializedData: ByteBuffer): Unit = { var reason : TaskFailedReason = UnknownReason try { getTaskResultExecutor.execute(() => Utils.logUncaughtExceptions { @@ -164,7 +170,7 @@ private[spark] class TaskResultGetter(sparkEnv: SparkEnv, scheduler: TaskSchedul }) } - def stop() { + def stop(): Unit = { getTaskResultExecutor.shutdownNow() } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala index 8c73d563043c2..e9e638a3645ac 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskScheduler.scala @@ -46,7 +46,7 @@ private[spark] trait TaskScheduler { // Invoked after system has successfully initialized (typically in spark context). // Yarn uses this to bootstrap allocation of resources based on preferred locations, // wait for slave registrations, etc. - def postStartHook() { } + def postStartHook(): Unit = { } // Disconnect from the cluster. def stop(): Unit @@ -72,7 +72,7 @@ private[spark] trait TaskScheduler { // Notify the corresponding `TaskSetManager`s of the stage, that a partition has already completed // and they can skip running tasks for it. 
- def notifyPartitionCompletion(stageId: Int, partitionId: Int) + def notifyPartitionCompletion(stageId: Int, partitionId: Int): Unit // Set the DAG scheduler for upcalls. This is guaranteed to be set before submitTasks is called. def setDAGScheduler(dagScheduler: DAGScheduler): Unit @@ -98,6 +98,11 @@ private[spark] trait TaskScheduler { */ def applicationId(): String = appId + /** + * Process a decommissioning executor. + */ + def executorDecommission(executorId: String): Unit + /** * Process a lost executor */ diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 1496dff31a4dc..1b197c4cca53e 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -170,11 +170,11 @@ private[spark] class TaskSchedulerImpl( } } - override def setDAGScheduler(dagScheduler: DAGScheduler) { + override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = { this.dagScheduler = dagScheduler } - def initialize(backend: SchedulerBackend) { + def initialize(backend: SchedulerBackend): Unit = { this.backend = backend schedulableBuilder = { schedulingMode match { @@ -192,7 +192,7 @@ private[spark] class TaskSchedulerImpl( def newTaskId(): Long = nextTaskId.getAndIncrement() - override def start() { + override def start(): Unit = { backend.start() if (!isLocal && conf.get(SPECULATION_ENABLED)) { @@ -203,11 +203,11 @@ private[spark] class TaskSchedulerImpl( } } - override def postStartHook() { + override def postStartHook(): Unit = { waitBackendReady() } - override def submitTasks(taskSet: TaskSet) { + override def submitTasks(taskSet: TaskSet): Unit = { val tasks = taskSet.tasks logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks") this.synchronized { @@ -233,7 +233,7 @@ private[spark] class TaskSchedulerImpl( if (!isLocal && !hasReceivedTask) { 
starvationTimer.scheduleAtFixedRate(new TimerTask() { - override def run() { + override def run(): Unit = { if (!hasLaunchedTask) { logWarning("Initial job has not accepted any resources; " + "check your cluster UI to ensure that workers are registered " + @@ -384,7 +384,9 @@ private[spark] class TaskSchedulerImpl( */ private def resourcesMeetTaskRequirements(resources: Map[String, Buffer[String]]): Boolean = { val resourcesFree = resources.map(r => r._1 -> r._2.length) - ResourceUtils.resourcesMeetRequirements(resourcesFree, resourcesReqsPerTask) + val meetsReqs = ResourceUtils.resourcesMeetRequirements(resourcesFree, resourcesReqsPerTask) + logDebug(s"Resources meet task requirements is: $meetsReqs") + meetsReqs } /** @@ -430,8 +432,7 @@ private[spark] class TaskSchedulerImpl( val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK)) val availableResources = shuffledOffers.map(_.resources).toArray val availableCpus = shuffledOffers.map(o => o.cores).toArray - val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum - val sortedTaskSets = rootPool.getSortedTaskSetQueue + val sortedTaskSets = rootPool.getSortedTaskSetQueue.filterNot(_.isZombie) for (taskSet <- sortedTaskSets) { logDebug("parentName: %s, name: %s, runningTasks: %s".format( taskSet.parent.name, taskSet.name, taskSet.runningTasks)) @@ -444,6 +445,7 @@ private[spark] class TaskSchedulerImpl( // of locality levels so that it gets a chance to launch local tasks on all of them. // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY for (taskSet <- sortedTaskSets) { + val availableSlots = availableCpus.map(c => c / CPUS_PER_TASK).sum // Skip the barrier taskSet if the available slots are less than the number of pending tasks. if (taskSet.isBarrier && availableSlots < taskSet.numTasks) { // Skip the launch process. 
@@ -572,7 +574,7 @@ private[spark] class TaskSchedulerImpl( Random.shuffle(offers) } - def statusUpdate(tid: Long, state: TaskState, serializedData: ByteBuffer) { + def statusUpdate(tid: Long, state: TaskState, serializedData: ByteBuffer): Unit = { var failedExecutor: Option[String] = None var reason: Option[ExecutorLossReason] = None synchronized { @@ -681,7 +683,7 @@ private[spark] class TaskSchedulerImpl( }) } - def error(message: String) { + def error(message: String): Unit = { synchronized { if (taskSetsByStageIdAndAttempt.nonEmpty) { // Have each task set throw a SparkException with the error @@ -704,7 +706,7 @@ private[spark] class TaskSchedulerImpl( } } - override def stop() { + override def stop(): Unit = { speculationScheduler.shutdown() if (backend != null) { backend.stop() @@ -722,7 +724,7 @@ private[spark] class TaskSchedulerImpl( override def defaultParallelism(): Int = backend.defaultParallelism() // Check for speculatable tasks in all our active jobs. - def checkSpeculatableTasks() { + def checkSpeculatableTasks(): Unit = { var shouldRevive = false synchronized { shouldRevive = rootPool.checkSpeculatableTasks(MIN_TIME_TO_SPECULATION) @@ -732,6 +734,11 @@ private[spark] class TaskSchedulerImpl( } } + override def executorDecommission(executorId: String): Unit = { + rootPool.executorDecommission(executorId) + backend.reviveOffers() + } + override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = { var failedExecutor: Option[String] = None @@ -798,7 +805,7 @@ private[spark] class TaskSchedulerImpl( * reason is not yet known, do not yet remove its association with its host nor update the status * of any running tasks, since the loss reason defines whether we'll fail those tasks. 
*/ - private def removeExecutor(executorId: String, reason: ExecutorLossReason) { + private def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = { // The tasks on the lost executor may not send any more status updates (because the executor // has been lost), so they should be cleaned up here. executorIdToRunningTaskIds.remove(executorId).foreach { taskIds => @@ -829,7 +836,7 @@ private[spark] class TaskSchedulerImpl( blacklistTrackerOpt.foreach(_.handleRemovedExecutor(executorId)) } - def executorAdded(execId: String, host: String) { + def executorAdded(execId: String, host: String): Unit = { dagScheduler.executorAdded(execId, host) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala index b680979a466a5..4df2889089ee9 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetBlacklist.scala @@ -69,7 +69,6 @@ private[scheduler] class TaskSetBlacklist( /** * Get the most recent failure reason of this TaskSet. 
- * @return */ def getLatestFailureReason: String = { latestFailureReason diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 49bd55e553482..18684ee8ebbc2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -81,6 +81,19 @@ private[spark] class TaskSetManager( val speculationQuantile = conf.get(SPECULATION_QUANTILE) val speculationMultiplier = conf.get(SPECULATION_MULTIPLIER) val minFinishedForSpeculation = math.max((speculationQuantile * numTasks).floor.toInt, 1) + // User provided threshold for speculation regardless of whether the quantile has been reached + val speculationTaskDurationThresOpt = conf.get(SPECULATION_TASK_DURATION_THRESHOLD) + // SPARK-29976: Only when the total number of tasks in the stage is less than or equal to the + // number of slots on a single executor, would the task manager speculative run the tasks if + // their duration is longer than the given threshold. In this way, we wouldn't speculate too + // aggressively but still handle basic cases. + // SPARK-30417: #cores per executor might not be set in spark conf for standalone mode, then + // the value of the conf would 1 by default. However, the executor would use all the cores on + // the worker. Therefore, CPUS_PER_TASK is okay to be greater than 1 without setting #cores. + // To handle this case, we assume the minimum number of slots is 1. + // TODO: use the actual number of slots for standalone mode. + val speculationTasksLessEqToSlots = + numTasks <= Math.max(conf.get(EXECUTOR_CORES) / sched.CPUS_PER_TASK, 1) // For each task, tracks whether a copy of the task has succeeded. 
A task will also be // marked as "succeeded" if it failed with a fetch failure, in which case it should not @@ -216,6 +229,8 @@ private[spark] class TaskSetManager( index: Int, resolveRacks: Boolean = true, speculatable: Boolean = false): Unit = { + // A zombie TaskSetManager may reach here while handling failed task. + if (isZombie) return val pendingTaskSetToAddTo = if (speculatable) pendingSpeculatableTasks else pendingTasks for (loc <- tasks(index).preferredLocations) { loc match { @@ -474,7 +489,7 @@ private[spark] class TaskSetManager( } } - private def maybeFinishTaskSet() { + private def maybeFinishTaskSet(): Unit = { if (isZombie && runningTasks == 0) { sched.taskSetFinished(this) if (tasksSuccessful == numTasks) { @@ -758,7 +773,7 @@ private[spark] class TaskSetManager( * Marks the task as failed, re-adds it to the list of pending tasks, and notifies the * DAG Scheduler. */ - def handleFailedTask(tid: Long, state: TaskState, reason: TaskFailedReason) { + def handleFailedTask(tid: Long, state: TaskState, reason: TaskFailedReason): Unit = { val info = taskInfos(tid) if (info.failed || info.killed) { return @@ -799,6 +814,15 @@ private[spark] class TaskSetManager( info.id, taskSet.id, tid, ef.description)) return } + if (ef.className == classOf[TaskOutputFileAlreadyExistException].getName) { + // If we can not write to output file in the task, there's no point in trying to + // re-execute it. + logError("Task %s in stage %s (TID %d) can not write to output file: %s; not retrying" + .format(info.id, taskSet.id, tid, ef.description)) + abort("Task %s in stage %s (TID %d) can not write to output file: %s".format( + info.id, taskSet.id, tid, ef.description)) + return + } val key = ef.description val now = clock.getTimeMillis() val (printFull, dupCount) = { @@ -886,14 +910,14 @@ private[spark] class TaskSetManager( * * Used to keep track of the number of running tasks, for enforcing scheduling policies. 
*/ - def addRunningTask(tid: Long) { + def addRunningTask(tid: Long): Unit = { if (runningTasksSet.add(tid) && parent != null) { parent.increaseRunningTasks(1) } } /** If the given task ID is in the set of running tasks, removes it. */ - def removeRunningTask(tid: Long) { + def removeRunningTask(tid: Long): Unit = { if (runningTasksSet.remove(tid) && parent != null) { parent.decreaseRunningTasks(1) } @@ -903,9 +927,9 @@ private[spark] class TaskSetManager( null } - override def addSchedulable(schedulable: Schedulable) {} + override def addSchedulable(schedulable: Schedulable): Unit = {} - override def removeSchedulable(schedulable: Schedulable) {} + override def removeSchedulable(schedulable: Schedulable): Unit = {} override def getSortedTaskSetQueue(): ArrayBuffer[TaskSetManager] = { val sortedTaskSetQueue = new ArrayBuffer[TaskSetManager]() @@ -914,7 +938,7 @@ private[spark] class TaskSetManager( } /** Called by TaskScheduler when an executor is lost so we can re-enqueue our tasks */ - override def executorLost(execId: String, host: String, reason: ExecutorLossReason) { + override def executorLost(execId: String, host: String, reason: ExecutorLossReason): Unit = { // Re-enqueue any tasks that ran on the failed executor if this is a shuffle map stage, // and we are not using an external shuffle server which could serve the shuffle outputs. // The reason is the next stage wouldn't be able to fetch the data from this dead executor @@ -923,7 +947,10 @@ private[spark] class TaskSetManager( && !isZombie) { for ((tid, info) <- taskInfos if info.executorId == execId) { val index = taskInfos(tid).index - if (successful(index) && !killedByOtherAttempt.contains(tid)) { + // We may have a running task whose partition has been marked as successful, + // this partition has another task completed in another stage attempt. + // We treat it as a running task and will call handleFailedTask later. 
+ if (successful(index) && !info.running && !killedByOtherAttempt.contains(tid)) { successful(index) = false copiesRunning(index) -= 1 tasksSuccessful -= 1 @@ -948,15 +975,41 @@ private[spark] class TaskSetManager( recomputeLocality() } + /** + * Check if the task associated with the given tid has past the time threshold and should be + * speculative run. + */ + private def checkAndSubmitSpeculatableTask( + tid: Long, + currentTimeMillis: Long, + threshold: Double): Boolean = { + val info = taskInfos(tid) + val index = info.index + if (!successful(index) && copiesRunning(index) == 1 && + info.timeRunning(currentTimeMillis) > threshold && !speculatableTasks.contains(index)) { + addPendingTask(index, speculatable = true) + logInfo( + ("Marking task %d in stage %s (on %s) as speculatable because it ran more" + + " than %.0f ms(%d speculatable tasks in this taskset now)") + .format(index, taskSet.id, info.host, threshold, speculatableTasks.size + 1)) + speculatableTasks += index + sched.dagScheduler.speculativeTaskSubmitted(tasks(index)) + true + } else { + false + } + } + /** * Check for tasks to be speculated and return true if there are any. This is called periodically * by the TaskScheduler. * */ override def checkSpeculatableTasks(minTimeToSpeculation: Int): Boolean = { - // Can't speculate if we only have one task, and no need to speculate if the task set is a - // zombie or is from a barrier stage. - if (isZombie || isBarrier || numTasks == 1) { + // No need to speculate if the task set is zombie or is from a barrier stage. If there is only + // one task we don't speculate since we don't have metrics to decide whether it's taking too + // long or not, unless a task duration threshold is explicitly provided. + if (isZombie || isBarrier || (numTasks == 1 && !speculationTaskDurationThresOpt.isDefined)) { return false } var foundTasks = false @@ -974,19 +1027,14 @@ private[spark] class TaskSetManager( // bound based on that. 
logDebug("Task length threshold for speculation: " + threshold) for (tid <- runningTasksSet) { - val info = taskInfos(tid) - val index = info.index - if (!successful(index) && copiesRunning(index) == 1 && info.timeRunning(time) > threshold && - !speculatableTasks.contains(index)) { - addPendingTask(index, speculatable = true) - logInfo( - ("Marking task %d in stage %s (on %s) as speculatable because it ran more" + - " than %.0f ms(%d speculatable tasks in this taskset now)") - .format(index, taskSet.id, info.host, threshold, speculatableTasks.size + 1)) - speculatableTasks += index - sched.dagScheduler.speculativeTaskSubmitted(tasks(index)) - foundTasks = true - } + foundTasks |= checkAndSubmitSpeculatableTask(tid, time, threshold) + } + } else if (speculationTaskDurationThresOpt.isDefined && speculationTasksLessEqToSlots) { + val time = clock.getTimeMillis() + val threshold = speculationTaskDurationThresOpt.get + logDebug(s"Tasks taking longer time than provided speculation threshold: $threshold") + for (tid <- runningTasksSet) { + foundTasks |= checkAndSubmitSpeculatableTask(tid, time, threshold) } } foundTasks @@ -1035,14 +1083,22 @@ private[spark] class TaskSetManager( levels.toArray } - def recomputeLocality() { + def executorDecommission(execId: String): Unit = { + recomputeLocality() + // Future consideration: if an executor is decommissioned it may make sense to add the current + // tasks to the spec exec queue. 
+ } + + def recomputeLocality(): Unit = { + // A zombie TaskSetManager may reach here while executorLost happens + if (isZombie) return val previousLocalityLevel = myLocalityLevels(currentLocalityIndex) myLocalityLevels = computeValidLocalityLevels() localityWaits = myLocalityLevels.map(getLocalityWait) currentLocalityIndex = getLocalityIndex(previousLocalityLevel) } - def executorAdded() { + def executorAdded(): Unit = { recomputeLocality() } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index a90fff02ac73d..8db0122f17ab4 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -20,7 +20,7 @@ package org.apache.spark.scheduler.cluster import java.nio.ByteBuffer import org.apache.spark.TaskState.TaskState -import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.{ResourceInformation, ResourceProfile} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler.ExecutorLossReason import org.apache.spark.util.SerializableBuffer @@ -29,12 +29,13 @@ private[spark] sealed trait CoarseGrainedClusterMessage extends Serializable private[spark] object CoarseGrainedClusterMessages { - case object RetrieveSparkAppConfig extends CoarseGrainedClusterMessage + case class RetrieveSparkAppConfig(resourceProfileId: Int) extends CoarseGrainedClusterMessage case class SparkAppConfig( sparkProperties: Seq[(String, String)], ioEncryptionKey: Option[Array[Byte]], - hadoopDelegationCreds: Option[Array[Byte]]) + hadoopDelegationCreds: Option[Array[Byte]], + resourceProfile: ResourceProfile) extends CoarseGrainedClusterMessage case object RetrieveLastAllocatedExecutorId extends CoarseGrainedClusterMessage @@ -48,13 +49,6 @@ private[spark] object 
CoarseGrainedClusterMessages { case class KillExecutorsOnHost(host: String) extends CoarseGrainedClusterMessage - sealed trait RegisterExecutorResponse - - case object RegisteredExecutor extends CoarseGrainedClusterMessage with RegisterExecutorResponse - - case class RegisterExecutorFailed(message: String) extends CoarseGrainedClusterMessage - with RegisterExecutorResponse - case class UpdateDelegationTokens(tokens: Array[Byte]) extends CoarseGrainedClusterMessage @@ -66,9 +60,12 @@ private[spark] object CoarseGrainedClusterMessages { cores: Int, logUrls: Map[String, String], attributes: Map[String, String], - resources: Map[String, ResourceInformation]) + resources: Map[String, ResourceInformation], + resourceProfileId: Int) extends CoarseGrainedClusterMessage + case class LaunchedExecutor(executorId: String) extends CoarseGrainedClusterMessage + case class StatusUpdate( executorId: String, taskId: Long, @@ -97,6 +94,8 @@ private[spark] object CoarseGrainedClusterMessages { case class RemoveExecutor(executorId: String, reason: ExecutorLossReason) extends CoarseGrainedClusterMessage + case class DecommissionExecutor(executorId: String) extends CoarseGrainedClusterMessage + case class RemoveWorker(workerId: String, host: String, message: String) extends CoarseGrainedClusterMessage diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index d81070c362ba6..6e1efdaf5beb2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -33,6 +33,7 @@ import org.apache.spark.executor.ExecutorLogUrlHandler import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Network._ +import org.apache.spark.resource.ResourceProfile import 
org.apache.spark.rpc._ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -68,36 +69,39 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp conf.get(SCHEDULER_MAX_REGISTERED_RESOURCE_WAITING_TIME)) private val createTimeNs = System.nanoTime() - // Accessing `executorDataMap` in `DriverEndpoint.receive/receiveAndReply` doesn't need any - // protection. But accessing `executorDataMap` out of `DriverEndpoint.receive/receiveAndReply` - // must be protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should - // only be modified in `DriverEndpoint.receive/receiveAndReply` with protection by + // Accessing `executorDataMap` in the inherited methods from ThreadSafeRpcEndpoint doesn't need + // any protection. But accessing `executorDataMap` out of the inherited methods must be + // protected by `CoarseGrainedSchedulerBackend.this`. Besides, `executorDataMap` should only + // be modified in the inherited methods from ThreadSafeRpcEndpoint with protection by // `CoarseGrainedSchedulerBackend.this`. 
private val executorDataMap = new HashMap[String, ExecutorData] - // Number of executors requested by the cluster manager, [[ExecutorAllocationManager]] + // Number of executors for each ResourceProfile requested by the cluster + // manager, [[ExecutorAllocationManager]] @GuardedBy("CoarseGrainedSchedulerBackend.this") - private var requestedTotalExecutors = 0 - - // Number of executors requested from the cluster manager that have not registered yet - @GuardedBy("CoarseGrainedSchedulerBackend.this") - private var numPendingExecutors = 0 + private val requestedTotalExecutorsPerResourceProfile = new HashMap[ResourceProfile, Int] private val listenerBus = scheduler.sc.listenerBus // Executors we have requested the cluster manager to kill that have not died yet; maps // the executor ID to whether it was explicitly killed by the driver (and thus shouldn't - // be considered an app-related failure). + // be considered an app-related failure). Visible for testing only. @GuardedBy("CoarseGrainedSchedulerBackend.this") - private val executorsPendingToRemove = new HashMap[String, Boolean] + private[scheduler] val executorsPendingToRemove = new HashMap[String, Boolean] + + // Executors that have been lost, but for which we don't yet know the real exit reason. 
+ private val executorsPendingLossReason = new HashSet[String] + + // Executors which are being decommissioned + protected val executorsPendingDecommission = new HashSet[String] - // A map to store hostname with its possible task number running on it + // A map of ResourceProfile id to map of hostname with its possible task number running on it @GuardedBy("CoarseGrainedSchedulerBackend.this") - protected var hostToLocalTaskCount: Map[String, Int] = Map.empty + protected var rpHostToLocalTaskCount: Map[Int, Map[String, Int]] = Map.empty - // The number of pending tasks which is locality required + // The number of pending tasks per ResourceProfile id which is locality required @GuardedBy("CoarseGrainedSchedulerBackend.this") - protected var localityAwareTasks = 0 + protected var numLocalityAwareTasksPerResourceProfileId = Map.empty[Int, Int] // The num of current max ExecutorId used to re-register appMaster @volatile protected var currentExecutorIdCounter = 0 @@ -111,13 +115,10 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp private val reviveThread = ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread") - class DriverEndpoint extends ThreadSafeRpcEndpoint with Logging { + class DriverEndpoint extends IsolatedRpcEndpoint with Logging { override val rpcEnv: RpcEnv = CoarseGrainedSchedulerBackend.this.rpcEnv - // Executors that have been lost, but for which we don't yet know the real exit reason. - protected val executorsPendingLossReason = new HashSet[String] - protected val addressToExecutorId = new HashMap[RpcAddress, String] // Spark configuration sent to executors. 
This is a lazy val so that subclasses of the @@ -129,7 +130,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp private val logUrlHandler: ExecutorLogUrlHandler = new ExecutorLogUrlHandler( conf.get(UI.CUSTOM_EXECUTOR_LOG_URL)) - override def onStart() { + override def onStart(): Unit = { // Periodically revive offers to allow delay scheduling to work val reviveIntervalMs = conf.get(SCHEDULER_REVIVE_INTERVAL).getOrElse(1000L) @@ -186,22 +187,36 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // automatically, so try to tell the executor to stop itself. See SPARK-13519. executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor)) removeExecutor(executorId, reason) + + case DecommissionExecutor(executorId) => + logError(s"Received decommission executor message ${executorId}.") + decommissionExecutor(executorId) + + case RemoveWorker(workerId, host, message) => + removeWorker(workerId, host, message) + + case LaunchedExecutor(executorId) => + executorDataMap.get(executorId).foreach { data => + data.freeCores = data.totalCores + } + makeOffers(executorId) + case e => + logError(s"Received unexpected message. 
${e}") } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RegisterExecutor(executorId, executorRef, hostname, cores, logUrls, - attributes, resources) => + attributes, resources, resourceProfileId) => if (executorDataMap.contains(executorId)) { - executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId)) - context.reply(true) - } else if (scheduler.nodeBlacklist.contains(hostname)) { + context.sendFailure(new IllegalStateException(s"Duplicate executor ID: $executorId")) + } else if (scheduler.nodeBlacklist.contains(hostname) || + isBlacklisted(executorId, hostname)) { // If the cluster manager gives us an executor on a blacklisted node (because it // already started allocating those resources before we informed it of our blacklist, // or if it ignored our blacklist), then we reject that executor immediately. logInfo(s"Rejecting $executorId as it has been blacklisted.") - executorRef.send(RegisterExecutorFailed(s"Executor is blacklisted: $executorId")) - context.reply(true) + context.sendFailure(new IllegalStateException(s"Executor is blacklisted: $executorId")) } else { // If the executor's rpc env is not listening for incoming connections, `hostPort` // will be null, and the client connection should be used to contact the executor. 
@@ -210,15 +225,21 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } else { context.senderAddress } - logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId") + logInfo(s"Registered executor $executorRef ($executorAddress) with ID $executorId, " + + s" ResourceProfileId $resourceProfileId") addressToExecutorId(executorAddress) = executorId totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) - val resourcesInfo = resources.map{ case (k, v) => - (v.name, new ExecutorResourceInfo(v.name, v.addresses))} + val resourcesInfo = resources.map { case (rName, info) => + // tell the executor it can schedule resources up to numParts times, + // as configured by the user, or set to 1 as that is the default (1 task/resource) + val numParts = scheduler.sc.resourceProfileManager + .resourceProfileFromId(resourceProfileId).getNumSlotsPerAddress(rName, conf) + (info.name, new ExecutorResourceInfo(info.name, info.addresses, numParts)) + } val data = new ExecutorData(executorRef, executorAddress, hostname, - cores, cores, logUrlHandler.applyPattern(logUrls, attributes), attributes, - resourcesInfo) + 0, cores, logUrlHandler.applyPattern(logUrls, attributes), attributes, + resourcesInfo, resourceProfileId) // This must be synchronized because variables mutated // in this block are read when requesting executors CoarseGrainedSchedulerBackend.this.synchronized { @@ -226,17 +247,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp if (currentExecutorIdCounter < executorId.toInt) { currentExecutorIdCounter = executorId.toInt } - if (numPendingExecutors > 0) { - numPendingExecutors -= 1 - logDebug(s"Decremented number of pending executors ($numPendingExecutors left)") - } } - executorRef.send(RegisteredExecutor) // Note: some tests expect the reply to come after we put the executor in the map context.reply(true) listenerBus.post( 
SparkListenerExecutorAdded(System.currentTimeMillis(), executorId, data)) - makeOffers() } case StopDriver => @@ -254,20 +269,29 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp removeWorker(workerId, host, message) context.reply(true) - case RetrieveSparkAppConfig => + case DecommissionExecutor(executorId) => + logError(s"Received decommission executor message ${executorId}.") + decommissionExecutor(executorId) + context.reply(true) + + case RetrieveSparkAppConfig(resourceProfileId) => + val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(resourceProfileId) val reply = SparkAppConfig( sparkProperties, SparkEnv.get.securityManager.getIOEncryptionKey(), - Option(delegationTokens.get())) + Option(delegationTokens.get()), + rp) context.reply(reply) + case e => + logError(s"Received unexpected ask ${e}") } // Make fake resource offers on all executors - private def makeOffers() { + private def makeOffers(): Unit = { // Make sure no executor is killed while some task is launching on it val taskDescs = withLock { // Filter out executors under killing - val activeExecutors = executorDataMap.filterKeys(executorIsAlive) + val activeExecutors = executorDataMap.filterKeys(isExecutorActive) val workOffers = activeExecutors.map { case (id, executorData) => new WorkerOffer(id, executorData.executorHost, executorData.freeCores, @@ -292,11 +316,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } // Make fake resource offers on just one executor - private def makeOffers(executorId: String) { + private def makeOffers(executorId: String): Unit = { // Make sure no executor is killed while some task is launching on it val taskDescs = withLock { // Filter out executors under killing - if (executorIsAlive(executorId)) { + if (isExecutorActive(executorId)) { val executorData = executorDataMap(executorId) val workOffers = IndexedSeq( new WorkerOffer(executorId, executorData.executorHost, 
executorData.freeCores, @@ -314,13 +338,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } } - private def executorIsAlive(executorId: String): Boolean = synchronized { - !executorsPendingToRemove.contains(executorId) && - !executorsPendingLossReason.contains(executorId) - } - // Launch tasks returned by a set of resource offers - private def launchTasks(tasks: Seq[Seq[TaskDescription]]) { + private def launchTasks(tasks: Seq[Seq[TaskDescription]]): Unit = { for (task <- tasks.flatten) { val serializedTask = TaskDescription.encode(task) if (serializedTask.limit() >= maxRpcMessageSize) { @@ -365,6 +384,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp addressToExecutorId -= executorInfo.executorAddress executorDataMap -= executorId executorsPendingLossReason -= executorId + executorsPendingDecommission -= executorId executorsPendingToRemove.remove(executorId).getOrElse(false) } totalCoreCount.addAndGet(-executorInfo.totalCores) @@ -389,6 +409,35 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp scheduler.workerRemoved(workerId, host, message) } + /** + * Mark a given executor as decommissioned and stop making resource offers for it. + */ + private def decommissionExecutor(executorId: String): Boolean = { + val shouldDisable = CoarseGrainedSchedulerBackend.this.synchronized { + // Only bother decommissioning executors which are alive. 
+ if (isExecutorActive(executorId)) { + executorsPendingDecommission += executorId + true + } else { + false + } + } + + if (shouldDisable) { + logInfo(s"Starting decommissioning executor $executorId.") + try { + scheduler.executorDecommission(executorId) + } catch { + case e: Exception => + logError(s"Unexpected error during decommissioning ${e.toString}", e) + } + logInfo(s"Finished decommissioning executor $executorId.") + } else { + logInfo(s"Skipping decommissioning of executor $executorId.") + } + shouldDisable + } + /** * Stop making resource offers for the given executor. The executor is marked as lost with * the loss reason still pending. @@ -397,7 +446,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp */ protected def disableExecutor(executorId: String): Boolean = { val shouldDisable = CoarseGrainedSchedulerBackend.this.synchronized { - if (executorIsAlive(executorId)) { + if (isExecutorActive(executorId)) { executorsPendingLossReason += executorId true } else { @@ -420,19 +469,21 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp protected def minRegisteredRatio: Double = _minRegisteredRatio - override def start() { + override def start(): Unit = { if (UserGroupInformation.isSecurityEnabled()) { delegationTokenManager = createTokenManager() delegationTokenManager.foreach { dtm => val ugi = UserGroupInformation.getCurrentUser() val tokens = if (dtm.renewalEnabled) { dtm.start() - } else if (ugi.hasKerberosCredentials() || SparkHadoopUtil.get.isProxyUser(ugi)) { + } else { val creds = ugi.getCredentials() dtm.obtainDelegationTokens(creds) - SparkHadoopUtil.get.serialize(creds) - } else { - null + if (creds.numberOfTokens() > 0 || creds.numberOfSecretKeys() > 0) { + SparkHadoopUtil.get.serialize(creds) + } else { + null + } } if (tokens != null) { updateDelegationTokens(tokens) @@ -443,7 +494,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp protected 
def createDriverEndpoint(): DriverEndpoint = new DriverEndpoint() - def stopExecutors() { + def stopExecutors(): Unit = { try { if (driverEndpoint != null) { logInfo("Shutting down all executors") @@ -455,7 +506,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } } - override def stop() { + override def stop(): Unit = { reviveThread.shutdownNow() stopExecutors() delegationTokenManager.foreach(_.stop()) @@ -472,12 +523,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp /** * Reset the state of CoarseGrainedSchedulerBackend to the initial state. Currently it will only * be called in the yarn-client mode when AM re-registers after a failure. + * Visible for testing only. * */ - protected def reset(): Unit = { + protected[scheduler] def reset(): Unit = { val executors: Set[String] = synchronized { - requestedTotalExecutors = 0 - numPendingExecutors = 0 - executorsPendingToRemove.clear() + requestedTotalExecutorsPerResourceProfile.clear() executorDataMap.keys.toSet } @@ -488,12 +538,12 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } } - override def reviveOffers() { + override def reviveOffers(): Unit = { driverEndpoint.send(ReviveOffers) } override def killTask( - taskId: Long, executorId: String, interruptThread: Boolean, reason: String) { + taskId: Long, executorId: String, interruptThread: Boolean, reason: String): Unit = { driverEndpoint.send(KillTask(taskId, executorId, interruptThread, reason)) } @@ -510,8 +560,17 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } protected def removeWorker(workerId: String, host: String, message: String): Unit = { - driverEndpoint.ask[Boolean](RemoveWorker(workerId, host, message)).failed.foreach(t => - logError(t.getMessage, t))(ThreadUtils.sameThread) + driverEndpoint.send(RemoveWorker(workerId, host, message)) + } + + /** + * Called by subclasses when notified of a decommissioning 
executor. + */ + private[spark] def decommissionExecutor(executorId: String): Unit = { + if (driverEndpoint != null) { + logInfo("Propagating executor decommission to driver.") + driverEndpoint.send(DecommissionExecutor(executorId)) + } } def sufficientResourcesRegistered(): Boolean = true @@ -533,29 +592,42 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp /** * Return the number of executors currently registered with this backend. */ - private def numExistingExecutors: Int = executorDataMap.size + private def numExistingExecutors: Int = synchronized { executorDataMap.size } - override def getExecutorIds(): Seq[String] = { + override def getExecutorIds(): Seq[String] = synchronized { executorDataMap.keySet.toSeq } override def isExecutorActive(id: String): Boolean = synchronized { - executorDataMap.contains(id) && !executorsPendingToRemove.contains(id) + executorDataMap.contains(id) && + !executorsPendingToRemove.contains(id) && + !executorsPendingLossReason.contains(id) && + !executorsPendingDecommission.contains(id) + } - override def maxNumConcurrentTasks(): Int = { + override def maxNumConcurrentTasks(): Int = synchronized { executorDataMap.values.map { executor => executor.totalCores / scheduler.CPUS_PER_TASK }.sum } // this function is for testing only - def getExecutorAvailableResources(executorId: String): Map[String, ExecutorResourceInfo] = { + def getExecutorAvailableResources( + executorId: String): Map[String, ExecutorResourceInfo] = synchronized { executorDataMap.get(executorId).map(_.resourcesInfo).getOrElse(Map.empty) } + // this function is for testing only + def getExecutorResourceProfileId(executorId: String): Int = synchronized { + val execDataOption = executorDataMap.get(executorId) + execDataOption.map(_.resourceProfileId).getOrElse(ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID) + } + /** - * Request an additional number of executors from the cluster manager.
+ * Request an additional number of executors from the cluster manager. This is + * requesting against the default ResourceProfile, we will need an API change to + * allow against other profiles. * @return whether the request is acknowledged. */ final override def requestExecutors(numAdditionalExecutors: Int): Boolean = { @@ -567,21 +639,11 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp logInfo(s"Requesting $numAdditionalExecutors additional executor(s) from the cluster manager") val response = synchronized { - requestedTotalExecutors += numAdditionalExecutors - numPendingExecutors += numAdditionalExecutors - logDebug(s"Number of pending executors is now $numPendingExecutors") - if (requestedTotalExecutors != - (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { - logDebug( - s"""requestExecutors($numAdditionalExecutors): Executor request doesn't match: - |requestedTotalExecutors = $requestedTotalExecutors - |numExistingExecutors = $numExistingExecutors - |numPendingExecutors = $numPendingExecutors - |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) - } - + val defaultProf = scheduler.sc.resourceProfileManager.defaultResourceProfile + val numExisting = requestedTotalExecutorsPerResourceProfile.getOrElse(defaultProf, 0) + requestedTotalExecutorsPerResourceProfile(defaultProf) = numExisting + numAdditionalExecutors // Account for executors pending to be added or removed - doRequestTotalExecutors(requestedTotalExecutors) + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } defaultAskTimeout.awaitResult(response) @@ -590,39 +652,41 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp /** * Update the cluster manager on our scheduling needs. Three bits of information are included * to help it make decisions. - * @param numExecutors The total number of executors we'd like to have. 
The cluster manager - * shouldn't kill any running executor to reach this number, but, - * if all existing executors were to die, this is the number of executors - * we'd want to be allocated. - * @param localityAwareTasks The number of tasks in all active stages that have a locality - * preferences. This includes running, pending, and completed tasks. + * @param resourceProfileToNumExecutors The total number of executors we'd like to have per + * ResourceProfile. The cluster manager shouldn't kill any + * running executor to reach this number, but, if all + * existing executors were to die, this is the number + * of executors we'd want to be allocated. + * @param numLocalityAwareTasksPerResourceProfileId The number of tasks in all active stages that + * have a locality preferences per + * ResourceProfile. This includes running, + * pending, and completed tasks. * @param hostToLocalTaskCount A map of hosts to the number of tasks from all active stages * that would like to like to run on that host. * This includes running, pending, and completed tasks. * @return whether the request is acknowledged by the cluster manager. */ final override def requestTotalExecutors( - numExecutors: Int, - localityAwareTasks: Int, - hostToLocalTaskCount: Map[String, Int] - ): Boolean = { - if (numExecutors < 0) { + resourceProfileIdToNumExecutors: Map[Int, Int], + numLocalityAwareTasksPerResourceProfileId: Map[Int, Int], + hostToLocalTaskCount: Map[Int, Map[String, Int]] + ): Boolean = { + val totalExecs = resourceProfileIdToNumExecutors.values.sum + if (totalExecs < 0) { throw new IllegalArgumentException( "Attempted to request a negative number of executor(s) " + - s"$numExecutors from the cluster manager. Please specify a positive number!") + s"$totalExecs from the cluster manager. 
Please specify a positive number!") + } + val resourceProfileToNumExecutors = resourceProfileIdToNumExecutors.map { case (rpid, num) => + (scheduler.sc.resourceProfileManager.resourceProfileFromId(rpid), num) } - val response = synchronized { - this.requestedTotalExecutors = numExecutors - this.localityAwareTasks = localityAwareTasks - this.hostToLocalTaskCount = hostToLocalTaskCount - - numPendingExecutors = - math.max(numExecutors - numExistingExecutors + executorsPendingToRemove.size, 0) - - doRequestTotalExecutors(numExecutors) + this.requestedTotalExecutorsPerResourceProfile.clear() + this.requestedTotalExecutorsPerResourceProfile ++= resourceProfileToNumExecutors + this.numLocalityAwareTasksPerResourceProfileId = numLocalityAwareTasksPerResourceProfileId + this.rpHostToLocalTaskCount = hostToLocalTaskCount + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } - defaultAskTimeout.awaitResult(response) } @@ -638,7 +702,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * * @return a future whose evaluation indicates whether the request is acknowledged. */ - protected def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = + protected def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = Future.successful(false) /** @@ -679,20 +744,20 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // take into account executors that are pending to be added or removed. 
val adjustTotalExecutors = if (adjustTargetNumExecutors) { - requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0) - if (requestedTotalExecutors != - (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) { - logDebug( - s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force): - |Executor counts do not match: - |requestedTotalExecutors = $requestedTotalExecutors - |numExistingExecutors = $numExistingExecutors - |numPendingExecutors = $numPendingExecutors - |executorsPendingToRemove = ${executorsPendingToRemove.size}""".stripMargin) + executorsToKill.foreach { exec => + val rpId = executorDataMap(exec).resourceProfileId + val rp = scheduler.sc.resourceProfileManager.resourceProfileFromId(rpId) + if (requestedTotalExecutorsPerResourceProfile.isEmpty) { + // Assume that we are killing an executor that was started by default and + // not through the request api + requestedTotalExecutorsPerResourceProfile(rp) = 0 + } else { + val requestedTotalForRp = requestedTotalExecutorsPerResourceProfile(rp) + requestedTotalExecutorsPerResourceProfile(rp) = math.max(requestedTotalForRp - 1, 0) + } } - doRequestTotalExecutors(requestedTotalExecutors) + doRequestTotalExecutors(requestedTotalExecutorsPerResourceProfile.toMap) } else { - numPendingExecutors += executorsToKill.size Future.successful(true) } @@ -758,6 +823,15 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp protected def currentDelegationTokens: Array[Byte] = delegationTokens.get() + /** + * Checks whether the executor is blacklisted. This is called when the executor tries to + * register with the scheduler, and will deny registration if this method returns true. + * + * This is in addition to the blacklist kept by the task scheduler, so custom implementations + * don't need to check there. 
+ */ + protected def isBlacklisted(executorId: String, hostname: String): Boolean = false + // SPARK-27112: We need to ensure that there is ordering of lock acquisition // between TaskSchedulerImpl and CoarseGrainedSchedulerBackend objects in order to fix // the deadlock issue exposed in SPARK-27112 diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala index 17907d88e50c8..062146174f6a8 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorData.scala @@ -29,6 +29,7 @@ import org.apache.spark.scheduler.ExecutorResourceInfo * @param freeCores The current number of cores available for work on the executor * @param totalCores The total number of cores available to the executor * @param resourcesInfo The information of the currently available resources on the executor + * @param resourceProfileId The id of the ResourceProfile being used by this executor */ private[cluster] class ExecutorData( val executorEndpoint: RpcEndpointRef, @@ -38,5 +39,7 @@ private[cluster] class ExecutorData( override val totalCores: Int, override val logUrlMap: Map[String, String], override val attributes: Map[String, String], - override val resourcesInfo: Map[String, ExecutorResourceInfo] -) extends ExecutorInfo(executorHost, totalCores, logUrlMap, attributes, resourcesInfo) + override val resourcesInfo: Map[String, ExecutorResourceInfo], + override val resourceProfileId: Int +) extends ExecutorInfo(executorHost, totalCores, logUrlMap, attributes, + resourcesInfo, resourceProfileId) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala index 5a4ad6e00eb43..a97b08941ba78 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala +++ 
b/core/src/main/scala/org/apache/spark/scheduler/cluster/ExecutorInfo.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster import org.apache.spark.annotation.DeveloperApi import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID /** * :: DeveloperApi :: @@ -25,14 +26,15 @@ import org.apache.spark.resource.ResourceInformation */ @DeveloperApi class ExecutorInfo( - val executorHost: String, - val totalCores: Int, - val logUrlMap: Map[String, String], - val attributes: Map[String, String], - val resourcesInfo: Map[String, ResourceInformation]) { + val executorHost: String, + val totalCores: Int, + val logUrlMap: Map[String, String], + val attributes: Map[String, String], + val resourcesInfo: Map[String, ResourceInformation], + val resourceProfileId: Int) { def this(executorHost: String, totalCores: Int, logUrlMap: Map[String, String]) = { - this(executorHost, totalCores, logUrlMap, Map.empty, Map.empty) + this(executorHost, totalCores, logUrlMap, Map.empty, Map.empty, DEFAULT_RESOURCE_PROFILE_ID) } def this( @@ -40,7 +42,17 @@ class ExecutorInfo( totalCores: Int, logUrlMap: Map[String, String], attributes: Map[String, String]) = { - this(executorHost, totalCores, logUrlMap, attributes, Map.empty) + this(executorHost, totalCores, logUrlMap, attributes, Map.empty, DEFAULT_RESOURCE_PROFILE_ID) + } + + def this( + executorHost: String, + totalCores: Int, + logUrlMap: Map[String, String], + attributes: Map[String, String], + resourcesInfo: Map[String, ResourceInformation]) = { + this(executorHost, totalCores, logUrlMap, attributes, resourcesInfo, + DEFAULT_RESOURCE_PROFILE_ID) } def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo] @@ -52,12 +64,14 @@ class ExecutorInfo( totalCores == that.totalCores && logUrlMap == that.logUrlMap && attributes == that.attributes && - resourcesInfo == that.resourcesInfo + resourcesInfo == that.resourcesInfo && + resourceProfileId == 
that.resourceProfileId case _ => false } override def hashCode(): Int = { - val state = Seq(executorHost, totalCores, logUrlMap, attributes, resourcesInfo) + val state = Seq(executorHost, totalCores, logUrlMap, attributes, resourcesInfo, + resourceProfileId) state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala index 2025a7dc24821..42c46464d79e1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/StandaloneSchedulerBackend.scala @@ -28,7 +28,7 @@ import org.apache.spark.deploy.client.{StandaloneAppClient, StandaloneAppClientL import org.apache.spark.internal.{config, Logging} import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} -import org.apache.spark.resource.ResourceUtils +import org.apache.spark.resource.{ResourceProfile, ResourceUtils} import org.apache.spark.rpc.RpcEndpointAddress import org.apache.spark.scheduler._ import org.apache.spark.util.Utils @@ -58,8 +58,9 @@ private[spark] class StandaloneSchedulerBackend( private val maxCores = conf.get(config.CORES_MAX) private val totalExpectedCores = maxCores.getOrElse(0) + private val defaultProf = sc.resourceProfileManager.defaultResourceProfile - override def start() { + override def start(): Unit = { super.start() // SPARK-21159. 
The scheduler backend should only try to connect to the launcher when in client @@ -129,21 +130,21 @@ private[spark] class StandaloneSchedulerBackend( stop(SparkAppHandle.State.FINISHED) } - override def connected(appId: String) { + override def connected(appId: String): Unit = { logInfo("Connected to Spark cluster with app ID " + appId) this.appId = appId notifyContext() launcherBackend.setAppId(appId) } - override def disconnected() { + override def disconnected(): Unit = { notifyContext() if (!stopping.get) { logWarning("Disconnected from Spark cluster! Waiting for reconnection...") } } - override def dead(reason: String) { + override def dead(reason: String): Unit = { notifyContext() if (!stopping.get) { launcherBackend.setState(SparkAppHandle.State.KILLED) @@ -158,13 +159,13 @@ private[spark] class StandaloneSchedulerBackend( } override def executorAdded(fullId: String, workerId: String, hostPort: String, cores: Int, - memory: Int) { + memory: Int): Unit = { logInfo("Granted executor ID %s on hostPort %s with %d core(s), %s RAM".format( fullId, hostPort, cores, Utils.megabytesToString(memory))) } override def executorRemoved( - fullId: String, message: String, exitStatus: Option[Int], workerLost: Boolean) { + fullId: String, message: String, exitStatus: Option[Int], workerLost: Boolean): Unit = { val reason: ExecutorLossReason = exitStatus match { case Some(code) => ExecutorExited(code, exitCausedByApp = true, message) case None => SlaveLost(message, workerLost = workerLost) @@ -173,6 +174,12 @@ private[spark] class StandaloneSchedulerBackend( removeExecutor(fullId.split("/")(1), reason) } + override def executorDecommissioned(fullId: String, message: String) { + logInfo("Asked to decommission executor") + decommissionExecutor(fullId.split("/")(1)) + logInfo("Executor %s decommissioned: %s".format(fullId, message)) + } + override def workerRemoved(workerId: String, host: String, message: String): Unit = { logInfo("Worker %s removed: %s".format(workerId, 
message)) removeWorker(workerId, host, message) @@ -194,9 +201,13 @@ private[spark] class StandaloneSchedulerBackend( * * @return whether the request is acknowledged. */ - protected override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { + protected override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { + // resources profiles not supported Option(client) match { - case Some(c) => c.requestTotalExecutors(requestedTotal) + case Some(c) => + val numExecs = resourceProfileToTotalExecs.getOrElse(defaultProf, 0) + c.requestTotalExecutors(numExecs) case None => logWarning("Attempted to request executors before driver fully initialized.") Future.successful(false) diff --git a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala index aa901d6568b26..c29546b7577fc 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitor.scala @@ -26,6 +26,7 @@ import scala.collection.mutable import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ +import org.apache.spark.resource.ResourceProfile.UNKNOWN_RESOURCE_PROFILE_ID import org.apache.spark.scheduler._ import org.apache.spark.storage.RDDBlockId import org.apache.spark.util.Clock @@ -39,11 +40,12 @@ private[spark] class ExecutorMonitor( listenerBus: LiveListenerBus, clock: Clock) extends SparkListener with CleanerListener with Logging { - private val idleTimeoutMs = TimeUnit.SECONDS.toMillis( + private val idleTimeoutNs = TimeUnit.SECONDS.toNanos( conf.get(DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT)) - private val storageTimeoutMs = TimeUnit.SECONDS.toMillis( + private val storageTimeoutNs = TimeUnit.SECONDS.toNanos( conf.get(DYN_ALLOCATION_CACHED_EXECUTOR_IDLE_TIMEOUT)) - private val 
shuffleTimeoutMs = conf.get(DYN_ALLOCATION_SHUFFLE_TIMEOUT) + private val shuffleTimeoutNs = TimeUnit.MILLISECONDS.toNanos( + conf.get(DYN_ALLOCATION_SHUFFLE_TIMEOUT)) private val fetchFromShuffleSvcEnabled = conf.get(SHUFFLE_SERVICE_ENABLED) && conf.get(SHUFFLE_SERVICE_FETCH_RDD_ENABLED) @@ -51,6 +53,7 @@ private[spark] class ExecutorMonitor( conf.get(DYN_ALLOCATION_SHUFFLE_TRACKING) private val executors = new ConcurrentHashMap[String, Tracker]() + private val execResourceProfileCount = new ConcurrentHashMap[Int, Int]() // The following fields are an optimization to avoid having to scan all executors on every EAM // schedule interval to find out which ones are timed out. They keep track of when the next @@ -67,7 +70,7 @@ private[spark] class ExecutorMonitor( // this listener. There are safeguards in other parts of the code that would prevent that executor // from being removed. private val nextTimeout = new AtomicLong(Long.MaxValue) - private var timedOutExecs = Seq.empty[String] + private var timedOutExecs = Seq.empty[(String, Int)] // Active job tracking. // @@ -91,16 +94,17 @@ private[spark] class ExecutorMonitor( def reset(): Unit = { executors.clear() + execResourceProfileCount.clear() nextTimeout.set(Long.MaxValue) timedOutExecs = Nil } /** - * Returns the list of executors that are currently considered to be timed out. - * Should only be called from the EAM thread. + * Returns the list of executors and their ResourceProfile id that are currently considered to + * be timed out. Should only be called from the EAM thread. */ - def timedOutExecutors(): Seq[String] = { - val now = clock.getTimeMillis() + def timedOutExecutors(): Seq[(String, Int)] = { + val now = clock.nanoTime() if (now >= nextTimeout.get()) { // Temporarily set the next timeout at Long.MaxValue. 
This ensures that after // scanning all executors below, we know when the next timeout for non-timed out @@ -122,7 +126,7 @@ private[spark] class ExecutorMonitor( true } } - .keys + .map { case (name, exec) => (name, exec.resourceProfileId)} .toSeq updateNextTimeout(newNextTimeout) } @@ -147,8 +151,26 @@ private[spark] class ExecutorMonitor( def executorCount: Int = executors.size() + def executorCountWithResourceProfile(id: Int): Int = { + execResourceProfileCount.getOrDefault(id, 0) + } + + // for testing + def getResourceProfileId(executorId: String): Int = { + val execTrackingInfo = executors.get(executorId) + if (execTrackingInfo != null) { + execTrackingInfo.resourceProfileId + } else { + UNKNOWN_RESOURCE_PROFILE_ID + } + } + def pendingRemovalCount: Int = executors.asScala.count { case (_, exec) => exec.pendingRemoval } + def pendingRemovalCountPerResourceProfileId(id: Int): Int = { + executors.asScala.filter { case (k, v) => v.resourceProfileId == id && v.pendingRemoval }.size + } + override def onJobStart(event: SparkListenerJobStart): Unit = { if (!shuffleTrackingEnabled) { return @@ -260,7 +282,7 @@ private[spark] class ExecutorMonitor( val executorId = event.taskInfo.executorId // Guard against a late arriving task start event (SPARK-26927). 
if (client.isExecutorActive(executorId)) { - val exec = ensureExecutorIsTracked(executorId) + val exec = ensureExecutorIsTracked(executorId, UNKNOWN_RESOURCE_PROFILE_ID) exec.updateRunningTasks(1) } } @@ -289,15 +311,21 @@ private[spark] class ExecutorMonitor( } override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = { - val exec = ensureExecutorIsTracked(event.executorId) + val exec = ensureExecutorIsTracked(event.executorId, event.executorInfo.resourceProfileId) exec.updateRunningTasks(0) logInfo(s"New executor ${event.executorId} has registered (new total is ${executors.size()})") } + private def decrementExecResourceProfileCount(rpId: Int): Unit = { + val count = execResourceProfileCount.getOrDefault(rpId, 0) + execResourceProfileCount.replace(rpId, count, count - 1) + execResourceProfileCount.remove(rpId, 0) + } + override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { val removed = executors.remove(event.executorId) if (removed != null) { - logInfo(s"Executor ${event.executorId} removed (new total is ${executors.size()})") + decrementExecResourceProfileCount(removed.resourceProfileId) if (!removed.pendingRemoval) { nextTimeout.set(Long.MinValue) } @@ -308,8 +336,8 @@ private[spark] class ExecutorMonitor( if (!event.blockUpdatedInfo.blockId.isInstanceOf[RDDBlockId]) { return } - - val exec = ensureExecutorIsTracked(event.blockUpdatedInfo.blockManagerId.executorId) + val exec = ensureExecutorIsTracked(event.blockUpdatedInfo.blockManagerId.executorId, + UNKNOWN_RESOURCE_PROFILE_ID) val storageLevel = event.blockUpdatedInfo.storageLevel val blockId = event.blockUpdatedInfo.blockId.asInstanceOf[RDDBlockId] @@ -391,8 +419,26 @@ private[spark] class ExecutorMonitor( * which the `SparkListenerTaskStart` event is posted before the `SparkListenerBlockManagerAdded` * event, which is possible because these events are posted in different threads. 
(see SPARK-4951) */ - private def ensureExecutorIsTracked(id: String): Tracker = { - executors.computeIfAbsent(id, _ => new Tracker()) + private def ensureExecutorIsTracked(id: String, resourceProfileId: Int): Tracker = { + val numExecsWithRpId = execResourceProfileCount.computeIfAbsent(resourceProfileId, _ => 0) + val execTracker = executors.computeIfAbsent(id, _ => { + val newcount = numExecsWithRpId + 1 + execResourceProfileCount.put(resourceProfileId, newcount) + logDebug(s"Executor added with ResourceProfile id: $resourceProfileId " + + s"count is now $newcount") + new Tracker(resourceProfileId) + }) + // if we had added executor before without knowing the resource profile id, fix it up + if (execTracker.resourceProfileId == UNKNOWN_RESOURCE_PROFILE_ID && + resourceProfileId != UNKNOWN_RESOURCE_PROFILE_ID) { + logDebug(s"Executor: $id, resource profile id was unknown, setting " + + s"it to $resourceProfileId") + execTracker.resourceProfileId = resourceProfileId + // fix up the counts for each resource profile id + execResourceProfileCount.put(resourceProfileId, numExecsWithRpId + 1) + decrementExecResourceProfileCount(UNKNOWN_RESOURCE_PROFILE_ID) + } + execTracker } private def updateNextTimeout(newValue: Long): Unit = { @@ -412,7 +458,7 @@ private[spark] class ExecutorMonitor( } } - private class Tracker { + private class Tracker(var resourceProfileId: Int) { @volatile var timeoutAt: Long = Long.MaxValue // Tracks whether this executor is thought to be timed out. 
It's used to detect when the list @@ -437,7 +483,7 @@ private[spark] class ExecutorMonitor( def updateRunningTasks(delta: Int): Unit = { runningTasks = math.max(0, runningTasks + delta) - idleStart = if (runningTasks == 0) clock.getTimeMillis() else -1L + idleStart = if (runningTasks == 0) clock.nanoTime() else -1L updateTimeout() } @@ -445,15 +491,15 @@ private[spark] class ExecutorMonitor( val oldDeadline = timeoutAt val newDeadline = if (idleStart >= 0) { val timeout = if (cachedBlocks.nonEmpty || (shuffleIds != null && shuffleIds.nonEmpty)) { - val _cacheTimeout = if (cachedBlocks.nonEmpty) storageTimeoutMs else Long.MaxValue + val _cacheTimeout = if (cachedBlocks.nonEmpty) storageTimeoutNs else Long.MaxValue val _shuffleTimeout = if (shuffleIds != null && shuffleIds.nonEmpty) { - shuffleTimeoutMs + shuffleTimeoutNs } else { Long.MaxValue } math.min(_cacheTimeout, _shuffleTimeout) } else { - idleTimeoutMs + idleTimeoutNs } val deadline = idleStart + timeout if (deadline >= 0) deadline else Long.MaxValue diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala index cbcc5310a59f0..42a5afe0b3f9d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalSchedulerBackend.scala @@ -26,9 +26,11 @@ import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.{Executor, ExecutorBackend} import org.apache.spark.internal.{config, Logging} import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} +import org.apache.spark.resource.ResourceInformation import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.util.Utils private case class ReviveOffers() @@ -54,10 +56,12 @@ private[spark] 
class LocalEndpoint( private var freeCores = totalCores val localExecutorId = SparkContext.DRIVER_IDENTIFIER - val localExecutorHostname = "localhost" + val localExecutorHostname = Utils.localCanonicalHostName() + // local mode doesn't support extra resources like GPUs right now private val executor = new Executor( - localExecutorId, localExecutorHostname, SparkEnv.get, userClassPath, isLocal = true) + localExecutorId, localExecutorHostname, SparkEnv.get, userClassPath, isLocal = true, + resources = Map.empty[String, ResourceInformation]) override def receive: PartialFunction[Any, Unit] = { case ReviveOffers => @@ -80,7 +84,7 @@ private[spark] class LocalEndpoint( context.reply(true) } - def reviveOffers() { + def reviveOffers(): Unit = { // local mode doesn't support extra resources like GPUs right now val offers = IndexedSeq(new WorkerOffer(localExecutorId, localExecutorHostname, freeCores, Some(rpcEnv.address.hostPort))) @@ -123,7 +127,7 @@ private[spark] class LocalSchedulerBackend( launcherBackend.connect() - override def start() { + override def start(): Unit = { val rpcEnv = SparkEnv.get.rpcEnv val executorEndpoint = new LocalEndpoint(rpcEnv, userClassPath, scheduler, this, totalCores) localEndpoint = rpcEnv.setupEndpoint("LocalSchedulerBackendEndpoint", executorEndpoint) @@ -136,11 +140,11 @@ private[spark] class LocalSchedulerBackend( launcherBackend.setState(SparkAppHandle.State.RUNNING) } - override def stop() { + override def stop(): Unit = { stop(SparkAppHandle.State.FINISHED) } - override def reviveOffers() { + override def reviveOffers(): Unit = { localEndpoint.send(ReviveOffers) } @@ -148,11 +152,11 @@ private[spark] class LocalSchedulerBackend( scheduler.conf.getInt("spark.default.parallelism", totalCores) override def killTask( - taskId: Long, executorId: String, interruptThread: Boolean, reason: String) { + taskId: Long, executorId: String, interruptThread: Boolean, reason: String): Unit = { localEndpoint.send(KillTask(taskId, interruptThread, 
reason)) } - override def statusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer) { + override def statusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer): Unit = { localEndpoint.send(StatusUpdate(taskId, state, serializedData)) } diff --git a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala index 70564eeefda88..077b035f3d079 100644 --- a/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/JavaSerializer.scala @@ -54,8 +54,8 @@ private[spark] class JavaSerializationStream( this } - def flush() { objOut.flush() } - def close() { objOut.close() } + def flush(): Unit = { objOut.flush() } + def close(): Unit = { objOut.close() } } private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoader) @@ -74,7 +74,7 @@ private[spark] class JavaDeserializationStream(in: InputStream, loader: ClassLoa } def readObject[T: ClassTag](): T = objIn.readObject().asInstanceOf[T] - def close() { objIn.close() } + def close(): Unit = { objIn.close() } } private object JavaDeserializationStream { diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 20774c8d999c1..cdaab599e2a0b 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -40,6 +40,7 @@ import org.apache.spark._ import org.apache.spark.api.python.PythonBroadcast import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Kryo._ +import org.apache.spark.internal.io.FileCommitProtocol._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.scheduler.{CompressedMapStatus, HighlyCompressedMapStatus} import org.apache.spark.storage._ @@ -259,14 +260,14 @@ class 
KryoSerializationStream( this } - override def flush() { + override def flush(): Unit = { if (output == null) { throw new IOException("Stream is closed") } output.flush() } - override def close() { + override def close(): Unit = { if (output != null) { try { output.close() @@ -301,7 +302,7 @@ class KryoDeserializationStream( } } - override def close() { + override def close(): Unit = { if (input != null) { try { // Kryo's Input automatically closes the input stream it is using. @@ -469,7 +470,8 @@ private[serializer] object KryoSerializer { classOf[Array[String]], classOf[Array[Array[String]]], classOf[BoundedPriorityQueue[_]], - classOf[SparkConf] + classOf[SparkConf], + classOf[TaskCommitMessage] ) private val toRegisterSerializer = Map[Class[_], KryoClassSerializer[_]]( diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index 5e7a98c8aa89c..75dc3982ab872 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -303,7 +303,7 @@ private[spark] object SerializationDebugger extends Logging { /** An output stream that emulates /dev/null */ private class NullOutputStream extends OutputStream { - override def write(b: Int) { } + override def write(b: Int): Unit = { } } /** diff --git a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala index cb8b1cc077637..0c53a84af6e2f 100644 --- a/core/src/main/scala/org/apache/spark/serializer/Serializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/Serializer.scala @@ -173,7 +173,7 @@ abstract class DeserializationStream extends Closeable { } } - override protected def close() { + override protected def close(): Unit = { DeserializationStream.this.close() } } @@ -193,7 +193,7 @@ abstract class 
DeserializationStream extends Closeable { } } - override protected def close() { + override protected def close(): Unit = { DeserializationStream.this.close() } } diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala b/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala index 3e3c387911d36..623db9d00ab53 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializerManager.scala @@ -114,6 +114,7 @@ private[spark] class SerializerManager( case _: RDDBlockId => compressRdds case _: TempLocalBlockId => compressShuffleSpill case _: TempShuffleBlockId => compressShuffle + case _: ShuffleBlockBatchId => compressShuffle case _ => false } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala b/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala index 04e4cf88d7063..6fe183c078089 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BaseShuffleHandle.scala @@ -24,6 +24,5 @@ import org.apache.spark.ShuffleDependency */ private[spark] class BaseShuffleHandle[K, V, C]( shuffleId: Int, - val numMaps: Int, val dependency: ShuffleDependency[K, V, C]) extends ShuffleHandle(shuffleId) diff --git a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala index 4329824b1b627..bc2a0fbc36d5b 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/BlockStoreShuffleReader.scala @@ -19,35 +19,58 @@ package org.apache.spark.shuffle import org.apache.spark._ import org.apache.spark.internal.{config, Logging} +import org.apache.spark.io.CompressionCodec import org.apache.spark.serializer.SerializerManager -import org.apache.spark.storage.{BlockManager, 
ShuffleBlockFetcherIterator} +import org.apache.spark.storage.{BlockId, BlockManager, BlockManagerId, ShuffleBlockFetcherIterator} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter /** - * Fetches and reads the partitions in range [startPartition, endPartition) from a shuffle by - * requesting them from other nodes' block stores. + * Fetches and reads the blocks from a shuffle by requesting them from other nodes' block stores. */ private[spark] class BlockStoreShuffleReader[K, C]( handle: BaseShuffleHandle[K, _, C], - startPartition: Int, - endPartition: Int, + blocksByAddress: Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])], context: TaskContext, readMetrics: ShuffleReadMetricsReporter, serializerManager: SerializerManager = SparkEnv.get.serializerManager, blockManager: BlockManager = SparkEnv.get.blockManager, - mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) + mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker, + shouldBatchFetch: Boolean = false) extends ShuffleReader[K, C] with Logging { private val dep = handle.dependency + private def fetchContinuousBlocksInBatch: Boolean = { + val conf = SparkEnv.get.conf + val serializerRelocatable = dep.serializer.supportsRelocationOfSerializedObjects + val compressed = conf.get(config.SHUFFLE_COMPRESS) + val codecConcatenation = if (compressed) { + CompressionCodec.supportsConcatenationOfSerializedStreams(CompressionCodec.createCodec(conf)) + } else { + true + } + val useOldFetchProtocol = conf.get(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL) + + val doBatchFetch = shouldBatchFetch && serializerRelocatable && + (!compressed || codecConcatenation) && !useOldFetchProtocol + if (shouldBatchFetch && !doBatchFetch) { + logDebug("The feature tag of continuous shuffle block fetching is set to true, but " + + "we can not enable the feature because other conditions are not satisfied. 
" + + s"Shuffle compress: $compressed, serializer relocatable: $serializerRelocatable, " + + s"codec concatenation: $codecConcatenation, use old shuffle fetch protocol: " + + s"$useOldFetchProtocol.") + } + doBatchFetch + } + /** Read the combined key-values for this reduce task */ override def read(): Iterator[Product2[K, C]] = { val wrappedStreams = new ShuffleBlockFetcherIterator( context, blockManager.blockStoreClient, blockManager, - mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition), + blocksByAddress, serializerManager.wrapStream, // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility SparkEnv.get.conf.get(config.REDUCER_MAX_SIZE_IN_FLIGHT) * 1024 * 1024, @@ -56,7 +79,8 @@ private[spark] class BlockStoreShuffleReader[K, C]( SparkEnv.get.conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM), SparkEnv.get.conf.get(config.SHUFFLE_DETECT_CORRUPT), SparkEnv.get.conf.get(config.SHUFFLE_DETECT_CORRUPT_MEMORY), - readMetrics).toCompletionIterator + readMetrics, + fetchContinuousBlocksInBatch).toCompletionIterator val serializerInstance = dep.serializer.newInstance() diff --git a/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala b/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala index 265a8acfa8d61..6509a04dc4893 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/FetchFailedException.scala @@ -35,7 +35,8 @@ import org.apache.spark.util.Utils private[spark] class FetchFailedException( bmAddress: BlockManagerId, shuffleId: Int, - mapId: Int, + mapId: Long, + mapIndex: Int, reduceId: Int, message: String, cause: Throwable = null) @@ -44,10 +45,11 @@ private[spark] class FetchFailedException( def this( bmAddress: BlockManagerId, shuffleId: Int, - mapId: Int, + mapTaskId: Long, + mapIndex: Int, reduceId: Int, cause: Throwable) { - this(bmAddress, shuffleId, mapId, reduceId, 
cause.getMessage, cause) + this(bmAddress, shuffleId, mapTaskId, mapIndex, reduceId, cause.getMessage, cause) } // SPARK-19276. We set the fetch failure in the task context, so that even if there is user-code @@ -56,8 +58,8 @@ private[spark] class FetchFailedException( // because the TaskContext is not defined in some test cases. Option(TaskContext.get()).map(_.setFetchFailed(this)) - def toTaskFailedReason: TaskFailedReason = FetchFailed(bmAddress, shuffleId, mapId, reduceId, - Utils.exceptionString(this)) + def toTaskFailedReason: TaskFailedReason = FetchFailed( + bmAddress, shuffleId, mapId, mapIndex, reduceId, Utils.exceptionString(this)) } /** @@ -67,4 +69,4 @@ private[spark] class MetadataFetchFailedException( shuffleId: Int, reduceId: Int, message: String) - extends FetchFailedException(null, shuffleId, -1, reduceId, message) + extends FetchFailedException(null, shuffleId, -1L, -1, reduceId, message) diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index d3f1c7ec1bbee..af2c82e771970 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -26,6 +26,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.io.NioBufferedFileInputStream import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf +import org.apache.spark.network.shuffle.ExecutorDiskUtils import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID import org.apache.spark.storage._ import org.apache.spark.util.Utils @@ -51,18 +52,42 @@ private[spark] class IndexShuffleBlockResolver( private val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle") - def getDataFile(shuffleId: Int, mapId: Int): File = { - 
blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID)) + + def getDataFile(shuffleId: Int, mapId: Long): File = getDataFile(shuffleId, mapId, None) + + /** + * Get the shuffle data file. + * + * When the dirs parameter is None then use the disk manager's local directories. Otherwise, + * read from the specified directories. + */ + def getDataFile(shuffleId: Int, mapId: Long, dirs: Option[Array[String]]): File = { + val blockId = ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID) + dirs + .map(ExecutorDiskUtils.getFile(_, blockManager.subDirsPerLocalDir, blockId.name)) + .getOrElse(blockManager.diskBlockManager.getFile(blockId)) } - private def getIndexFile(shuffleId: Int, mapId: Int): File = { - blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID)) + /** + * Get the shuffle index file. + * + * When the dirs parameter is None then use the disk manager's local directories. Otherwise, + * read from the specified directories. + */ + private def getIndexFile( + shuffleId: Int, + mapId: Long, + dirs: Option[Array[String]] = None): File = { + val blockId = ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID) + dirs + .map(ExecutorDiskUtils.getFile(_, blockManager.subDirsPerLocalDir, blockId.name)) + .getOrElse(blockManager.diskBlockManager.getFile(blockId)) } /** * Remove data file and index file that contain the output data from one map. 
*/ - def removeDataByMap(shuffleId: Int, mapId: Int): Unit = { + def removeDataByMap(shuffleId: Int, mapId: Long): Unit = { var file = getDataFile(shuffleId, mapId) if (file.exists()) { if (!file.delete()) { @@ -135,7 +160,7 @@ private[spark] class IndexShuffleBlockResolver( */ def writeIndexFileAndCommit( shuffleId: Int, - mapId: Int, + mapId: Long, lengths: Array[Long], dataTmp: File): Unit = { val indexFile = getIndexFile(shuffleId, mapId) @@ -190,10 +215,20 @@ private[spark] class IndexShuffleBlockResolver( } } - override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { + override def getBlockData( + blockId: BlockId, + dirs: Option[Array[String]]): ManagedBuffer = { + val (shuffleId, mapId, startReduceId, endReduceId) = blockId match { + case id: ShuffleBlockId => + (id.shuffleId, id.mapId, id.reduceId, id.reduceId + 1) + case batchId: ShuffleBlockBatchId => + (batchId.shuffleId, batchId.mapId, batchId.startReduceId, batchId.endReduceId) + case _ => + throw new IllegalArgumentException("unexpected shuffle block id format: " + blockId) + } // The block is actually going to be a range of a single map output file for this map, so // find out the consolidated file, then the offset within that from our index - val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) + val indexFile = getIndexFile(shuffleId, mapId, dirs) // SPARK-22982: if this FileInputStream's position is seeked forward by another piece of code // which is incorrectly using our file descriptor then this code will fetch the wrong offsets @@ -202,22 +237,23 @@ private[spark] class IndexShuffleBlockResolver( // class of issue from re-occurring in the future which is why they are left here even though // SPARK-22982 is fixed. 
val channel = Files.newByteChannel(indexFile.toPath) - channel.position(blockId.reduceId * 8L) + channel.position(startReduceId * 8L) val in = new DataInputStream(Channels.newInputStream(channel)) try { - val offset = in.readLong() - val nextOffset = in.readLong() + val startOffset = in.readLong() + channel.position(endReduceId * 8L) + val endOffset = in.readLong() val actualPosition = channel.position() - val expectedPosition = blockId.reduceId * 8L + 16 + val expectedPosition = endReduceId * 8L + 8 if (actualPosition != expectedPosition) { throw new Exception(s"SPARK-22982: Incorrect channel position after index file reads: " + s"expected $expectedPosition but actual position was $actualPosition.") } new FileSegmentManagedBuffer( transportConf, - getDataFile(blockId.shuffleId, blockId.mapId), - offset, - nextOffset - offset) + getDataFile(shuffleId, mapId, dirs), + startOffset, + endOffset - startOffset) } finally { in.close() } diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala index d1ecbc1bf0178..5485cf955f11a 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleBlockResolver.scala @@ -18,7 +18,7 @@ package org.apache.spark.shuffle import org.apache.spark.network.buffer.ManagedBuffer -import org.apache.spark.storage.ShuffleBlockId +import org.apache.spark.storage.BlockId private[spark] /** @@ -31,10 +31,14 @@ trait ShuffleBlockResolver { type ShuffleId = Int /** - * Retrieve the data for the specified block. If the data for that block is not available, - * throws an unspecified exception. + * Retrieve the data for the specified block. + * + * When the dirs parameter is None then use the disk manager's local directories. Otherwise, + * read from the specified directories. + * + * If the data for that block is not available, throws an unspecified exception. 
*/ - def getBlockData(blockId: ShuffleBlockId): ManagedBuffer + def getBlockData(blockId: BlockId, dirs: Option[Array[String]] = None): ManagedBuffer def stop(): Unit } diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleDataIOUtils.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleDataIOUtils.scala new file mode 100644 index 0000000000000..e9507a7584ba3 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleDataIOUtils.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.shuffle + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config.SHUFFLE_IO_PLUGIN_CLASS +import org.apache.spark.shuffle.api.ShuffleDataIO +import org.apache.spark.util.Utils + +private[spark] object ShuffleDataIOUtils { + + /** + * The prefix of spark config keys that are passed from the driver to the executor. + */ + val SHUFFLE_SPARK_CONF_PREFIX = "spark.shuffle.plugin.__config__." 
+ + def loadShuffleDataIO(conf: SparkConf): ShuffleDataIO = { + val configuredPluginClass = conf.get(SHUFFLE_IO_PLUGIN_CLASS) + val maybeIO = Utils.loadExtensions( + classOf[ShuffleDataIO], Seq(configuredPluginClass), conf) + require(maybeIO.nonEmpty, s"A valid shuffle plugin must be specified by config " + + s"${SHUFFLE_IO_PLUGIN_CLASS.key}, but $configuredPluginClass resulted in zero valid " + + s"plugins.") + maybeIO.head + } + +} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala index 18a743fbfa6fc..057b0d6e0b0a7 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleManager.scala @@ -34,13 +34,12 @@ private[spark] trait ShuffleManager { */ def registerShuffle[K, V, C]( shuffleId: Int, - numMaps: Int, dependency: ShuffleDependency[K, V, C]): ShuffleHandle /** Get a writer for a given partition. Called on executors by map tasks. */ def getWriter[K, V]( handle: ShuffleHandle, - mapId: Int, + mapId: Long, context: TaskContext, metrics: ShuffleWriteMetricsReporter): ShuffleWriter[K, V] @@ -55,6 +54,20 @@ private[spark] trait ShuffleManager { context: TaskContext, metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] + /** + * Get a reader for a range of reduce partitions (startPartition to endPartition-1, inclusive) to + * read from map output (startMapIndex to endMapIndex - 1, inclusive). + * Called on executors by reduce tasks. + */ + def getReaderForRange[K, C]( + handle: ShuffleHandle, + startMapIndex: Int, + endMapIndex: Int, + startPartition: Int, + endPartition: Int, + context: TaskContext, + metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] + /** * Remove a shuffle's metadata from the ShuffleManager. * @return true if the metadata removed successfully, otherwise false. 
diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShufflePartitionPairsWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/ShufflePartitionPairsWriter.scala index a988c5e126a76..e0affb858c359 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShufflePartitionPairsWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShufflePartitionPairsWriter.scala @@ -21,7 +21,7 @@ import java.io.{Closeable, IOException, OutputStream} import org.apache.spark.serializer.{SerializationStream, SerializerInstance, SerializerManager} import org.apache.spark.shuffle.api.ShufflePartitionWriter -import org.apache.spark.storage.BlockId +import org.apache.spark.storage.{BlockId, TimeTrackingOutputStream} import org.apache.spark.util.Utils import org.apache.spark.util.collection.PairsWriter @@ -39,6 +39,7 @@ private[spark] class ShufflePartitionPairsWriter( private var isClosed = false private var partitionStream: OutputStream = _ + private var timeTrackingStream: OutputStream = _ private var wrappedStream: OutputStream = _ private var objOut: SerializationStream = _ private var numRecordsWritten = 0 @@ -59,7 +60,8 @@ private[spark] class ShufflePartitionPairsWriter( private def open(): Unit = { try { partitionStream = partitionWriter.openStream - wrappedStream = serializerManager.wrapStream(blockId, partitionStream) + timeTrackingStream = new TimeTrackingOutputStream(writeMetrics, partitionStream) + wrappedStream = serializerManager.wrapStream(blockId, timeTrackingStream) objOut = serializerInstance.serializeStream(wrappedStream) } catch { case e: Exception => @@ -78,6 +80,7 @@ private[spark] class ShufflePartitionPairsWriter( // Setting these to null will prevent the underlying streams from being closed twice // just in case any stream's close() implementation is not idempotent. 
wrappedStream = null + timeTrackingStream = null partitionStream = null } { // Normally closing objOut would close the inner streams as well, but just in case there @@ -86,9 +89,15 @@ private[spark] class ShufflePartitionPairsWriter( wrappedStream = closeIfNonNull(wrappedStream) // Same as above - if wrappedStream closes then assume it closes underlying // partitionStream and don't close again in the finally + timeTrackingStream = null partitionStream = null } { - partitionStream = closeIfNonNull(partitionStream) + Utils.tryWithSafeFinally { + timeTrackingStream = closeIfNonNull(timeTrackingStream) + partitionStream = null + } { + partitionStream = closeIfNonNull(partitionStream) + } } } updateBytesWritten() diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala index 5b0c7e9f2b0b4..1429144c6f6e2 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleWriteProcessor.scala @@ -44,7 +44,7 @@ private[spark] class ShuffleWriteProcessor extends Serializable with Logging { def write( rdd: RDD[_], dep: ShuffleDependency[_, _, _], - partitionId: Int, + mapId: Long, context: TaskContext, partition: Partition): MapStatus = { var writer: ShuffleWriter[Any, Any] = null @@ -52,7 +52,7 @@ private[spark] class ShuffleWriteProcessor extends Serializable with Logging { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any]( dep.shuffleHandle, - partitionId, + mapId, context, createMetricsReporter(context)) writer.write( diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala index 2a99c93b32af4..aefcb59b8bb87 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala +++ 
b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleManager.scala @@ -19,11 +19,14 @@ package org.apache.spark.shuffle.sort import java.util.concurrent.ConcurrentHashMap +import scala.collection.JavaConverters._ + import org.apache.spark._ import org.apache.spark.internal.{config, Logging} import org.apache.spark.shuffle._ import org.apache.spark.shuffle.api.{ShuffleDataIO, ShuffleExecutorComponents} import org.apache.spark.util.Utils +import org.apache.spark.util.collection.OpenHashSet /** * In sort-based shuffle, incoming records are sorted according to their target partition ids, then @@ -79,9 +82,9 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager } /** - * A mapping from shuffle ids to the number of mappers producing output for those shuffles. + * A mapping from shuffle ids to the task ids of mappers producing output for those shuffles. */ - private[this] val numMapsForShuffle = new ConcurrentHashMap[Int, Int]() + private[this] val taskIdMapsForShuffle = new ConcurrentHashMap[Int, OpenHashSet[Long]]() private lazy val shuffleExecutorComponents = loadShuffleExecutorComponents(conf) @@ -92,7 +95,6 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager */ override def registerShuffle[K, V, C]( shuffleId: Int, - numMaps: Int, dependency: ShuffleDependency[K, V, C]): ShuffleHandle = { if (SortShuffleWriter.shouldBypassMergeSort(conf, dependency)) { // If there are fewer than spark.shuffle.sort.bypassMergeThreshold partitions and we don't @@ -101,14 +103,14 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager // together the spilled files, which would happen with the normal code path. The downside is // having multiple files open at a time and thus more memory allocated to buffers. 
new BypassMergeSortShuffleHandle[K, V]( - shuffleId, numMaps, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) + shuffleId, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) } else if (SortShuffleManager.canUseSerializedShuffle(dependency)) { // Otherwise, try to buffer map outputs in a serialized form, since this is more efficient: new SerializedShuffleHandle[K, V]( - shuffleId, numMaps, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) + shuffleId, dependency.asInstanceOf[ShuffleDependency[K, V, V]]) } else { // Otherwise, buffer map outputs in a deserialized form: - new BaseShuffleHandle(shuffleId, numMaps, dependency) + new BaseShuffleHandle(shuffleId, dependency) } } @@ -122,37 +124,54 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager endPartition: Int, context: TaskContext, metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { + val blocksByAddress = SparkEnv.get.mapOutputTracker.getMapSizesByExecutorId( + handle.shuffleId, startPartition, endPartition) + new BlockStoreShuffleReader( + handle.asInstanceOf[BaseShuffleHandle[K, _, C]], blocksByAddress, context, metrics, + shouldBatchFetch = canUseBatchFetch(startPartition, endPartition, context)) + } + + override def getReaderForRange[K, C]( + handle: ShuffleHandle, + startMapIndex: Int, + endMapIndex: Int, + startPartition: Int, + endPartition: Int, + context: TaskContext, + metrics: ShuffleReadMetricsReporter): ShuffleReader[K, C] = { + val blocksByAddress = SparkEnv.get.mapOutputTracker.getMapSizesByRange( + handle.shuffleId, startMapIndex, endMapIndex, startPartition, endPartition) new BlockStoreShuffleReader( - handle.asInstanceOf[BaseShuffleHandle[K, _, C]], - startPartition, endPartition, context, metrics) + handle.asInstanceOf[BaseShuffleHandle[K, _, C]], blocksByAddress, context, metrics, + shouldBatchFetch = canUseBatchFetch(startPartition, endPartition, context)) } /** Get a writer for a given partition. Called on executors by map tasks. 
*/ override def getWriter[K, V]( handle: ShuffleHandle, - mapId: Int, + mapId: Long, context: TaskContext, metrics: ShuffleWriteMetricsReporter): ShuffleWriter[K, V] = { - numMapsForShuffle.putIfAbsent( - handle.shuffleId, handle.asInstanceOf[BaseShuffleHandle[_, _, _]].numMaps) + val mapTaskIds = taskIdMapsForShuffle.computeIfAbsent( + handle.shuffleId, _ => new OpenHashSet[Long](16)) + mapTaskIds.synchronized { mapTaskIds.add(context.taskAttemptId()) } val env = SparkEnv.get handle match { case unsafeShuffleHandle: SerializedShuffleHandle[K @unchecked, V @unchecked] => new UnsafeShuffleWriter( env.blockManager, - shuffleBlockResolver, context.taskMemoryManager(), unsafeShuffleHandle, mapId, context, env.conf, - metrics) + metrics, + shuffleExecutorComponents) case bypassMergeSortHandle: BypassMergeSortShuffleHandle[K @unchecked, V @unchecked] => new BypassMergeSortShuffleWriter( env.blockManager, bypassMergeSortHandle, mapId, - context.taskAttemptId(), env.conf, metrics, shuffleExecutorComponents) @@ -164,9 +183,9 @@ private[spark] class SortShuffleManager(conf: SparkConf) extends ShuffleManager /** Remove a shuffle's metadata from the ShuffleManager. */ override def unregisterShuffle(shuffleId: Int): Boolean = { - Option(numMapsForShuffle.remove(shuffleId)).foreach { numMaps => - (0 until numMaps).foreach { mapId => - shuffleBlockResolver.removeDataByMap(shuffleId, mapId) + Option(taskIdMapsForShuffle.remove(shuffleId)).foreach { mapTaskIds => + mapTaskIds.iterator.foreach { mapTaskId => + shuffleBlockResolver.removeDataByMap(shuffleId, mapTaskId) } } true @@ -185,10 +204,26 @@ private[spark] object SortShuffleManager extends Logging { * The maximum number of shuffle output partitions that SortShuffleManager supports when * buffering map outputs in a serialized form. This is an extreme defensive programming measure, * since it's extremely unlikely that a single shuffle produces over 16 million output partitions. 
- * */ + */ val MAX_SHUFFLE_OUTPUT_PARTITIONS_FOR_SERIALIZED_MODE = PackedRecordPointer.MAXIMUM_PARTITION_ID + 1 + /** + * The local property key for continuous shuffle block fetching feature. + */ + val FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED_KEY = + "__fetch_continuous_blocks_in_batch_enabled" + + /** + * Helper method for determining whether a shuffle reader should fetch the continuous blocks + * in batch. + */ + def canUseBatchFetch(startPartition: Int, endPartition: Int, context: TaskContext): Boolean = { + val fetchMultiPartitions = endPartition - startPartition > 1 + fetchMultiPartitions && + context.getLocalProperty(FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED_KEY) == "true" + } + /** * Helper method for determining whether a shuffle should use an optimized serialized shuffle * path or whether it should fall back to the original path that operates on deserialized objects. @@ -215,12 +250,13 @@ private[spark] object SortShuffleManager extends Logging { } private def loadShuffleExecutorComponents(conf: SparkConf): ShuffleExecutorComponents = { - val configuredPluginClasses = conf.get(config.SHUFFLE_IO_PLUGIN_CLASS) - val maybeIO = Utils.loadExtensions( - classOf[ShuffleDataIO], Seq(configuredPluginClasses), conf) - require(maybeIO.size == 1, s"Failed to load plugins of type $configuredPluginClasses") - val executorComponents = maybeIO.head.executor() - executorComponents.initializeExecutor(conf.getAppId, SparkEnv.get.executorId) + val executorComponents = ShuffleDataIOUtils.loadShuffleDataIO(conf).executor() + val extraConfigs = conf.getAllWithPrefix(ShuffleDataIOUtils.SHUFFLE_SPARK_CONF_PREFIX) + .toMap + executorComponents.initializeExecutor( + conf.getAppId, + SparkEnv.get.executorId, + extraConfigs.asJava) executorComponents } } @@ -231,9 +267,8 @@ private[spark] object SortShuffleManager extends Logging { */ private[spark] class SerializedShuffleHandle[K, V]( shuffleId: Int, - numMaps: Int, dependency: ShuffleDependency[K, V, V]) - extends 
BaseShuffleHandle(shuffleId, numMaps, dependency) { + extends BaseShuffleHandle(shuffleId, dependency) { } /** @@ -242,7 +277,6 @@ private[spark] class SerializedShuffleHandle[K, V]( */ private[spark] class BypassMergeSortShuffleHandle[K, V]( shuffleId: Int, - numMaps: Int, dependency: ShuffleDependency[K, V, V]) - extends BaseShuffleHandle(shuffleId, numMaps, dependency) { + extends BaseShuffleHandle(shuffleId, dependency) { } diff --git a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala index a781b16252432..a391bdf2db44e 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/sort/SortShuffleWriter.scala @@ -27,7 +27,7 @@ import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockResolver: IndexShuffleBlockResolver, handle: BaseShuffleHandle[K, V, C], - mapId: Int, + mapId: Long, context: TaskContext, shuffleExecutorComponents: ShuffleExecutorComponents) extends ShuffleWriter[K, V] with Logging { @@ -65,10 +65,10 @@ private[spark] class SortShuffleWriter[K, V, C]( // because it just opens a single file, so is typically too fast to measure accurately // (see SPARK-3570). 
val mapOutputWriter = shuffleExecutorComponents.createMapOutputWriter( - dep.shuffleId, mapId, context.taskAttemptId(), dep.partitioner.numPartitions) + dep.shuffleId, mapId, dep.partitioner.numPartitions) sorter.writePartitionedMapOutput(dep.shuffleId, mapId, mapOutputWriter) val partitionLengths = mapOutputWriter.commitAllPartitions() - mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths) + mapStatus = MapStatus(blockManager.shuffleServerId, partitionLengths, mapId) } /** Close this writer, passing along whether the map completed */ diff --git a/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala b/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala index d144a0e998fa1..2e9a31d5ac69c 100644 --- a/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala +++ b/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala @@ -35,4 +35,9 @@ private[spark] trait AppHistoryServerPlugin { * Sets up UI of this plugin to rebuild the history UI. */ def setupUI(ui: SparkUI): Unit + + /** + * The position of a plugin tab relative to the other plugin tabs in the history UI. + */ + def displayOrder: Int = Integer.MAX_VALUE } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index c85b3caf8a5ef..c3f22f32993a8 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -71,7 +71,7 @@ private[spark] class AppStatusListener( // causing too many writes to the underlying store, and other expensive operations). 
private val liveStages = new ConcurrentHashMap[(Int, Int), LiveStage]() private val liveJobs = new HashMap[Int, LiveJob]() - private val liveExecutors = new HashMap[String, LiveExecutor]() + private[spark] val liveExecutors = new HashMap[String, LiveExecutor]() private val deadExecutors = new HashMap[String, LiveExecutor]() private val liveTasks = new HashMap[Long, LiveTask]() private val liveRDDs = new HashMap[Int, LiveRDD]() @@ -234,8 +234,8 @@ private[spark] class AppStatusListener( (partition.memoryUsed / partition.executors.length) * -1) rdd.diskUsed = addDeltaToValue(rdd.diskUsed, (partition.diskUsed / partition.executors.length) * -1) - partition.update(partition.executors - .filter(!_.equals(event.executorId)), rdd.storageLevel, + partition.update( + partition.executors.filter(!_.equals(event.executorId)), addDeltaToValue(partition.memoryUsed, (partition.memoryUsed / partition.executors.length) * -1), addDeltaToValue(partition.diskUsed, @@ -355,6 +355,8 @@ private[spark] class AppStatusListener( val lastStageInfo = event.stageInfos.sortBy(_.stageId).lastOption val jobName = lastStageInfo.map(_.name).getOrElse("") + val description = Option(event.properties) + .flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_DESCRIPTION)) } val jobGroup = Option(event.properties) .flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) } val sqlExecutionId = Option(event.properties) @@ -363,6 +365,7 @@ private[spark] class AppStatusListener( val job = new LiveJob( event.jobId, jobName, + description, if (event.time > 0) Some(new Date(event.time)) else None, event.stageIds, jobGroup, @@ -495,7 +498,7 @@ private[spark] class AppStatusListener( event.stageInfo.rddInfos.foreach { info => if (info.storageLevel.isValid) { - liveUpdate(liveRDDs.getOrElseUpdate(info.id, new LiveRDD(info)), now) + liveUpdate(liveRDDs.getOrElseUpdate(info.id, new LiveRDD(info, info.storageLevel)), now) } } @@ -769,6 +772,11 @@ private[spark] class AppStatusListener( 
event.maxOnHeapMem.foreach { _ => exec.totalOnHeap = event.maxOnHeapMem.get exec.totalOffHeap = event.maxOffHeapMem.get + // SPARK-30594: whenever(first time or re-register) a BlockManager added, all blocks + // from this BlockManager will be reported to driver later. So, we should clean up + // used memory to avoid overlapped count. + exec.usedOnHeap = 0 + exec.usedOffHeap = 0 } exec.isActive = true exec.maxMemory = event.maxMem @@ -916,12 +924,6 @@ private[spark] class AppStatusListener( val diskDelta = event.blockUpdatedInfo.diskSize * (if (storageLevel.useDisk) 1 else -1) val memoryDelta = event.blockUpdatedInfo.memSize * (if (storageLevel.useMemory) 1 else -1) - val updatedStorageLevel = if (storageLevel.isValid) { - Some(storageLevel.description) - } else { - None - } - // We need information about the executor to update some memory accounting values in the // RDD info, so read that beforehand. val maybeExec = liveExecutors.get(executorId) @@ -936,13 +938,9 @@ private[spark] class AppStatusListener( // Update the block entry in the RDD info, keeping track of the deltas above so that we // can update the executor information too. liveRDDs.get(block.rddId).foreach { rdd => - if (updatedStorageLevel.isDefined) { - rdd.setStorageLevel(updatedStorageLevel.get) - } - val partition = rdd.partition(block.name) - val executors = if (updatedStorageLevel.isDefined) { + val executors = if (storageLevel.isValid) { val current = partition.executors if (current.contains(executorId)) { current @@ -957,7 +955,7 @@ private[spark] class AppStatusListener( // Only update the partition if it's still stored in some executor, otherwise get rid of it. 
if (executors.nonEmpty) { - partition.update(executors, rdd.storageLevel, + partition.update(executors, addDeltaToValue(partition.memoryUsed, memoryDelta), addDeltaToValue(partition.diskUsed, diskDelta)) } else { @@ -1049,7 +1047,7 @@ private[spark] class AppStatusListener( } } - private def updateExecutorMemoryDiskInfo( + private[spark] def updateExecutorMemoryDiskInfo( exec: LiveExecutor, storageLevel: StorageLevel, memoryDelta: Long, diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusSource.scala b/core/src/main/scala/org/apache/spark/status/AppStatusSource.scala index f6a21578ff499..20f171bd3c375 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusSource.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusSource.scala @@ -22,7 +22,7 @@ import AppStatusSource.getCounter import com.codahale.metrics.{Counter, Gauge, MetricRegistry} import org.apache.spark.SparkConf -import org.apache.spark.internal.config.Status.APP_STATUS_METRICS_ENABLED +import org.apache.spark.internal.config.Status.METRICS_APP_STATUS_SOURCE_ENABLED import org.apache.spark.metrics.source.Source private [spark] class JobDuration(val value: AtomicLong) extends Gauge[Long] { @@ -71,7 +71,7 @@ private[spark] object AppStatusSource { } def createSource(conf: SparkConf): Option[AppStatusSource] = { - Option(conf.get(APP_STATUS_METRICS_ENABLED)) + Option(conf.get(METRICS_APP_STATUS_SOURCE_ENABLED)) .filter(identity) .map { _ => new AppStatusSource() } } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 964ab27a524c4..6b89812cc2bf0 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -136,12 +136,6 @@ private[spark] class AppStatusStore( store.read(classOf[StageDataWrapper], Array(stageId, stageAttemptId)).locality } - // SPARK-26119: we only want to consider 
successful tasks when calculating the metrics summary, - // but currently this is very expensive when using a disk store. So we only trigger the slower - // code path when we know we have all data in memory. The following method checks whether all - // the data will be in memory. - private def isInMemoryStore: Boolean = store.isInstanceOf[InMemoryStore] || listener.isDefined - /** * Calculates a summary of the task metrics for the given stage attempt, returning the * requested quantiles for the recorded metrics. @@ -162,21 +156,11 @@ private[spark] class AppStatusStore( // cheaper for disk stores (avoids deserialization). val count = { Utils.tryWithResource( - if (isInMemoryStore) { - // For Live UI, we should count the tasks with status "SUCCESS" only. - store.view(classOf[TaskDataWrapper]) - .parent(stageKey) - .index(TaskIndexNames.STATUS) - .first("SUCCESS") - .last("SUCCESS") - .closeableIterator() - } else { - store.view(classOf[TaskDataWrapper]) - .parent(stageKey) - .index(TaskIndexNames.EXEC_RUN_TIME) - .first(0L) - .closeableIterator() - } + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(TaskIndexNames.EXEC_RUN_TIME) + .first(0L) + .closeableIterator() ) { it => var _count = 0L while (it.hasNext()) { @@ -245,50 +229,30 @@ private[spark] class AppStatusStore( // stabilize once the stage finishes. It's also slow, especially with disk stores. val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) } - // TODO: Summary metrics needs to display all the successful tasks' metrics (SPARK-26119). - // For InMemory case, it is efficient to find using the following code. But for diskStore case - // we need an efficient solution to avoid deserialization time overhead. For that, we need to - // rework on the way indexing works, so that we can index by specific metrics for successful - // and failed tasks differently (would be tricky). Also would require changing the disk store - // version (to invalidate old stores). 
def scanTasks(index: String)(fn: TaskDataWrapper => Long): IndexedSeq[Double] = { - if (isInMemoryStore) { - val quantileTasks = store.view(classOf[TaskDataWrapper]) + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) .parent(stageKey) .index(index) .first(0L) - .asScala - .filter { _.status == "SUCCESS"} // Filter "SUCCESS" tasks - .toIndexedSeq - - indices.map { index => - fn(quantileTasks(index.toInt)).toDouble - }.toIndexedSeq - } else { - Utils.tryWithResource( - store.view(classOf[TaskDataWrapper]) - .parent(stageKey) - .index(index) - .first(0L) - .closeableIterator() - ) { it => - var last = Double.NaN - var currentIdx = -1L - indices.map { idx => - if (idx == currentIdx) { + .closeableIterator() + ) { it => + var last = Double.NaN + var currentIdx = -1L + indices.map { idx => + if (idx == currentIdx) { + last + } else { + val diff = idx - currentIdx + currentIdx = idx + if (it.skip(diff - 1)) { + last = fn(it.next()).toDouble last } else { - val diff = idx - currentIdx - currentIdx = idx - if (it.skip(diff - 1)) { - last = fn(it.next()).toDouble - last - } else { - Double.NaN - } + Double.NaN } - }.toIndexedSeq - } + } + }.toIndexedSeq } } @@ -582,7 +546,7 @@ private[spark] class AppStatusStore( private[spark] object AppStatusStore { - val CURRENT_VERSION = 1L + val CURRENT_VERSION = 2L /** * Create an in-memory store for a live application. 
diff --git a/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala b/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala index 38cb030297c81..1b8dc9c8275ad 100644 --- a/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala +++ b/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala @@ -18,14 +18,12 @@ package org.apache.spark.status import java.util.Collection -import java.util.concurrent.TimeUnit +import java.util.concurrent.{ExecutorService, TimeUnit} import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, ListBuffer} -import com.google.common.util.concurrent.MoreExecutors - import org.apache.spark.SparkConf import org.apache.spark.internal.config.Status._ import org.apache.spark.status.ElementTrackingStore._ @@ -72,10 +70,10 @@ private[spark] class ElementTrackingStore(store: KVStore, conf: SparkConf) exten private val triggers = new HashMap[Class[_], LatchedTriggers]() private val flushTriggers = new ListBuffer[() => Unit]() - private val executor = if (conf.get(ASYNC_TRACKING_ENABLED)) { + private val executor: ExecutorService = if (conf.get(ASYNC_TRACKING_ENABLED)) { ThreadUtils.newDaemonSingleThreadExecutor("element-tracking-store-worker") } else { - MoreExecutors.sameThreadExecutor() + ThreadUtils.sameThreadExecutorService } @volatile private var stopped = false diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index aa4a21c1bb818..2714f30de14f0 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -20,6 +20,7 @@ package org.apache.spark.status import java.util.Date import java.util.concurrent.atomic.AtomicInteger +import scala.collection.JavaConverters._ import scala.collection.immutable.{HashSet, TreeSet} import 
scala.collection.mutable.HashMap @@ -30,7 +31,7 @@ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.resource.ResourceInformation import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} import org.apache.spark.status.api.v1 -import org.apache.spark.storage.RDDInfo +import org.apache.spark.storage.{RDDInfo, StorageLevel} import org.apache.spark.ui.SparkUI import org.apache.spark.util.AccumulatorContext import org.apache.spark.util.collection.OpenHashSet @@ -62,6 +63,7 @@ private[spark] abstract class LiveEntity { private class LiveJob( val jobId: Int, name: String, + description: Option[String], val submissionTime: Option[Date], val stageIds: Seq[Int], jobGroup: Option[String], @@ -92,7 +94,7 @@ private class LiveJob( val info = new v1.JobData( jobId, name, - None, // description is always None? + description, submissionTime, completionTime, stageIds, @@ -183,6 +185,19 @@ private class LiveTask( info.timeRunning(lastUpdateTime.getOrElse(System.currentTimeMillis())) } + val hasMetrics = metrics.executorDeserializeTime >= 0 + + /** + * SPARK-26260: For non-successful tasks, store the metrics as negative to avoid + * the calculation in the task summary. `toApi` method in the `TaskDataWrapper` will make + * it the actual value.
+ */ + val taskMetrics: v1.TaskMetrics = if (hasMetrics && !info.successful) { + makeNegative(metrics) + } else { + metrics + } + new TaskDataWrapper( info.taskId, info.index, @@ -198,30 +213,31 @@ private class LiveTask( newAccumulatorInfos(info.accumulables), errorMessage, - metrics.executorDeserializeTime, - metrics.executorDeserializeCpuTime, - metrics.executorRunTime, - metrics.executorCpuTime, - metrics.resultSize, - metrics.jvmGcTime, - metrics.resultSerializationTime, - metrics.memoryBytesSpilled, - metrics.diskBytesSpilled, - metrics.peakExecutionMemory, - metrics.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead, - metrics.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten, - metrics.shuffleReadMetrics.remoteBlocksFetched, - metrics.shuffleReadMetrics.localBlocksFetched, - metrics.shuffleReadMetrics.fetchWaitTime, - metrics.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead, - metrics.shuffleWriteMetrics.bytesWritten, - metrics.shuffleWriteMetrics.writeTime, - metrics.shuffleWriteMetrics.recordsWritten, + hasMetrics, + taskMetrics.executorDeserializeTime, + taskMetrics.executorDeserializeCpuTime, + taskMetrics.executorRunTime, + taskMetrics.executorCpuTime, + taskMetrics.resultSize, + taskMetrics.jvmGcTime, + taskMetrics.resultSerializationTime, + taskMetrics.memoryBytesSpilled, + taskMetrics.diskBytesSpilled, + taskMetrics.peakExecutionMemory, + taskMetrics.inputMetrics.bytesRead, + taskMetrics.inputMetrics.recordsRead, + taskMetrics.outputMetrics.bytesWritten, + taskMetrics.outputMetrics.recordsWritten, + taskMetrics.shuffleReadMetrics.remoteBlocksFetched, + taskMetrics.shuffleReadMetrics.localBlocksFetched, + taskMetrics.shuffleReadMetrics.fetchWaitTime, + taskMetrics.shuffleReadMetrics.remoteBytesRead, + taskMetrics.shuffleReadMetrics.remoteBytesReadToDisk, + taskMetrics.shuffleReadMetrics.localBytesRead, 
+ taskMetrics.shuffleReadMetrics.recordsRead, + taskMetrics.shuffleWriteMetrics.bytesWritten, + taskMetrics.shuffleWriteMetrics.writeTime, + taskMetrics.shuffleWriteMetrics.recordsWritten, stageId, stageAttemptId) @@ -229,7 +245,7 @@ private class LiveTask( } -private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveEntity { +private[spark] class LiveExecutor(val executorId: String, _addTime: Long) extends LiveEntity { var hostPort: String = null var host: String = null @@ -458,7 +474,13 @@ private class LiveStage extends LiveEntity { } -private class LiveRDDPartition(val blockName: String) { +/** + * Data about a single partition of a cached RDD. The RDD storage level is used to compute the + * effective storage level of the partition, which takes into account the storage actually being + * used by the partition in the executors, and thus may differ from the storage level requested + * by the application. + */ +private class LiveRDDPartition(val blockName: String, rddLevel: StorageLevel) { import LiveEntityHelpers._ @@ -476,12 +498,13 @@ private class LiveRDDPartition(val blockName: String) { def update( executors: Seq[String], - storageLevel: String, memoryUsed: Long, diskUsed: Long): Unit = { + val level = StorageLevel(diskUsed > 0, memoryUsed > 0, rddLevel.useOffHeap, + if (memoryUsed > 0) rddLevel.deserialized else false, executors.size) value = new v1.RDDPartitionInfo( blockName, - weakIntern(storageLevel), + weakIntern(level.description), memoryUsed, diskUsed, executors) @@ -520,27 +543,31 @@ private class LiveRDDDistribution(exec: LiveExecutor) { } -private class LiveRDD(val info: RDDInfo) extends LiveEntity { +/** + * Tracker for data related to a persisted RDD. + * + * The RDD storage level is immutable, following the current behavior of `RDD.persist()`, even + * though it is mutable in the `RDDInfo` structure. 
Since the listener does not track unpersisted + * RDDs, this covers the case where an early stage is run on the unpersisted RDD, and a later stage + * it started after the RDD is marked for caching. + */ +private class LiveRDD(val info: RDDInfo, storageLevel: StorageLevel) extends LiveEntity { import LiveEntityHelpers._ - var storageLevel: String = weakIntern(info.storageLevel.description) var memoryUsed = 0L var diskUsed = 0L + private val levelDescription = weakIntern(storageLevel.description) private val partitions = new HashMap[String, LiveRDDPartition]() private val partitionSeq = new RDDPartitionSeq() private val distributions = new HashMap[String, LiveRDDDistribution]() - def setStorageLevel(level: String): Unit = { - this.storageLevel = weakIntern(level) - } - def partition(blockName: String): LiveRDDPartition = { partitions.getOrElseUpdate(blockName, { - val part = new LiveRDDPartition(blockName) - part.update(Nil, storageLevel, 0L, 0L) + val part = new LiveRDDPartition(blockName, storageLevel) + part.update(Nil, 0L, 0L) partitionSeq.addPartition(part) part }) @@ -578,7 +605,7 @@ private class LiveRDD(val info: RDDInfo) extends LiveEntity { info.name, info.numPartitions, partitions.size, - storageLevel, + levelDescription, memoryUsed, diskUsed, dists, @@ -599,10 +626,22 @@ private class SchedulerPool(name: String) extends LiveEntity { } -private object LiveEntityHelpers { +private[spark] object LiveEntityHelpers { private val stringInterner = Interners.newWeakInterner[String]() + private def accuValuetoString(value: Any): String = value match { + case list: java.util.List[_] => + // SPARK-30379: For collection accumulator, string representation might + // takes much more memory (e.g. long => string of it) and cause OOM. + // So we only show first few elements. + if (list.size() > 5) { + list.asScala.take(5).mkString("[", ",", "," + "... 
" + (list.size() - 5) + " more items]") + } else { + list.toString + } + case _ => value.toString + } def newAccumulatorInfos(accums: Iterable[AccumulableInfo]): Seq[v1.AccumulableInfo] = { accums @@ -615,8 +654,8 @@ private object LiveEntityHelpers { new v1.AccumulableInfo( acc.id, acc.name.map(weakIntern).orNull, - acc.update.map(_.toString()), - acc.value.map(_.toString()).orNull) + acc.update.map(accuValuetoString), + acc.value.map(accuValuetoString).orNull) } .toSeq } @@ -698,6 +737,46 @@ private object LiveEntityHelpers { addMetrics(m1, m2, -1) } + /** + * Convert all the metric values to negative as well as handle zero values. + * This method assumes that all the metric values are greater than or equal to zero + */ + def makeNegative(m: v1.TaskMetrics): v1.TaskMetrics = { + // To handle 0 metric value, add 1 and make the metric negative. + // To recover actual value do `math.abs(metric + 1)` + // Eg: if the metric values are (5, 3, 0, 1) => Updated metric values will be (-6, -4, -1, -2) + // To get actual metric value, do math.abs(metric + 1) => (5, 3, 0, 1) + def updateMetricValue(metric: Long): Long = { + metric * -1L - 1L + } + + createMetrics( + updateMetricValue(m.executorDeserializeTime), + updateMetricValue(m.executorDeserializeCpuTime), + updateMetricValue(m.executorRunTime), + updateMetricValue(m.executorCpuTime), + updateMetricValue(m.resultSize), + updateMetricValue(m.jvmGcTime), + updateMetricValue(m.resultSerializationTime), + updateMetricValue(m.memoryBytesSpilled), + updateMetricValue(m.diskBytesSpilled), + updateMetricValue(m.peakExecutionMemory), + updateMetricValue(m.inputMetrics.bytesRead), + updateMetricValue(m.inputMetrics.recordsRead), + updateMetricValue(m.outputMetrics.bytesWritten), + updateMetricValue(m.outputMetrics.recordsWritten), + updateMetricValue(m.shuffleReadMetrics.remoteBlocksFetched), + updateMetricValue(m.shuffleReadMetrics.localBlocksFetched), + updateMetricValue(m.shuffleReadMetrics.fetchWaitTime), + 
updateMetricValue(m.shuffleReadMetrics.remoteBytesRead), + updateMetricValue(m.shuffleReadMetrics.remoteBytesReadToDisk), + updateMetricValue(m.shuffleReadMetrics.localBytesRead), + updateMetricValue(m.shuffleReadMetrics.recordsRead), + updateMetricValue(m.shuffleWriteMetrics.bytesWritten), + updateMetricValue(m.shuffleWriteMetrics.writeTime), + updateMetricValue(m.shuffleWriteMetrics.recordsWritten)) + } + private def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics, mult: Int): v1.TaskMetrics = { createMetrics( m1.executorDeserializeTime + m2.executorDeserializeTime * mult, diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala index 2ee9d3d0815a1..cf5c759bebdbb 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/OneApplicationResource.scala @@ -25,6 +25,8 @@ import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.{JobExecutionStatus, SparkContext} +import org.apache.spark.status.api.v1 +import org.apache.spark.util.Utils @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class AbstractApplicationResource extends BaseAppResource { @@ -97,7 +99,15 @@ private[v1] class AbstractApplicationResource extends BaseAppResource { @GET @Path("environment") - def environmentInfo(): ApplicationEnvironmentInfo = withUI(_.store.environmentInfo()) + def environmentInfo(): ApplicationEnvironmentInfo = withUI { ui => + val envInfo = ui.store.environmentInfo() + new v1.ApplicationEnvironmentInfo( + envInfo.runtime, + Utils.redact(ui.conf, envInfo.sparkProperties), + Utils.redact(ui.conf, envInfo.hadoopProperties), + Utils.redact(ui.conf, envInfo.systemProperties), + envInfo.classpathEntries) + } @GET @Path("logs") diff --git 
a/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala new file mode 100644 index 0000000000000..f9fb78e65a3d9 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/api/v1/PrometheusResource.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.status.api.v1 + +import javax.ws.rs._ +import javax.ws.rs.core.MediaType + +import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} +import org.glassfish.jersey.server.ServerProperties +import org.glassfish.jersey.servlet.ServletContainer + +import org.apache.spark.ui.SparkUI + +/** + * This aims to expose Executor metrics like REST API which is documented in + * + * https://spark.apache.org/docs/3.0.0/monitoring.html#executor-metrics + * + * Note that this is based on ExecutorSummary which is different from ExecutorSource. 
+ */ +@Path("/executors") +private[v1] class PrometheusResource extends ApiRequestContext { + @GET + @Path("prometheus") + @Produces(Array(MediaType.TEXT_PLAIN)) + def executors(): String = { + val sb = new StringBuilder + val store = uiRoot.asInstanceOf[SparkUI].store + store.executorList(true).foreach { executor => + val prefix = "metrics_executor_" + val labels = Seq( + "application_id" -> store.applicationInfo.id, + "application_name" -> store.applicationInfo.name, + "executor_id" -> executor.id + ).map { case (k, v) => s"""$k="$v"""" }.mkString("{", ", ", "}") + sb.append(s"${prefix}rddBlocks_Count$labels ${executor.rddBlocks}\n") + sb.append(s"${prefix}memoryUsed_Count$labels ${executor.memoryUsed}\n") + sb.append(s"${prefix}diskUsed_Count$labels ${executor.diskUsed}\n") + sb.append(s"${prefix}totalCores_Count$labels ${executor.totalCores}\n") + sb.append(s"${prefix}maxTasks_Count$labels ${executor.maxTasks}\n") + sb.append(s"${prefix}activeTasks_Count$labels ${executor.activeTasks}\n") + sb.append(s"${prefix}failedTasks_Count$labels ${executor.failedTasks}\n") + sb.append(s"${prefix}completedTasks_Count$labels ${executor.completedTasks}\n") + sb.append(s"${prefix}totalTasks_Count$labels ${executor.totalTasks}\n") + sb.append(s"${prefix}totalDuration_Value$labels ${executor.totalDuration}\n") + sb.append(s"${prefix}totalGCTime_Value$labels ${executor.totalGCTime}\n") + sb.append(s"${prefix}totalInputBytes_Count$labels ${executor.totalInputBytes}\n") + sb.append(s"${prefix}totalShuffleRead_Count$labels ${executor.totalShuffleRead}\n") + sb.append(s"${prefix}totalShuffleWrite_Count$labels ${executor.totalShuffleWrite}\n") + sb.append(s"${prefix}maxMemory_Count$labels ${executor.maxMemory}\n") + executor.executorLogs.foreach { case (k, v) => } + executor.memoryMetrics.foreach { m => + sb.append(s"${prefix}usedOnHeapStorageMemory_Count$labels ${m.usedOnHeapStorageMemory}\n") + sb.append(s"${prefix}usedOffHeapStorageMemory_Count$labels 
${m.usedOffHeapStorageMemory}\n") + sb.append(s"${prefix}totalOnHeapStorageMemory_Count$labels ${m.totalOnHeapStorageMemory}\n") + sb.append(s"${prefix}totalOffHeapStorageMemory_Count$labels " + + s"${m.totalOffHeapStorageMemory}\n") + } + executor.peakMemoryMetrics.foreach { m => + val names = Array( + "JVMHeapMemory", + "JVMOffHeapMemory", + "OnHeapExecutionMemory", + "OffHeapExecutionMemory", + "OnHeapStorageMemory", + "OffHeapStorageMemory", + "OnHeapUnifiedMemory", + "OffHeapUnifiedMemory", + "DirectPoolMemory", + "MappedPoolMemory", + "ProcessTreeJVMVMemory", + "ProcessTreeJVMRSSMemory", + "ProcessTreePythonVMemory", + "ProcessTreePythonRSSMemory", + "ProcessTreeOtherVMemory", + "ProcessTreeOtherRSSMemory", + "MinorGCCount", + "MinorGCTime", + "MajorGCCount", + "MajorGCTime" + ) + names.foreach { name => + sb.append(s"$prefix${name}_Count$labels ${m.getMetricValue(name)}\n") + } + } + } + sb.toString + } +} + +private[spark] object PrometheusResource { + def getServletHandler(uiRoot: UIRoot): ServletContextHandler = { + val jerseyContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS) + jerseyContext.setContextPath("/metrics") + val holder: ServletHolder = new ServletHolder(classOf[ServletContainer]) + holder.setInitParameter(ServerProperties.PROVIDER_PACKAGES, "org.apache.spark.status.api.v1") + UIRootFromServletContext.setUiRoot(jerseyContext, uiRoot) + jerseyContext.addServlet(holder, "/*") + jerseyContext + } +} diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index 9da5bea8bf5c4..f0a94d84d8a04 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -177,10 +177,13 @@ private[spark] class TaskDataWrapper( val accumulatorUpdates: Seq[AccumulableInfo], val errorMessage: Option[String], + val hasMetrics: Boolean, // The following is an exploded view of a TaskMetrics API 
object. This saves 5 objects - // (= 80 bytes of Java object overhead) per instance of this wrapper. If the first value - // (executorDeserializeTime) is -1L, it means the metrics for this task have not been - // recorded. + // (= 80 bytes of Java object overhead) per instance of this wrapper. Non successful + // tasks' metrics will have negative values in `TaskDataWrapper`. `TaskData` will have + // actual metric values. To recover the actual metric value from `TaskDataWrapper`, + // need use `getMetricValue` method. If `hasMetrics` is false, it means the metrics + // for this task have not been recorded. @KVIndexParam(value = TaskIndexNames.DESER_TIME, parent = TaskIndexNames.STAGE) val executorDeserializeTime: Long, @KVIndexParam(value = TaskIndexNames.DESER_CPU_TIME, parent = TaskIndexNames.STAGE) @@ -233,39 +236,46 @@ private[spark] class TaskDataWrapper( val stageId: Int, val stageAttemptId: Int) { - def hasMetrics: Boolean = executorDeserializeTime >= 0 + // SPARK-26260: To handle non successful tasks metrics (Running, Failed, Killed). 
+ private def getMetricValue(metric: Long): Long = { + if (status != "SUCCESS") { + math.abs(metric + 1) + } else { + metric + } + } def toApi: TaskData = { val metrics = if (hasMetrics) { Some(new TaskMetrics( - executorDeserializeTime, - executorDeserializeCpuTime, - executorRunTime, - executorCpuTime, - resultSize, - jvmGcTime, - resultSerializationTime, - memoryBytesSpilled, - diskBytesSpilled, - peakExecutionMemory, + getMetricValue(executorDeserializeTime), + getMetricValue(executorDeserializeCpuTime), + getMetricValue(executorRunTime), + getMetricValue(executorCpuTime), + getMetricValue(resultSize), + getMetricValue(jvmGcTime), + getMetricValue(resultSerializationTime), + getMetricValue(memoryBytesSpilled), + getMetricValue(diskBytesSpilled), + getMetricValue(peakExecutionMemory), new InputMetrics( - inputBytesRead, - inputRecordsRead), + getMetricValue(inputBytesRead), + getMetricValue(inputRecordsRead)), new OutputMetrics( - outputBytesWritten, - outputRecordsWritten), + getMetricValue(outputBytesWritten), + getMetricValue(outputRecordsWritten)), new ShuffleReadMetrics( - shuffleRemoteBlocksFetched, - shuffleLocalBlocksFetched, - shuffleFetchWaitTime, - shuffleRemoteBytesRead, - shuffleRemoteBytesReadToDisk, - shuffleLocalBytesRead, - shuffleRecordsRead), + getMetricValue(shuffleRemoteBlocksFetched), + getMetricValue(shuffleLocalBlocksFetched), + getMetricValue(shuffleFetchWaitTime), + getMetricValue(shuffleRemoteBytesRead), + getMetricValue(shuffleRemoteBytesReadToDisk), + getMetricValue(shuffleLocalBytesRead), + getMetricValue(shuffleRecordsRead)), new ShuffleWriteMetrics( - shuffleBytesWritten, - shuffleWriteTime, - shuffleRecordsWritten))) + getMetricValue(shuffleBytesWritten), + getMetricValue(shuffleWriteTime), + getMetricValue(shuffleRecordsWritten)))) } else { None } @@ -296,8 +306,10 @@ private[spark] class TaskDataWrapper( @JsonIgnore @KVIndex(value = TaskIndexNames.SCHEDULER_DELAY, parent = TaskIndexNames.STAGE) def schedulerDelay: Long = { if 
(hasMetrics) { - AppStatusUtils.schedulerDelay(launchTime, resultFetchStart, duration, executorDeserializeTime, - resultSerializationTime, executorRunTime) + AppStatusUtils.schedulerDelay(launchTime, resultFetchStart, duration, + getMetricValue(executorDeserializeTime), + getMetricValue(resultSerializationTime), + getMetricValue(executorRunTime)) } else { -1L } @@ -330,7 +342,7 @@ private[spark] class TaskDataWrapper( @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_READS, parent = TaskIndexNames.STAGE) private def shuffleTotalReads: Long = { if (hasMetrics) { - shuffleLocalBytesRead + shuffleRemoteBytesRead + getMetricValue(shuffleLocalBytesRead) + getMetricValue(shuffleRemoteBytesRead) } else { -1L } @@ -339,7 +351,7 @@ private[spark] class TaskDataWrapper( @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_BLOCKS, parent = TaskIndexNames.STAGE) private def shuffleTotalBlocks: Long = { if (hasMetrics) { - shuffleLocalBlocksFetched + shuffleRemoteBlocksFetched + getMetricValue(shuffleLocalBlocksFetched) + getMetricValue(shuffleRemoteBlocksFetched) } else { -1L } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockId.scala b/core/src/main/scala/org/apache/spark/storage/BlockId.scala index 7ac2c71c18eb3..68ed3aa5b062f 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockId.scala @@ -38,7 +38,7 @@ sealed abstract class BlockId { // convenience methods def asRDDId: Option[RDDBlockId] = if (isRDD) Some(asInstanceOf[RDDBlockId]) else None def isRDD: Boolean = isInstanceOf[RDDBlockId] - def isShuffle: Boolean = isInstanceOf[ShuffleBlockId] + def isShuffle: Boolean = isInstanceOf[ShuffleBlockId] || isInstanceOf[ShuffleBlockBatchId] def isBroadcast: Boolean = isInstanceOf[BroadcastBlockId] override def toString: String = name @@ -52,17 +52,29 @@ case class RDDBlockId(rddId: Int, splitIndex: Int) extends BlockId { // Format of the shuffle block ids (including data and 
index) should be kept in sync with // org.apache.spark.network.shuffle.ExternalShuffleBlockResolver#getBlockData(). @DeveloperApi -case class ShuffleBlockId(shuffleId: Int, mapId: Int, reduceId: Int) extends BlockId { +case class ShuffleBlockId(shuffleId: Int, mapId: Long, reduceId: Int) extends BlockId { override def name: String = "shuffle_" + shuffleId + "_" + mapId + "_" + reduceId } +// The batch id of continuous shuffle blocks of same mapId in range [startReduceId, endReduceId). @DeveloperApi -case class ShuffleDataBlockId(shuffleId: Int, mapId: Int, reduceId: Int) extends BlockId { +case class ShuffleBlockBatchId( + shuffleId: Int, + mapId: Long, + startReduceId: Int, + endReduceId: Int) extends BlockId { + override def name: String = { + "shuffle_" + shuffleId + "_" + mapId + "_" + startReduceId + "_" + endReduceId + } +} + +@DeveloperApi +case class ShuffleDataBlockId(shuffleId: Int, mapId: Long, reduceId: Int) extends BlockId { override def name: String = "shuffle_" + shuffleId + "_" + mapId + "_" + reduceId + ".data" } @DeveloperApi -case class ShuffleIndexBlockId(shuffleId: Int, mapId: Int, reduceId: Int) extends BlockId { +case class ShuffleIndexBlockId(shuffleId: Int, mapId: Long, reduceId: Int) extends BlockId { override def name: String = "shuffle_" + shuffleId + "_" + mapId + "_" + reduceId + ".index" } @@ -104,6 +116,7 @@ class UnrecognizedBlockId(name: String) object BlockId { val RDD = "rdd_([0-9]+)_([0-9]+)".r val SHUFFLE = "shuffle_([0-9]+)_([0-9]+)_([0-9]+)".r + val SHUFFLE_BATCH = "shuffle_([0-9]+)_([0-9]+)_([0-9]+)_([0-9]+)".r val SHUFFLE_DATA = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).data".r val SHUFFLE_INDEX = "shuffle_([0-9]+)_([0-9]+)_([0-9]+).index".r val BROADCAST = "broadcast_([0-9]+)([_A-Za-z0-9]*)".r @@ -117,11 +130,13 @@ object BlockId { case RDD(rddId, splitIndex) => RDDBlockId(rddId.toInt, splitIndex.toInt) case SHUFFLE(shuffleId, mapId, reduceId) => - ShuffleBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt) + 
ShuffleBlockId(shuffleId.toInt, mapId.toLong, reduceId.toInt) + case SHUFFLE_BATCH(shuffleId, mapId, startReduceId, endReduceId) => + ShuffleBlockBatchId(shuffleId.toInt, mapId.toLong, startReduceId.toInt, endReduceId.toInt) case SHUFFLE_DATA(shuffleId, mapId, reduceId) => - ShuffleDataBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt) + ShuffleDataBlockId(shuffleId.toInt, mapId.toLong, reduceId.toInt) case SHUFFLE_INDEX(shuffleId, mapId, reduceId) => - ShuffleIndexBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt) + ShuffleIndexBlockId(shuffleId.toInt, mapId.toLong, reduceId.toInt) case BROADCAST(broadcastId, field) => BroadcastBlockId(broadcastId.toLong, field.stripPrefix("_")) case TASKRESULT(taskId) => diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 4b71dc1fff345..c47901314f53a 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -22,17 +22,18 @@ import java.lang.ref.{ReferenceQueue => JReferenceQueue, WeakReference} import java.nio.ByteBuffer import java.nio.channels.Channels import java.util.Collections -import java.util.concurrent.{ConcurrentHashMap, TimeUnit} +import java.util.concurrent.{CompletableFuture, ConcurrentHashMap, TimeUnit} import scala.collection.mutable import scala.collection.mutable.HashMap import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration._ import scala.reflect.ClassTag -import scala.util.Random +import scala.util.{Failure, Random, Success, Try} import scala.util.control.NonFatal import com.codahale.metrics.{MetricRegistry, MetricSet} +import com.google.common.cache.CacheBuilder import org.apache.commons.io.IOUtils import org.apache.spark._ @@ -113,6 +114,47 @@ private[spark] class ByteBufferBlockData( } +private[spark] class HostLocalDirManager( + futureExecutionContext: ExecutionContext, + cacheSize: Int, 
+ externalBlockStoreClient: ExternalBlockStoreClient, + host: String, + externalShuffleServicePort: Int) extends Logging { + + private val executorIdToLocalDirsCache = + CacheBuilder + .newBuilder() + .maximumSize(cacheSize) + .build[String, Array[String]]() + + private[spark] def getCachedHostLocalDirs() + : scala.collection.Map[String, Array[String]] = executorIdToLocalDirsCache.synchronized { + import scala.collection.JavaConverters._ + return executorIdToLocalDirsCache.asMap().asScala + } + + private[spark] def getHostLocalDirs( + executorIds: Array[String])( + callback: Try[java.util.Map[String, Array[String]]] => Unit): Unit = { + val hostLocalDirsCompletable = new CompletableFuture[java.util.Map[String, Array[String]]] + externalBlockStoreClient.getHostLocalDirs( + host, + externalShuffleServicePort, + executorIds, + hostLocalDirsCompletable) + hostLocalDirsCompletable.whenComplete { (hostLocalDirs, throwable) => + if (hostLocalDirs != null) { + callback(Success(hostLocalDirs)) + executorIdToLocalDirsCache.synchronized { + executorIdToLocalDirsCache.putAll(hostLocalDirs) + } + } else { + callback(Failure(throwable)) + } + } + } +} + /** * Manager running on every node (driver and executors) which provides interfaces for putting and * retrieving blocks both locally and remotely into various stores (memory, disk, and off-heap). @@ -206,6 +248,8 @@ private[spark] class BlockManager( new BlockManager.RemoteBlockDownloadFileManager(this) private val maxRemoteBlockToMem = conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM) + var hostLocalDirManager: Option[HostLocalDirManager] = None + /** * Abstraction for storing blocks from bytes, whether they start in memory or on disk. 
* @@ -433,6 +477,21 @@ private[spark] class BlockManager( registerWithExternalShuffleServer() } + hostLocalDirManager = + if (conf.get(config.SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED) && + !conf.get(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL)) { + externalBlockStoreClient.map { blockStoreClient => + new HostLocalDirManager( + futureExecutionContext, + conf.get(config.STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE), + blockStoreClient, + blockManagerId.host, + externalShuffleServicePort) + } + } else { + None + } + logInfo(s"Initialized BlockManager: $blockManagerId") } @@ -446,7 +505,7 @@ private[spark] class BlockManager( } } - private def registerWithExternalShuffleServer() { + private def registerWithExternalShuffleServer(): Unit = { logInfo("Registering executor with local external shuffle service.") val shuffleConfig = new ExecutorShuffleInfo( diskBlockManager.localDirsString, @@ -542,13 +601,19 @@ private[spark] class BlockManager( } } + override def getHostLocalShuffleData( + blockId: BlockId, + dirs: Array[String]): ManagedBuffer = { + shuffleManager.shuffleBlockResolver.getBlockData(blockId, Some(dirs)) + } + /** * Interface to get local block data. Throws an exception if the block cannot be found or * cannot be read successfully. 
*/ - override def getBlockData(blockId: BlockId): ManagedBuffer = { + override def getLocalBlockData(blockId: BlockId): ManagedBuffer = { if (blockId.isShuffle) { - shuffleManager.shuffleBlockResolver.getBlockData(blockId.asInstanceOf[ShuffleBlockId]) + shuffleManager.shuffleBlockResolver.getBlockData(blockId) } else { getLocalBytes(blockId) match { case Some(blockData) => @@ -853,7 +918,6 @@ private[spark] class BlockManager( * @param bufferTransformer this transformer expected to open the file if the block is backed by a * file by this it is guaranteed the whole content can be loaded * @tparam T result type - * @return */ private[spark] def getRemoteBlock[T]( blockId: BlockId, @@ -1725,15 +1789,23 @@ private[spark] class BlockManager( * lock on the block. */ private def removeBlockInternal(blockId: BlockId, tellMaster: Boolean): Unit = { + val blockStatus = if (tellMaster) { + val blockInfo = blockInfoManager.assertBlockIsLockedForWriting(blockId) + Some(getCurrentBlockStatus(blockId, blockInfo)) + } else None + // Removals are idempotent in disk store and memory store. At worst, we get a warning. val removedFromMemory = memoryStore.remove(blockId) val removedFromDisk = diskStore.remove(blockId) if (!removedFromMemory && !removedFromDisk) { logWarning(s"Block $blockId could not be removed as it was not found on disk or in memory") } + blockInfoManager.removeBlock(blockId) if (tellMaster) { - reportBlockStatus(blockId, BlockStatus.empty) + // Only update storage level from the captured block status before deleting, so that + // memory size and disk size are being kept for calculating delta. 
+ reportBlockStatus(blockId, blockStatus.get.copy(storageLevel = StorageLevel.NONE)) } } @@ -1831,7 +1903,7 @@ private[spark] object BlockManager { private val POLL_TIMEOUT = 1000 @volatile private var stopped = false - private val cleaningThread = new Thread() { override def run() { keepCleaning() } } + private val cleaningThread = new Thread() { override def run(): Unit = { keepCleaning() } } cleaningThread.setDaemon(true) cleaningThread.setName("RemoteBlock-temp-file-clean-thread") cleaningThread.start() diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala index d188bdd912e5e..49e32d04d450a 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerId.scala @@ -27,7 +27,7 @@ import org.apache.spark.util.Utils /** * :: DeveloperApi :: - * This class represent an unique identifier for a BlockManager. + * This class represent a unique identifier for a BlockManager. * * The first 2 constructors of this class are made private to ensure that BlockManagerId objects * can be created only using the apply method in the companion object. 
This allows de-duplication diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index 9d13fedfb0c58..e335eb6ddb761 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -17,8 +17,8 @@ package org.apache.spark.storage -import scala.collection.Iterable import scala.collection.generic.CanBuildFrom +import scala.collection.immutable.Iterable import scala.concurrent.Future import org.apache.spark.{SparkConf, SparkException} @@ -30,6 +30,7 @@ import org.apache.spark.util.{RpcUtils, ThreadUtils} private[spark] class BlockManagerMaster( var driverEndpoint: RpcEndpointRef, + var driverHeartbeatEndPoint: RpcEndpointRef, conf: SparkConf, isDriver: Boolean) extends Logging { @@ -37,7 +38,7 @@ class BlockManagerMaster( val timeout = RpcUtils.askRpcTimeout(conf) /** Remove a dead executor from the driver endpoint. This is only called on the driver side. */ - def removeExecutor(execId: String) { + def removeExecutor(execId: String): Unit = { tell(RemoveExecutor(execId)) logInfo("Removed " + execId + " successfully in removeExecutor") } @@ -45,7 +46,7 @@ class BlockManagerMaster( /** Request removal of a dead executor from the driver endpoint. * This is only called on the driver side. Non-blocking */ - def removeExecutorAsync(execId: String) { + def removeExecutorAsync(execId: String): Unit = { driverEndpoint.ask[Boolean](RemoveExecutor(execId)) logInfo("Removal of executor " + execId + " requested") } @@ -120,12 +121,12 @@ class BlockManagerMaster( * Remove a block from the slaves that have it. This can only be used to remove * blocks that the driver knows about. */ - def removeBlock(blockId: BlockId) { + def removeBlock(blockId: BlockId): Unit = { driverEndpoint.askSync[Boolean](RemoveBlock(blockId)) } /** Remove all blocks belonging to the given RDD. 
*/ - def removeRdd(rddId: Int, blocking: Boolean) { + def removeRdd(rddId: Int, blocking: Boolean): Unit = { val future = driverEndpoint.askSync[Future[Seq[Int]]](RemoveRdd(rddId)) future.failed.foreach(e => logWarning(s"Failed to remove RDD $rddId - ${e.getMessage}", e) @@ -136,7 +137,7 @@ class BlockManagerMaster( } /** Remove all blocks belonging to the given shuffle. */ - def removeShuffle(shuffleId: Int, blocking: Boolean) { + def removeShuffle(shuffleId: Int, blocking: Boolean): Unit = { val future = driverEndpoint.askSync[Future[Seq[Boolean]]](RemoveShuffle(shuffleId)) future.failed.foreach(e => logWarning(s"Failed to remove shuffle $shuffleId - ${e.getMessage}", e) @@ -147,7 +148,7 @@ class BlockManagerMaster( } /** Remove all blocks belonging to the given broadcast. */ - def removeBroadcast(broadcastId: Long, removeFromMaster: Boolean, blocking: Boolean) { + def removeBroadcast(broadcastId: Long, removeFromMaster: Boolean, blocking: Boolean): Unit = { val future = driverEndpoint.askSync[Future[Seq[Int]]]( RemoveBroadcast(broadcastId, removeFromMaster)) future.failed.foreach(e => @@ -200,7 +201,7 @@ class BlockManagerMaster( Option[BlockStatus], Iterable[Option[BlockStatus]]]] val blockStatus = timeout.awaitResult( - Future.sequence[Option[BlockStatus], Iterable](futures)(cbf, ThreadUtils.sameThread)) + Future.sequence(futures)(cbf, ThreadUtils.sameThread)) if (blockStatus == null) { throw new SparkException("BlockManager returned null for BlockStatus query: " + blockId) } @@ -226,16 +227,21 @@ class BlockManagerMaster( } /** Stop the driver endpoint, called only on the Spark driver node */ - def stop() { + def stop(): Unit = { if (driverEndpoint != null && isDriver) { tell(StopBlockManagerMaster) driverEndpoint = null + if (driverHeartbeatEndPoint.askSync[Boolean](StopBlockManagerMaster)) { + driverHeartbeatEndPoint = null + } else { + logWarning("Failed to stop BlockManagerMasterHeartbeatEndpoint") + } logInfo("BlockManagerMaster stopped") } } /** Send a 
one-way message to the master endpoint, to which we expect it to reply with true. */ - private def tell(message: Any) { + private def tell(message: Any): Unit = { if (!driverEndpoint.askSync[Boolean](message)) { throw new SparkException("BlockManagerMasterEndpoint returned false, expected true.") } @@ -245,4 +251,5 @@ class BlockManagerMaster( private[spark] object BlockManagerMaster { val DRIVER_ENDPOINT_NAME = "BlockManagerMaster" + val DRIVER_HEARTBEAT_ENDPOINT_NAME = "BlockManagerMasterHeartbeat" } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 5e021b334fd2b..d7f7eedc7f33b 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -26,17 +26,19 @@ import scala.collection.mutable import scala.concurrent.{ExecutionContext, Future} import scala.util.Random +import com.google.common.cache.CacheBuilder + import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.{config, Logging} import org.apache.spark.network.shuffle.ExternalBlockStoreClient -import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} +import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcCallContext, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler._ import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{RpcUtils, ThreadUtils, Utils} /** - * BlockManagerMasterEndpoint is an [[ThreadSafeRpcEndpoint]] on the master node to track statuses + * BlockManagerMasterEndpoint is an [[IsolatedRpcEndpoint]] on the master node to track statuses * of all slaves' block managers. 
*/ private[spark] @@ -45,11 +47,16 @@ class BlockManagerMasterEndpoint( val isLocal: Boolean, conf: SparkConf, listenerBus: LiveListenerBus, - externalBlockStoreClient: Option[ExternalBlockStoreClient]) - extends ThreadSafeRpcEndpoint with Logging { + externalBlockStoreClient: Option[ExternalBlockStoreClient], + blockManagerInfo: mutable.Map[BlockManagerId, BlockManagerInfo]) + extends IsolatedRpcEndpoint with Logging { - // Mapping from block manager id to the block manager's information. - private val blockManagerInfo = new mutable.HashMap[BlockManagerId, BlockManagerInfo] + // Mapping from executor id to the block manager's local disk directories. + private val executorIdToLocalDirs = + CacheBuilder + .newBuilder() + .maximumSize(conf.get(config.STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE)) + .build[String, Array[String]]() // Mapping from external shuffle service block manager id to the block statuses. private val blockStatusByShuffleService = @@ -91,8 +98,13 @@ class BlockManagerMasterEndpoint( case _updateBlockInfo @ UpdateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) => - context.reply(updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size)) - listenerBus.post(SparkListenerBlockUpdated(BlockUpdatedInfo(_updateBlockInfo))) + val isSuccess = updateBlockInfo(blockManagerId, blockId, storageLevel, deserializedSize, size) + context.reply(isSuccess) + // SPARK-30594: we should not post `SparkListenerBlockUpdated` when updateBlockInfo + // returns false since the block info would be updated again later. 
+ if (isSuccess) { + listenerBus.post(SparkListenerBlockUpdated(BlockUpdatedInfo(_updateBlockInfo))) + } case GetLocations(blockId) => context.reply(getLocations(blockId)) @@ -144,9 +156,6 @@ class BlockManagerMasterEndpoint( case StopBlockManagerMaster => context.reply(true) stop() - - case BlockManagerHeartbeat(blockManagerId) => - context.reply(heartbeatReceived(blockManagerId)) } private def removeRdd(rddId: Int): Future[Seq[Int]] = { @@ -243,7 +252,7 @@ class BlockManagerMasterEndpoint( Future.sequence(futures) } - private def removeBlockManager(blockManagerId: BlockManagerId) { + private def removeBlockManager(blockManagerId: BlockManagerId): Unit = { val info = blockManagerInfo(blockManagerId) // Remove the block manager from blockManagerIdByExecutor. @@ -285,27 +294,14 @@ class BlockManagerMasterEndpoint( } - private def removeExecutor(execId: String) { + private def removeExecutor(execId: String): Unit = { logInfo("Trying to remove executor " + execId + " from BlockManagerMaster.") blockManagerIdByExecutor.get(execId).foreach(removeBlockManager) } - /** - * Return true if the driver knows about the given block manager. Otherwise, return false, - * indicating that the block manager should re-register. - */ - private def heartbeatReceived(blockManagerId: BlockManagerId): Boolean = { - if (!blockManagerInfo.contains(blockManagerId)) { - blockManagerId.isDriver && !isLocal - } else { - blockManagerInfo(blockManagerId).updateLastSeenMs() - true - } - } - // Remove a block from the slaves that have it. This can only be used to remove // blocks that the master knows about. 
- private def removeBlockFromWorkers(blockId: BlockId) { + private def removeBlockFromWorkers(blockId: BlockId): Unit = { val locations = blockLocations.get(blockId) if (locations != null) { locations.foreach { blockManagerId: BlockManagerId => @@ -411,6 +407,7 @@ class BlockManagerMasterEndpoint( topologyMapper.getTopologyForHost(idWithoutTopologyInfo.host)) val time = System.currentTimeMillis() + executorIdToLocalDirs.put(id.executorId, localDirs) if (!blockManagerInfo.contains(id)) { blockManagerIdByExecutor.get(id.executorId) match { case Some(oldId) => @@ -434,7 +431,7 @@ class BlockManagerMasterEndpoint( None } - blockManagerInfo(id) = new BlockManagerInfo(id, System.currentTimeMillis(), localDirs, + blockManagerInfo(id) = new BlockManagerInfo(id, System.currentTimeMillis(), maxOnHeapMemSize, maxOffHeapMemSize, slaveEndpoint, externalShuffleServiceBlockStatus) } listenerBus.post(SparkListenerBlockManagerAdded(time, id, maxOnHeapMemSize + maxOffHeapMemSize, @@ -514,15 +511,16 @@ class BlockManagerMasterEndpoint( if (locations.nonEmpty && status.isDefined) { val localDirs = locations.find { loc => - if (loc.port != externalShuffleServicePort && loc.host == requesterHost) { + // When the external shuffle service running on the same host is found among the block + // locations then the block must be persisted on the disk. In this case the executorId + // can be used to access this block even when the original executor is already stopped. 
+ loc.host == requesterHost && + (loc.port == externalShuffleServicePort || blockManagerInfo .get(loc) .flatMap(_.getStatus(blockId).map(_.storageLevel.useDisk)) - .getOrElse(false) - } else { - false - } - }.map(blockManagerInfo(_).localDirs) + .getOrElse(false)) + }.flatMap { bmId => Option(executorIdToLocalDirs.getIfPresent(bmId.executorId)) } Some(BlockLocationsAndStatus(locations, status.get, localDirs)) } else { None @@ -574,7 +572,6 @@ object BlockStatus { private[spark] class BlockManagerInfo( val blockManagerId: BlockManagerId, timeMs: Long, - val localDirs: Array[String], val maxOnHeapMem: Long, val maxOffHeapMem: Long, val slaveEndpoint: RpcEndpointRef, @@ -593,7 +590,7 @@ private[spark] class BlockManagerInfo( def getStatus(blockId: BlockId): Option[BlockStatus] = Option(_blocks.get(blockId)) - def updateLastSeenMs() { + def updateLastSeenMs(): Unit = { _lastSeenMs = System.currentTimeMillis() } @@ -601,7 +598,7 @@ private[spark] class BlockManagerInfo( blockId: BlockId, storageLevel: StorageLevel, memSize: Long, - diskSize: Long) { + diskSize: Long): Unit = { updateLastSeenMs() @@ -681,7 +678,7 @@ private[spark] class BlockManagerInfo( } } - def removeBlock(blockId: BlockId) { + def removeBlock(blockId: BlockId): Unit = { if (_blocks.containsKey(blockId)) { _remainingMem += _blocks.get(blockId).memSize _blocks.remove(blockId) @@ -699,7 +696,7 @@ private[spark] class BlockManagerInfo( override def toString: String = "BlockManagerInfo " + timeMs + " " + _remainingMem - def clear() { + def clear(): Unit = { _blocks.clear() } } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterHeartbeatEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterHeartbeatEndpoint.scala new file mode 100644 index 0000000000000..b06002123d803 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterHeartbeatEndpoint.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one 
or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.storage + +import scala.collection.mutable + +import org.apache.spark.internal.Logging +import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} +import org.apache.spark.storage.BlockManagerMessages.{BlockManagerHeartbeat, StopBlockManagerMaster} + +/** + * Separate heartbeat out of BlockManagerMasterEndpoint due to performance consideration. + */ +private[spark] class BlockManagerMasterHeartbeatEndpoint( + override val rpcEnv: RpcEnv, + isLocal: Boolean, + blockManagerInfo: mutable.Map[BlockManagerId, BlockManagerInfo]) + extends ThreadSafeRpcEndpoint with Logging { + + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case BlockManagerHeartbeat(blockManagerId) => + context.reply(heartbeatReceived(blockManagerId)) + + case StopBlockManagerMaster => + stop() + context.reply(true) + + case _ => // do nothing for unexpected events + } + + /** + * Return true if the driver knows about the given block manager. Otherwise, return false, + * indicating that the block manager should re-register. 
+ */ + private def heartbeatReceived(blockManagerId: BlockManagerId): Boolean = { + if (!blockManagerInfo.contains(blockManagerId)) { + blockManagerId.isDriver && !isLocal + } else { + blockManagerInfo(blockManagerId).updateLastSeenMs() + true + } + } +} diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala index 67544b20408a6..29e21142ce449 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerSlaveEndpoint.scala @@ -21,7 +21,7 @@ import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging -import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} +import org.apache.spark.rpc.{IsolatedRpcEndpoint, RpcCallContext, RpcEnv} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} @@ -34,7 +34,7 @@ class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) - extends ThreadSafeRpcEndpoint with Logging { + extends IsolatedRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool", 100) @@ -80,7 +80,7 @@ class BlockManagerSlaveEndpoint( } - private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { + private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T): Unit = { val future = Future { logDebug(actionMessage) body diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala index c3990bf71e604..ee43b76e17010 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala +++ 
b/core/src/main/scala/org/apache/spark/storage/DiskBlockManager.scala @@ -20,6 +20,8 @@ package org.apache.spark.storage import java.io.{File, IOException} import java.util.UUID +import scala.util.control.NonFatal + import org.apache.spark.SparkConf import org.apache.spark.executor.ExecutorExitCode import org.apache.spark.internal.{config, Logging} @@ -117,20 +119,38 @@ private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolea /** Produces a unique block id and File suitable for storing local intermediate results. */ def createTempLocalBlock(): (TempLocalBlockId, File) = { - var blockId = new TempLocalBlockId(UUID.randomUUID()) - while (getFile(blockId).exists()) { - blockId = new TempLocalBlockId(UUID.randomUUID()) + var blockId = TempLocalBlockId(UUID.randomUUID()) + var tempLocalFile = getFile(blockId) + var count = 0 + while (!canCreateFile(tempLocalFile) && count < Utils.MAX_DIR_CREATION_ATTEMPTS) { + blockId = TempLocalBlockId(UUID.randomUUID()) + tempLocalFile = getFile(blockId) + count += 1 } - (blockId, getFile(blockId)) + (blockId, tempLocalFile) } /** Produces a unique block id and File suitable for storing shuffled intermediate results. 
*/ def createTempShuffleBlock(): (TempShuffleBlockId, File) = { - var blockId = new TempShuffleBlockId(UUID.randomUUID()) - while (getFile(blockId).exists()) { - blockId = new TempShuffleBlockId(UUID.randomUUID()) + var blockId = TempShuffleBlockId(UUID.randomUUID()) + var tempShuffleFile = getFile(blockId) + var count = 0 + while (!canCreateFile(tempShuffleFile) && count < Utils.MAX_DIR_CREATION_ATTEMPTS) { + blockId = TempShuffleBlockId(UUID.randomUUID()) + tempShuffleFile = getFile(blockId) + count += 1 + } + (blockId, tempShuffleFile) + } + + private def canCreateFile(file: File): Boolean = { + try { + file.createNewFile() + } catch { + case NonFatal(_) => + logError("Failed to create temporary block file: " + file.getAbsoluteFile) + false } - (blockId, getFile(blockId)) } /** @@ -161,7 +181,7 @@ private[spark] class DiskBlockManager(conf: SparkConf, deleteFilesOnStop: Boolea } /** Cleanup local dirs and stop shuffle sender. */ - private[spark] def stop() { + private[spark] def stop(): Unit = { // Remove the shutdown hook. It causes memory leaks if we leave it around. 
try { ShutdownHookManager.removeShutdownHook(shutdownHook) diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala index 758621c52495b..e55c09274cd9a 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala @@ -18,7 +18,7 @@ package org.apache.spark.storage import java.io.{BufferedOutputStream, File, FileOutputStream, OutputStream} -import java.nio.channels.FileChannel +import java.nio.channels.{ClosedByInterruptException, FileChannel} import org.apache.spark.internal.Logging import org.apache.spark.serializer.{SerializationStream, SerializerInstance, SerializerManager} @@ -150,7 +150,7 @@ private[spark] class DiskBlockObjectWriter( /** * Commits any remaining partial writes and closes resources. */ - override def close() { + override def close(): Unit = { if (initialized) { Utils.tryWithSafeFinally { commitAndGet() @@ -219,6 +219,12 @@ private[spark] class DiskBlockObjectWriter( truncateStream = new FileOutputStream(file, true) truncateStream.getChannel.truncate(committedPosition) } catch { + // ClosedByInterruptException is an excepted exception when kill task, + // don't log the exception stack trace to avoid confusing users. + // See: SPARK-28340 + case ce: ClosedByInterruptException => + logError("Exception occurred while reverting partial writes to file " + + file + ", " + ce.getMessage) case e: Exception => logError("Uncaught exception while reverting partial writes to file " + file, e) } finally { @@ -234,7 +240,7 @@ private[spark] class DiskBlockObjectWriter( /** * Writes a key-value pair. 
*/ - override def write(key: Any, value: Any) { + override def write(key: Any, value: Any): Unit = { if (!streamOpen) { open() } @@ -270,14 +276,14 @@ private[spark] class DiskBlockObjectWriter( * Report the number of bytes written in this writer's shuffle write metrics. * Note that this is only valid before the underlying streams are closed. */ - private def updateBytesWritten() { + private def updateBytesWritten(): Unit = { val pos = channel.position() writeMetrics.incBytesWritten(pos - reportedPosition) reportedPosition = pos } // For testing - private[spark] override def flush() { + private[spark] override def flush(): Unit = { objOut.flush() bs.flush() } diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index a5b7ee5762c49..cd4c86006af5a 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -18,11 +18,13 @@ package org.apache.spark.storage import java.io.{InputStream, IOException} +import java.nio.channels.ClosedByInterruptException import java.util.concurrent.{LinkedBlockingQueue, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.mutable -import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Queue} +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, LinkedHashMap, Queue} +import scala.util.{Failure, Success} import org.apache.commons.io.IOUtils @@ -48,9 +50,10 @@ import org.apache.spark.util.{CompletionIterator, TaskCompletionListener, Utils} * @param shuffleClient [[BlockStoreClient]] for fetching remote blocks * @param blockManager [[BlockManager]] for reading local blocks * @param blocksByAddress list of blocks to fetch grouped by the [[BlockManagerId]]. 
- * For each block we also require the size (in bytes as a long field) in - * order to throttle the memory usage. Note that zero-sized blocks are - * already excluded, which happened in + * For each block we also require two info: 1. the size (in bytes as a long + * field) in order to throttle the memory usage; 2. the mapIndex for this + * block, which indicate the index in the map stage. + * Note that zero-sized blocks are already excluded, which happened in * [[org.apache.spark.MapOutputTracker.convertMapStatuses]]. * @param streamWrapper A function to wrap the returned input stream. * @param maxBytesInFlight max size (in bytes) of remote blocks to fetch at any given point. @@ -60,13 +63,15 @@ import org.apache.spark.util.{CompletionIterator, TaskCompletionListener, Utils} * @param maxReqSizeShuffleToMem max size (in bytes) of a request that can be shuffled to memory. * @param detectCorrupt whether to detect any corruption in fetched blocks. * @param shuffleMetrics used to report shuffle metrics. + * @param doBatchFetch fetch continuous shuffle blocks from same executor in batch if the server + * side supports. 
*/ private[spark] final class ShuffleBlockFetcherIterator( context: TaskContext, shuffleClient: BlockStoreClient, blockManager: BlockManager, - blocksByAddress: Iterator[(BlockManagerId, Seq[(BlockId, Long)])], + blocksByAddress: Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])], streamWrapper: (BlockId, InputStream) => InputStream, maxBytesInFlight: Long, maxReqsInFlight: Int, @@ -74,16 +79,20 @@ final class ShuffleBlockFetcherIterator( maxReqSizeShuffleToMem: Long, detectCorrupt: Boolean, detectCorruptUseExtraMemory: Boolean, - shuffleMetrics: ShuffleReadMetricsReporter) + shuffleMetrics: ShuffleReadMetricsReporter, + doBatchFetch: Boolean) extends Iterator[(BlockId, InputStream)] with DownloadFileManager with Logging { import ShuffleBlockFetcherIterator._ + // Make remote requests at most maxBytesInFlight / 5 in length; the reason to keep them + // smaller than maxBytesInFlight is to allow multiple, parallel fetches from up to 5 + // nodes, rather than blocking on reading output from one node. + private val targetRemoteRequestSize = math.max(maxBytesInFlight / 5, 1L) + /** * Total number of blocks to fetch. This should be equal to the total number of blocks * in [[blocksByAddress]] because we already filter out zero-sized blocks in [[blocksByAddress]]. - * - * This should equal localBlocks.size + remoteBlocks.size. */ private[this] var numBlocksToFetch = 0 @@ -96,10 +105,14 @@ final class ShuffleBlockFetcherIterator( private[this] val startTimeNs = System.nanoTime() /** Local blocks to fetch, excluding zero-sized blocks. */ - private[this] val localBlocks = scala.collection.mutable.LinkedHashSet[BlockId]() + private[this] val localBlocks = scala.collection.mutable.LinkedHashSet[(BlockId, Int)]() + + /** Host local blockIds to fetch by executors, excluding zero-sized blocks. */ + private[this] val hostLocalBlocksByExecutor = + LinkedHashMap[BlockManagerId, Seq[(BlockId, Long, Int)]]() - /** Remote blocks to fetch, excluding zero-sized blocks. 
*/ - private[this] val remoteBlocks = new HashSet[BlockId]() + /** Host local blocks to fetch, excluding zero-sized blocks. */ + private[this] val hostLocalBlocks = scala.collection.mutable.LinkedHashSet[(BlockId, Int)]() /** * A queue to hold our results. This turns the asynchronous model provided by @@ -188,7 +201,7 @@ final class ShuffleBlockFetcherIterator( /** * Mark the iterator as zombie, and release all buffers that haven't been deserialized yet. */ - private[storage] def cleanup() { + private[storage] def cleanup(): Unit = { synchronized { isZombie = true } @@ -198,7 +211,7 @@ final class ShuffleBlockFetcherIterator( while (iter.hasNext) { val result = iter.next() result match { - case SuccessFetchResult(_, address, _, buf, _) => + case SuccessFetchResult(_, _, address, _, buf, _) => if (address != blockManager.blockManagerId) { shuffleMetrics.incRemoteBytesRead(buf.size) if (buf.isInstanceOf[FileSegmentManagedBuffer]) { @@ -217,16 +230,18 @@ final class ShuffleBlockFetcherIterator( } } - private[this] def sendRequest(req: FetchRequest) { + private[this] def sendRequest(req: FetchRequest): Unit = { logDebug("Sending request for %d blocks (%s) from %s".format( req.blocks.size, Utils.bytesToString(req.size), req.address.hostPort)) bytesInFlight += req.size reqsInFlight += 1 - // so we can look up the size of each blockID - val sizeMap = req.blocks.map { case (blockId, size) => (blockId.toString, size) }.toMap - val remainingBlocks = new HashSet[String]() ++= sizeMap.keys - val blockIds = req.blocks.map(_._1.toString) + // so we can look up the block info of each blockID + val infoMap = req.blocks.map { + case FetchBlockInfo(blockId, size, mapIndex) => (blockId.toString, (size, mapIndex)) + }.toMap + val remainingBlocks = new HashSet[String]() ++= infoMap.keys + val blockIds = req.blocks.map(_.blockId.toString) val address = req.address val blockFetchingListener = new BlockFetchingListener { @@ -239,8 +254,8 @@ final class ShuffleBlockFetcherIterator( // This 
needs to be released after use. buf.retain() remainingBlocks -= blockId - results.put(new SuccessFetchResult(BlockId(blockId), address, sizeMap(blockId), buf, - remainingBlocks.isEmpty)) + results.put(new SuccessFetchResult(BlockId(blockId), infoMap(blockId)._2, + address, infoMap(blockId)._1, buf, remainingBlocks.isEmpty)) logDebug("remainingBlocks: " + remainingBlocks) } } @@ -249,7 +264,7 @@ final class ShuffleBlockFetcherIterator( override def onBlockFetchFailure(blockId: String, e: Throwable): Unit = { logError(s"Failed to get block(s) from ${req.address.host}:${req.address.port}", e) - results.put(new FailureFetchResult(BlockId(blockId), address, e)) + results.put(new FailureFetchResult(BlockId(blockId), infoMap(blockId)._2, address, e)) } } @@ -265,70 +280,179 @@ final class ShuffleBlockFetcherIterator( } } - private[this] def splitLocalRemoteBlocks(): ArrayBuffer[FetchRequest] = { - // Make remote requests at most maxBytesInFlight / 5 in length; the reason to keep them - // smaller than maxBytesInFlight is to allow multiple, parallel fetches from up to 5 - // nodes, rather than blocking on reading output from one node. - val targetRequestSize = math.max(maxBytesInFlight / 5, 1L) - logDebug("maxBytesInFlight: " + maxBytesInFlight + ", targetRequestSize: " + targetRequestSize - + ", maxBlocksInFlightPerAddress: " + maxBlocksInFlightPerAddress) - - // Split local and remote blocks. Remote blocks are further split into FetchRequests of size - // at most maxBytesInFlight in order to limit the amount of data in flight. - val remoteRequests = new ArrayBuffer[FetchRequest] + private[this] def partitionBlocksByFetchMode(): ArrayBuffer[FetchRequest] = { + logDebug(s"maxBytesInFlight: $maxBytesInFlight, targetRemoteRequestSize: " + + s"$targetRemoteRequestSize, maxBlocksInFlightPerAddress: $maxBlocksInFlightPerAddress") + + // Partition to local, host-local and remote blocks. 
Remote blocks are further split into + // FetchRequests of size at most maxBytesInFlight in order to limit the amount of data in flight + val collectedRemoteRequests = new ArrayBuffer[FetchRequest] var localBlockBytes = 0L + var hostLocalBlockBytes = 0L var remoteBlockBytes = 0L + var numRemoteBlocks = 0 + + val hostLocalDirReadingEnabled = + blockManager.hostLocalDirManager != null && blockManager.hostLocalDirManager.isDefined for ((address, blockInfos) <- blocksByAddress) { if (address.executorId == blockManager.blockManagerId.executorId) { - blockInfos.find(_._2 <= 0) match { - case Some((blockId, size)) if size < 0 => - throw new BlockException(blockId, "Negative block size " + size) - case Some((blockId, size)) if size == 0 => - throw new BlockException(blockId, "Zero-sized blocks should be excluded.") - case None => // do nothing. - } - localBlocks ++= blockInfos.map(_._1) - localBlockBytes += blockInfos.map(_._2).sum - numBlocksToFetch += localBlocks.size + checkBlockSizes(blockInfos) + val mergedBlockInfos = mergeContinuousShuffleBlockIdsIfNeeded( + blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)).to[ArrayBuffer]) + localBlocks ++= mergedBlockInfos.map(info => (info.blockId, info.mapIndex)) + localBlockBytes += mergedBlockInfos.map(_.size).sum + } else if (hostLocalDirReadingEnabled && address.host == blockManager.blockManagerId.host) { + checkBlockSizes(blockInfos) + val mergedBlockInfos = mergeContinuousShuffleBlockIdsIfNeeded( + blockInfos.map(info => FetchBlockInfo(info._1, info._2, info._3)).to[ArrayBuffer]) + val blocksForAddress = + mergedBlockInfos.map(info => (info.blockId, info.size, info.mapIndex)) + hostLocalBlocksByExecutor += address -> blocksForAddress + hostLocalBlocks ++= blocksForAddress.map(info => (info._1, info._3)) + hostLocalBlockBytes += mergedBlockInfos.map(_.size).sum } else { - val iterator = blockInfos.iterator - var curRequestSize = 0L - var curBlocks = new ArrayBuffer[(BlockId, Long)] - while (iterator.hasNext) 
{ - val (blockId, size) = iterator.next() - remoteBlockBytes += size - if (size < 0) { - throw new BlockException(blockId, "Negative block size " + size) - } else if (size == 0) { - throw new BlockException(blockId, "Zero-sized blocks should be excluded.") + numRemoteBlocks += blockInfos.size + remoteBlockBytes += blockInfos.map(_._2).sum + collectFetchRequests(address, blockInfos, collectedRemoteRequests) + } + } + val totalBytes = localBlockBytes + remoteBlockBytes + hostLocalBlockBytes + logInfo(s"Getting $numBlocksToFetch (${Utils.bytesToString(totalBytes)}) non-empty blocks " + + s"including ${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local and " + + s"${hostLocalBlocks.size} (${Utils.bytesToString(hostLocalBlockBytes)}) " + + s"host-local and $numRemoteBlocks (${Utils.bytesToString(remoteBlockBytes)}) remote blocks") + collectedRemoteRequests + } + + private def collectFetchRequests( + address: BlockManagerId, + blockInfos: Seq[(BlockId, Long, Int)], + collectedRemoteRequests: ArrayBuffer[FetchRequest]): Unit = { + val iterator = blockInfos.iterator + var curRequestSize = 0L + var curBlocks = new ArrayBuffer[FetchBlockInfo] + + def createFetchRequest(blocks: Seq[FetchBlockInfo]): Unit = { + collectedRemoteRequests += FetchRequest(address, blocks) + logDebug(s"Creating fetch request of $curRequestSize at $address " + + s"with ${blocks.size} blocks") + } + + def createFetchRequests(): Unit = { + val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks) + curBlocks = new ArrayBuffer[FetchBlockInfo] + if (mergedBlocks.length <= maxBlocksInFlightPerAddress) { + createFetchRequest(mergedBlocks) + } else { + mergedBlocks.grouped(maxBlocksInFlightPerAddress).foreach { blocks => + if (blocks.length == maxBlocksInFlightPerAddress) { + createFetchRequest(blocks) } else { - curBlocks += ((blockId, size)) - remoteBlocks += blockId - numBlocksToFetch += 1 - curRequestSize += size - } - if (curRequestSize >= targetRequestSize || - curBlocks.size 
>= maxBlocksInFlightPerAddress) { - // Add this FetchRequest - remoteRequests += new FetchRequest(address, curBlocks) - logDebug(s"Creating fetch request of $curRequestSize at $address " - + s"with ${curBlocks.size} blocks") - curBlocks = new ArrayBuffer[(BlockId, Long)] - curRequestSize = 0 + // The last group does not exceed `maxBlocksInFlightPerAddress`. Put it back + // to `curBlocks`. + curBlocks = blocks + numBlocksToFetch -= blocks.size } } - // Add in the final request - if (curBlocks.nonEmpty) { - remoteRequests += new FetchRequest(address, curBlocks) + } + curRequestSize = curBlocks.map(_.size).sum + } + + while (iterator.hasNext) { + val (blockId, size, mapIndex) = iterator.next() + assertPositiveBlockSize(blockId, size) + curBlocks += FetchBlockInfo(blockId, size, mapIndex) + curRequestSize += size + // For batch fetch, the actual block in flight should count for merged block. + val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= maxBlocksInFlightPerAddress + if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) { + createFetchRequests() + } + } + // Add in the final request + if (curBlocks.nonEmpty) { + createFetchRequests() + } + } + + private def assertPositiveBlockSize(blockId: BlockId, blockSize: Long): Unit = { + if (blockSize < 0) { + throw BlockException(blockId, "Negative block size " + size) + } else if (blockSize == 0) { + throw BlockException(blockId, "Zero-sized blocks should be excluded.") + } + } + + private def checkBlockSizes(blockInfos: Seq[(BlockId, Long, Int)]): Unit = { + blockInfos.foreach { case (blockId, size, _) => assertPositiveBlockSize(blockId, size) } + } + + private[this] def mergeContinuousShuffleBlockIdsIfNeeded( + blocks: ArrayBuffer[FetchBlockInfo]): ArrayBuffer[FetchBlockInfo] = { + val result = if (doBatchFetch) { + var curBlocks = new ArrayBuffer[FetchBlockInfo] + val mergedBlockInfo = new ArrayBuffer[FetchBlockInfo] + + def mergeFetchBlockInfo(toBeMerged: ArrayBuffer[FetchBlockInfo]): 
FetchBlockInfo = { + val startBlockId = toBeMerged.head.blockId.asInstanceOf[ShuffleBlockId] + + // The last merged block may comes from the input, and we can merge more blocks + // into it, if the map id is the same. + def shouldMergeIntoPreviousBatchBlockId = + mergedBlockInfo.last.blockId.asInstanceOf[ShuffleBlockBatchId].mapId == startBlockId.mapId + + val startReduceId = if (mergedBlockInfo.nonEmpty && shouldMergeIntoPreviousBatchBlockId) { + // Remove the previous batch block id as we will add a new one to replace it. + mergedBlockInfo.remove(mergedBlockInfo.length - 1).blockId + .asInstanceOf[ShuffleBlockBatchId].startReduceId + } else { + startBlockId.reduceId + } + + FetchBlockInfo( + ShuffleBlockBatchId( + startBlockId.shuffleId, + startBlockId.mapId, + startReduceId, + toBeMerged.last.blockId.asInstanceOf[ShuffleBlockId].reduceId + 1), + toBeMerged.map(_.size).sum, + toBeMerged.head.mapIndex) + } + + val iter = blocks.iterator + while (iter.hasNext) { + val info = iter.next() + // It's possible that the input block id is already a batch ID. For example, we merge some + // blocks, and then make fetch requests with the merged blocks according to "max blocks per + // request". The last fetch request may be too small, and we give up and put the remaining + // merged blocks back to the input list. 
+ if (info.blockId.isInstanceOf[ShuffleBlockBatchId]) { + mergedBlockInfo += info + } else { + if (curBlocks.isEmpty) { + curBlocks += info + } else { + val curBlockId = info.blockId.asInstanceOf[ShuffleBlockId] + val currentMapId = curBlocks.head.blockId.asInstanceOf[ShuffleBlockId].mapId + if (curBlockId.mapId != currentMapId) { + mergedBlockInfo += mergeFetchBlockInfo(curBlocks) + curBlocks.clear() + } + curBlocks += info + } } } + if (curBlocks.nonEmpty) { + mergedBlockInfo += mergeFetchBlockInfo(curBlocks) + } + mergedBlockInfo + } else { + blocks } - val totalBytes = localBlockBytes + remoteBlockBytes - logInfo(s"Getting $numBlocksToFetch (${Utils.bytesToString(totalBytes)}) non-empty blocks " + - s"including ${localBlocks.size} (${Utils.bytesToString(localBlockBytes)}) local blocks and " + - s"${remoteBlocks.size} (${Utils.bytesToString(remoteBlockBytes)}) remote blocks") - remoteRequests + // update metrics + numBlocksToFetch += result.size + result } /** @@ -336,34 +460,118 @@ final class ShuffleBlockFetcherIterator( * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we * track in-memory are the ManagedBuffer references themselves. */ - private[this] def fetchLocalBlocks() { + private[this] def fetchLocalBlocks(): Unit = { logDebug(s"Start fetching local blocks: ${localBlocks.mkString(", ")}") val iter = localBlocks.iterator while (iter.hasNext) { - val blockId = iter.next() + val (blockId, mapIndex) = iter.next() try { - val buf = blockManager.getBlockData(blockId) + val buf = blockManager.getLocalBlockData(blockId) shuffleMetrics.incLocalBlocksFetched(1) shuffleMetrics.incLocalBytesRead(buf.size) buf.retain() - results.put(new SuccessFetchResult(blockId, blockManager.blockManagerId, + results.put(new SuccessFetchResult(blockId, mapIndex, blockManager.blockManagerId, buf.size(), buf, false)) } catch { + // If we see an exception, stop immediately. case e: Exception => - // If we see an exception, stop immediately. 
- logError(s"Error occurred while fetching local blocks", e) - results.put(new FailureFetchResult(blockId, blockManager.blockManagerId, e)) + e match { + // ClosedByInterruptException is an excepted exception when kill task, + // don't log the exception stack trace to avoid confusing users. + // See: SPARK-28340 + case ce: ClosedByInterruptException => + logError("Error occurred while fetching local blocks, " + ce.getMessage) + case ex: Exception => logError("Error occurred while fetching local blocks", ex) + } + results.put(new FailureFetchResult(blockId, mapIndex, blockManager.blockManagerId, e)) return } } } + private[this] def fetchHostLocalBlock( + blockId: BlockId, + mapIndex: Int, + localDirs: Array[String], + blockManagerId: BlockManagerId): Boolean = { + try { + val buf = blockManager.getHostLocalShuffleData(blockId, localDirs) + buf.retain() + results.put(SuccessFetchResult(blockId, mapIndex, blockManagerId, buf.size(), buf, + isNetworkReqDone = false)) + true + } catch { + case e: Exception => + // If we see an exception, stop immediately. + logError(s"Error occurred while fetching local blocks", e) + results.put(FailureFetchResult(blockId, mapIndex, blockManagerId, e)) + false + } + } + + /** + * Fetch the host-local blocks while we are fetching remote blocks. This is ok because + * `ManagedBuffer`'s memory is allocated lazily when we create the input stream, so all we + * track in-memory are the ManagedBuffer references themselves. 
+ */ + private[this] def fetchHostLocalBlocks(hostLocalDirManager: HostLocalDirManager): Unit = { + val cachedDirsByExec = hostLocalDirManager.getCachedHostLocalDirs() + val (hostLocalBlocksWithCachedDirs, hostLocalBlocksWithMissingDirs) = + hostLocalBlocksByExecutor + .map { case (hostLocalBmId, bmInfos) => + (hostLocalBmId, bmInfos, cachedDirsByExec.get(hostLocalBmId.executorId)) + }.partition(_._3.isDefined) + val bmId = blockManager.blockManagerId + val immutableHostLocalBlocksWithoutDirs = + hostLocalBlocksWithMissingDirs.map { case (hostLocalBmId, bmInfos, _) => + hostLocalBmId -> bmInfos + }.toMap + if (immutableHostLocalBlocksWithoutDirs.nonEmpty) { + logDebug(s"Asynchronous fetching host-local blocks without cached executors' dir: " + + s"${immutableHostLocalBlocksWithoutDirs.mkString(", ")}") + val execIdsWithoutDirs = immutableHostLocalBlocksWithoutDirs.keys.map(_.executorId).toArray + hostLocalDirManager.getHostLocalDirs(execIdsWithoutDirs) { + case Success(dirs) => + immutableHostLocalBlocksWithoutDirs.foreach { case (hostLocalBmId, blockInfos) => + blockInfos.takeWhile { case (blockId, _, mapIndex) => + fetchHostLocalBlock( + blockId, + mapIndex, + dirs.get(hostLocalBmId.executorId), + hostLocalBmId) + } + } + logDebug(s"Got host-local blocks (without cached executors' dir) in " + + s"${Utils.getUsedTimeNs(startTimeNs)}") + + case Failure(throwable) => + logError(s"Error occurred while fetching host local blocks", throwable) + val (hostLocalBmId, blockInfoSeq) = immutableHostLocalBlocksWithoutDirs.head + val (blockId, _, mapIndex) = blockInfoSeq.head + results.put(FailureFetchResult(blockId, mapIndex, hostLocalBmId, throwable)) + } + } + if (hostLocalBlocksWithCachedDirs.nonEmpty) { + logDebug(s"Synchronous fetching host-local blocks with cached executors' dir: " + + s"${hostLocalBlocksWithCachedDirs.mkString(", ")}") + hostLocalBlocksWithCachedDirs.foreach { case (_, blockInfos, localDirs) => + blockInfos.foreach { case (blockId, _, mapIndex) => + if 
(!fetchHostLocalBlock(blockId, mapIndex, localDirs.get, bmId)) { + return + } + } + } + logDebug(s"Got host-local blocks (with cached executors' dir) in " + + s"${Utils.getUsedTimeNs(startTimeNs)}") + } + } + private[this] def initialize(): Unit = { // Add a task completion callback (called in both success case and failure case) to cleanup. context.addTaskCompletionListener(onCompleteCallback) - // Split local and remote blocks. - val remoteRequests = splitLocalRemoteBlocks() + // Partition blocks by the different fetch modes: local, host-local and remote blocks. + val remoteRequests = partitionBlocksByFetchMode() // Add the remote requests into our queue in a random order fetchRequests ++= Utils.randomize(remoteRequests) assert ((0 == reqsInFlight) == (0 == bytesInFlight), @@ -379,6 +587,10 @@ final class ShuffleBlockFetcherIterator( // Get Local Blocks fetchLocalBlocks() logDebug(s"Got local blocks in ${Utils.getUsedTimeNs(startTimeNs)}") + + if (hostLocalBlocks.nonEmpty) { + blockManager.hostLocalDirManager.foreach(fetchHostLocalBlocks) + } } override def hasNext: Boolean = numBlocksProcessed < numBlocksToFetch @@ -412,17 +624,20 @@ final class ShuffleBlockFetcherIterator( shuffleMetrics.incFetchWaitTime(fetchWaitTime) result match { - case r @ SuccessFetchResult(blockId, address, size, buf, isNetworkReqDone) => + case r @ SuccessFetchResult(blockId, mapIndex, address, size, buf, isNetworkReqDone) => if (address != blockManager.blockManagerId) { - numBlocksInFlightPerAddress(address) = numBlocksInFlightPerAddress(address) - 1 - shuffleMetrics.incRemoteBytesRead(buf.size) - if (buf.isInstanceOf[FileSegmentManagedBuffer]) { - shuffleMetrics.incRemoteBytesReadToDisk(buf.size) + if (hostLocalBlocks.contains(blockId -> mapIndex)) { + shuffleMetrics.incLocalBlocksFetched(1) + shuffleMetrics.incLocalBytesRead(buf.size) + } else { + numBlocksInFlightPerAddress(address) = numBlocksInFlightPerAddress(address) - 1 + shuffleMetrics.incRemoteBytesRead(buf.size) + if 
(buf.isInstanceOf[FileSegmentManagedBuffer]) { + shuffleMetrics.incRemoteBytesReadToDisk(buf.size) + } + shuffleMetrics.incRemoteBlocksFetched(1) + bytesInFlight -= size } - shuffleMetrics.incRemoteBlocksFetched(1) - } - if (!localBlocks.contains(blockId)) { - bytesInFlight -= size } if (isNetworkReqDone) { reqsInFlight -= 1 @@ -445,7 +660,7 @@ final class ShuffleBlockFetcherIterator( // since the last call. val msg = s"Received a zero-size buffer for block $blockId from $address " + s"(expectedApproxSize = $size, isNetworkReqDone=$isNetworkReqDone)" - throwFetchFailedException(blockId, address, new IOException(msg)) + throwFetchFailedException(blockId, mapIndex, address, new IOException(msg)) } val in = try { @@ -454,9 +669,14 @@ final class ShuffleBlockFetcherIterator( // The exception could only be throwed by local shuffle block case e: IOException => assert(buf.isInstanceOf[FileSegmentManagedBuffer]) - logError("Failed to create input stream from local block", e) + e match { + case ce: ClosedByInterruptException => + logError("Failed to create input stream from local block, " + + ce.getMessage) + case e: IOException => logError("Failed to create input stream from local block", e) + } buf.release() - throwFetchFailedException(blockId, address, e) + throwFetchFailedException(blockId, mapIndex, address, e) } try { input = streamWrapper(blockId, in) @@ -474,11 +694,12 @@ final class ShuffleBlockFetcherIterator( buf.release() if (buf.isInstanceOf[FileSegmentManagedBuffer] || corruptedBlocks.contains(blockId)) { - throwFetchFailedException(blockId, address, e) + throwFetchFailedException(blockId, mapIndex, address, e) } else { logWarning(s"got an corrupted block $blockId from $address, fetch again", e) corruptedBlocks += blockId - fetchRequests += FetchRequest(address, Array((blockId, size))) + fetchRequests += FetchRequest( + address, Array(FetchBlockInfo(blockId, size, mapIndex))) result = null } } finally { @@ -490,8 +711,8 @@ final class 
ShuffleBlockFetcherIterator( } } - case FailureFetchResult(blockId, address, e) => - throwFetchFailedException(blockId, address, e) + case FailureFetchResult(blockId, mapIndex, address, e) => + throwFetchFailedException(blockId, mapIndex, address, e) } // Send fetch requests up to maxBytesInFlight @@ -504,6 +725,7 @@ final class ShuffleBlockFetcherIterator( input, this, currentResult.blockId, + currentResult.mapIndex, currentResult.address, detectCorrupt && streamCompressedOrEncrypted)) } @@ -570,11 +792,14 @@ final class ShuffleBlockFetcherIterator( private[storage] def throwFetchFailedException( blockId: BlockId, + mapIndex: Int, address: BlockManagerId, e: Throwable) = { blockId match { case ShuffleBlockId(shufId, mapId, reduceId) => - throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) + throw new FetchFailedException(address, shufId, mapId, mapIndex, reduceId, e) + case ShuffleBlockBatchId(shuffleId, mapId, startReduceId, _) => + throw new FetchFailedException(address, shuffleId, mapId, mapIndex, startReduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) @@ -591,6 +816,7 @@ private class BufferReleasingInputStream( private[storage] val delegate: InputStream, private val iterator: ShuffleBlockFetcherIterator, private val blockId: BlockId, + private val mapIndex: Int, private val address: BlockManagerId, private val detectCorruption: Boolean) extends InputStream { @@ -602,7 +828,7 @@ private class BufferReleasingInputStream( } catch { case e: IOException if detectCorruption => IOUtils.closeQuietly(this) - iterator.throwFetchFailedException(blockId, address, e) + iterator.throwFetchFailedException(blockId, mapIndex, address, e) } } @@ -624,7 +850,7 @@ private class BufferReleasingInputStream( } catch { case e: IOException if detectCorruption => IOUtils.closeQuietly(this) - iterator.throwFetchFailedException(blockId, address, e) + 
iterator.throwFetchFailedException(blockId, mapIndex, address, e) } } @@ -636,7 +862,7 @@ private class BufferReleasingInputStream( } catch { case e: IOException if detectCorruption => IOUtils.closeQuietly(this) - iterator.throwFetchFailedException(blockId, address, e) + iterator.throwFetchFailedException(blockId, mapIndex, address, e) } } @@ -646,7 +872,7 @@ private class BufferReleasingInputStream( } catch { case e: IOException if detectCorruption => IOUtils.closeQuietly(this) - iterator.throwFetchFailedException(blockId, address, e) + iterator.throwFetchFailedException(blockId, mapIndex, address, e) } } @@ -677,14 +903,25 @@ private class ShuffleFetchCompletionListener(var data: ShuffleBlockFetcherIterat private[storage] object ShuffleBlockFetcherIterator { + /** + * The block information to fetch used in FetchRequest. + * @param blockId block id + * @param size estimated size of the block. Note that this is NOT the exact bytes. + * Size of remote block is used to calculate bytesInFlight. + * @param mapIndex the mapIndex for this block, which indicate the index in the map stage. + */ + private[storage] case class FetchBlockInfo( + blockId: BlockId, + size: Long, + mapIndex: Int) + /** * A request to fetch blocks from a remote BlockManager. * @param address remote BlockManager to fetch from. - * @param blocks Sequence of tuple, where the first element is the block id, - * and the second element is the estimated size, used to calculate bytesInFlight. + * @param blocks Sequence of the information for blocks to fetch from the same address. */ - case class FetchRequest(address: BlockManagerId, blocks: Seq[(BlockId, Long)]) { - val size = blocks.map(_._2).sum + case class FetchRequest(address: BlockManagerId, blocks: Seq[FetchBlockInfo]) { + val size = blocks.map(_.size).sum } /** @@ -698,6 +935,7 @@ object ShuffleBlockFetcherIterator { /** * Result of a fetch from a remote block successfully. 
* @param blockId block id + * @param mapIndex the mapIndex for this block, which indicate the index in the map stage. * @param address BlockManager that the block was fetched from. * @param size estimated size of the block. Note that this is NOT the exact bytes. * Size of remote block is used to calculate bytesInFlight. @@ -706,6 +944,7 @@ object ShuffleBlockFetcherIterator { */ private[storage] case class SuccessFetchResult( blockId: BlockId, + mapIndex: Int, address: BlockManagerId, size: Long, buf: ManagedBuffer, @@ -717,11 +956,13 @@ object ShuffleBlockFetcherIterator { /** * Result of a fetch from a remote block unsuccessfully. * @param blockId block id + * @param mapIndex the mapIndex for this block, which indicate the index in the map stage * @param address BlockManager that the block was attempted to be fetched from * @param e the failure exception */ private[storage] case class FailureFetchResult( blockId: BlockId, + mapIndex: Int, address: BlockManagerId, e: Throwable) extends FetchResult diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index f36b31c65a63d..d3a061fae746f 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -48,7 +48,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { // Schedule a refresh thread to run periodically private val timer = new Timer("refresh progress", true) timer.schedule(new TimerTask{ - override def run() { + override def run(): Unit = { refresh() } }, firstDelayMSec, updatePeriodMSec) @@ -73,7 +73,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { * after your last output, keeps overwriting itself to hold in one line. The logging will follow * the progress bar, then progress bar will be showed in next line without overwrite logs. 
*/ - private def show(now: Long, stages: Seq[StageData]) { + private def show(now: Long, stages: Seq[StageData]): Unit = { val width = TerminalWidth / stages.size val bar = stages.map { s => val total = s.numTasks @@ -94,7 +94,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { // only refresh if it's changed OR after 1 minute (or the ssh connection will be closed // after idle some time) if (bar != lastProgressBar || now - lastUpdateTime > 60 * 1000L) { - System.err.print(CR + bar) + System.err.print(CR + bar + CR) lastUpdateTime = now } lastProgressBar = bar @@ -103,7 +103,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { /** * Clear the progress bar if showed. */ - private def clear() { + private def clear(): Unit = { if (!lastProgressBar.isEmpty) { System.err.printf(CR + " " * TerminalWidth + CR) lastProgressBar = "" diff --git a/core/src/main/scala/org/apache/spark/ui/GraphUIData.scala b/core/src/main/scala/org/apache/spark/ui/GraphUIData.scala new file mode 100644 index 0000000000000..87ff677514461 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/ui/GraphUIData.scala @@ -0,0 +1,169 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ui + +import java.{util => ju} +import java.lang.{Long => JLong} + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer +import scala.xml.{Node, Unparsed} + +/** + * A helper class to generate JavaScript and HTML for both timeline and histogram graphs. + * + * @param timelineDivId the timeline `id` used in the html `div` tag + * @param histogramDivId the timeline `id` used in the html `div` tag + * @param data the data for the graph + * @param minX the min value of X axis + * @param maxX the max value of X axis + * @param minY the min value of Y axis + * @param maxY the max value of Y axis + * @param unitY the unit of Y axis + * @param batchInterval if `batchInterval` is not None, we will draw a line for `batchInterval` in + * the graph + */ +private[spark] class GraphUIData( + timelineDivId: String, + histogramDivId: String, + data: Seq[(Long, Double)], + minX: Long, + maxX: Long, + minY: Double, + maxY: Double, + unitY: String, + batchInterval: Option[Double] = None) { + + private var dataJavaScriptName: String = _ + + def generateDataJs(jsCollector: JsCollector): Unit = { + val jsForData = data.map { case (x, y) => + s"""{"x": $x, "y": $y}""" + }.mkString("[", ",", "]") + dataJavaScriptName = jsCollector.nextVariableName + jsCollector.addPreparedStatement(s"var $dataJavaScriptName = $jsForData;") + } + + def generateTimelineHtml(jsCollector: JsCollector): Seq[Node] = { + jsCollector.addPreparedStatement(s"registerTimeline($minY, $maxY);") + if (batchInterval.isDefined) { + jsCollector.addStatement( + "drawTimeline(" + + s"'#$timelineDivId', $dataJavaScriptName, $minX, $maxX, $minY, $maxY, '$unitY'," + + s" ${batchInterval.get}" + + ");") + } else { + jsCollector.addStatement( + s"drawTimeline('#$timelineDivId', $dataJavaScriptName, $minX, $maxX, $minY, $maxY," + + s" '$unitY');") + } +
    + } + + def generateHistogramHtml(jsCollector: JsCollector): Seq[Node] = { + val histogramData = s"$dataJavaScriptName.map(function(d) { return d.y; })" + jsCollector.addPreparedStatement(s"registerHistogram($histogramData, $minY, $maxY);") + if (batchInterval.isDefined) { + jsCollector.addStatement( + "drawHistogram(" + + s"'#$histogramDivId', $histogramData, $minY, $maxY, '$unitY', ${batchInterval.get}" + + ");") + } else { + jsCollector.addStatement( + s"drawHistogram('#$histogramDivId', $histogramData, $minY, $maxY, '$unitY');") + } +
    + } + + def generateAreaStackHtmlWithData( + jsCollector: JsCollector, + values: Array[(Long, ju.Map[String, JLong])]): Seq[Node] = { + val operationLabels = values.flatMap(_._2.keySet().asScala).toSet + val durationDataPadding = UIUtils.durationDataPadding(values) + val jsForData = durationDataPadding.map { case (x, y) => + val s = y.toSeq.sortBy(_._1).map(e => s""""${e._1}": "${e._2}"""").mkString(",") + s"""{x: "${UIUtils.formatBatchTime(x, 1, showYYYYMMSS = false)}", $s}""" + }.mkString("[", ",", "]") + val jsForLabels = operationLabels.toSeq.sorted.mkString("[\"", "\",\"", "\"]") + + val (maxX, minX, maxY, minY) = if (values != null && values.length > 0) { + val xValues = values.map(_._1.toLong) + val yValues = values.map(_._2.asScala.toSeq.map(_._2.toLong).sum) + (xValues.max, xValues.min, yValues.max, yValues.min) + } else { + (0L, 0L, 0L, 0L) + } + + dataJavaScriptName = jsCollector.nextVariableName + jsCollector.addPreparedStatement(s"var $dataJavaScriptName = $jsForData;") + val labels = jsCollector.nextVariableName + jsCollector.addPreparedStatement(s"var $labels = $jsForLabels;") + jsCollector.addStatement( + s"drawAreaStack('#$timelineDivId', $labels, $dataJavaScriptName, $minX, $maxX, $minY, $maxY)") +
    + } +} + +/** + * A helper class that allows the user to add JavaScript statements which will be executed when the + * DOM has finished loading. + */ +private[spark] class JsCollector { + + private var variableId = 0 + + /** + * Return the next unused JavaScript variable name + */ + def nextVariableName: String = { + variableId += 1 + "v" + variableId + } + + /** + * JavaScript statements that will execute before `statements` + */ + private val preparedStatements = ArrayBuffer[String]() + + /** + * JavaScript statements that will execute after `preparedStatements` + */ + private val statements = ArrayBuffer[String]() + + def addPreparedStatement(js: String): Unit = { + preparedStatements += js + } + + def addStatement(js: String): Unit = { + statements += js + } + + /** + * Generate a html snippet that will execute all scripts when the DOM has finished loading. + */ + def toHtml: Seq[Node] = { + val js = + s""" + |$$(document).ready(function() { + | ${preparedStatements.mkString("\n")} + | ${statements.mkString("\n")} + |});""".stripMargin + + + } +} diff --git a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala index ff7baf4d9419b..94c99d48e773c 100644 --- a/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/JettyUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.ui import java.net.{URI, URL} import java.util.EnumSet import javax.servlet.DispatcherType -import javax.servlet.http.{HttpServlet, HttpServletRequest, HttpServletResponse} +import javax.servlet.http._ import scala.language.implicitConversions import scala.xml.Node @@ -73,7 +73,7 @@ private[spark] object JettyUtils extends Logging { servletParams: ServletParams[T], conf: SparkConf): HttpServlet = { new HttpServlet { - override def doGet(request: HttpServletRequest, response: HttpServletResponse) { + override def doGet(request: HttpServletRequest, response: HttpServletResponse): 
Unit = { try { response.setContentType("%s;charset=utf-8".format(servletParams.contentType)) response.setStatus(HttpServletResponse.SC_OK) @@ -259,7 +259,15 @@ private[spark] object JettyUtils extends Logging { server.addBean(errorHandler) val collection = new ContextHandlerCollection - server.setHandler(collection) + conf.get(PROXY_REDIRECT_URI) match { + case Some(proxyUri) => + val proxyHandler = new ProxyRedirectHandler(proxyUri) + proxyHandler.setHandler(collection) + server.setHandler(proxyHandler) + + case _ => + server.setHandler(collection) + } // Executor used to create daemon threads for the Jetty connectors. val serverExecutor = new ScheduledExecutorScheduler(s"$serverName-JettyScheduler", true) @@ -526,3 +534,51 @@ private[spark] case class ServerInfo( } } + +/** + * A Jetty handler to handle redirects to a proxy server. It intercepts redirects and rewrites the + * location to point to the proxy server. + * + * The handler needs to be set as the server's handler, because Jetty sometimes generates redirects + * before invoking any servlet handlers or filters. One of such cases is when asking for the root of + * a servlet context without the trailing slash (e.g. "/jobs") - Jetty will send a redirect to the + * same URL, but with a trailing slash. 
+ */ +private class ProxyRedirectHandler(_proxyUri: String) extends HandlerWrapper { + + private val proxyUri = _proxyUri.stripSuffix("/") + + override def handle( + target: String, + baseRequest: Request, + request: HttpServletRequest, + response: HttpServletResponse): Unit = { + super.handle(target, baseRequest, request, new ResponseWrapper(request, response)) + } + + private class ResponseWrapper( + req: HttpServletRequest, + res: HttpServletResponse) + extends HttpServletResponseWrapper(res) { + + override def sendRedirect(location: String): Unit = { + val newTarget = if (location != null) { + val target = new URI(location) + val path = if (target.getPath().startsWith("/")) { + target.getPath() + } else { + req.getRequestURI().stripSuffix("/") + "/" + target.getPath() + } + // The target path should already be encoded, so don't re-encode it, just the + // proxy address part. + val proxyBase = UIUtils.uiRoot(req) + val proxyPrefix = if (proxyBase.nonEmpty) s"$proxyUri$proxyBase" else proxyUri + s"${res.encodeURL(proxyPrefix)}${target.getPath()}" + } else { + null + } + super.sendRedirect(newTarget) + } + } + +} diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index 6fb8e458a789c..8ae9828c3fee1 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -66,6 +66,9 @@ private[spark] class SparkUI private ( addStaticHandler(SparkUI.STATIC_RESOURCE_DIR) attachHandler(createRedirectHandler("/", "/jobs/", basePath = basePath)) attachHandler(ApiRootResource.getServletHandler(this)) + if (sc.map(_.conf.get(UI_PROMETHEUS_ENABLED)).getOrElse(false)) { + attachHandler(PrometheusResource.getServletHandler(this)) + } // These should be POST only, but, the YARN AM proxy won't proxy POSTs attachHandler(createRedirectHandler( @@ -94,7 +97,7 @@ private[spark] class SparkUI private ( } /** Stop the server behind this web interface. 
Only valid after bind(). */ - override def stop() { + override def stop(): Unit = { super.stop() logInfo(s"Stopped Spark web UI at $webUrl") } diff --git a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala index 766cc65084f07..aefd001e573f9 100644 --- a/core/src/main/scala/org/apache/spark/ui/ToolTips.scala +++ b/core/src/main/scala/org/apache/spark/ui/ToolTips.scala @@ -31,9 +31,9 @@ private[spark] object ToolTips { val SHUFFLE_READ_BLOCKED_TIME = "Time that the task spent blocked waiting for shuffle data to be read from remote machines." - val INPUT = "Bytes and records read from Hadoop or from Spark storage." + val INPUT = "Bytes read from Hadoop or from Spark storage." - val OUTPUT = "Bytes and records written to Hadoop." + val OUTPUT = "Bytes written to Hadoop." val STORAGE_MEMORY = "Memory used / total available memory for storage of data " + @@ -99,4 +99,7 @@ private[spark] object ToolTips { dynamic allocation is enabled. The number of granted executors may exceed the limit ephemerally when executors are being killed. """ + + val DURATION = + "Elapsed time since the stage was submitted until execution completion of all its tasks." 
} diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 70e24bd0e7ecd..94c45215b5ff2 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -17,6 +17,8 @@ package org.apache.spark.ui +import java.{util => ju} +import java.lang.{Long => JLong} import java.net.URLDecoder import java.nio.charset.StandardCharsets.UTF_8 import java.text.SimpleDateFormat @@ -24,6 +26,7 @@ import java.util.{Date, Locale, TimeZone} import javax.servlet.http.HttpServletRequest import javax.ws.rs.core.{MediaType, Response} +import scala.collection.JavaConverters._ import scala.util.control.NonFatal import scala.xml._ import scala.xml.transform.{RewriteRule, RuleTransformer} @@ -119,6 +122,59 @@ private[spark] object UIUtils extends Logging { } } + // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. + private val batchTimeFormat = new ThreadLocal[SimpleDateFormat]() { + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US) + } + + private val batchTimeFormatWithMilliseconds = new ThreadLocal[SimpleDateFormat]() { + override def initialValue(): SimpleDateFormat = + new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS", Locale.US) + } + + /** + * If `batchInterval` is less than 1 second, format `batchTime` with milliseconds. Otherwise, + * format `batchTime` without milliseconds. + * + * @param batchTime the batch time to be formatted + * @param batchInterval the batch interval + * @param showYYYYMMSS if showing the `yyyy/MM/dd` part. 
If it's false, the return value wll be + * only `HH:mm:ss` or `HH:mm:ss.SSS` depending on `batchInterval` + * @param timezone only for test + */ + def formatBatchTime( + batchTime: Long, + batchInterval: Long, + showYYYYMMSS: Boolean = true, + timezone: TimeZone = null): String = { + val oldTimezones = + (batchTimeFormat.get.getTimeZone, batchTimeFormatWithMilliseconds.get.getTimeZone) + if (timezone != null) { + batchTimeFormat.get.setTimeZone(timezone) + batchTimeFormatWithMilliseconds.get.setTimeZone(timezone) + } + try { + val formattedBatchTime = + if (batchInterval < 1000) { + batchTimeFormatWithMilliseconds.get.format(batchTime) + } else { + // If batchInterval >= 1 second, don't show milliseconds + batchTimeFormat.get.format(batchTime) + } + if (showYYYYMMSS) { + formattedBatchTime + } else { + formattedBatchTime.substring(formattedBatchTime.indexOf(' ') + 1) + } + } finally { + if (timezone != null) { + batchTimeFormat.get.setTimeZone(oldTimezones._1) + batchTimeFormatWithMilliseconds.get.setTimeZone(oldTimezones._2) + } + } + } + /** Generate a human-readable string representing a number (e.g. 100 K) */ def formatNumber(records: Double): String = { val trillion = 1e12 @@ -227,7 +283,7 @@ private[spark] object UIUtils extends Logging { {tab.name} } - val helpButton: Seq[Node] = helpText.map(tooltip(_, "bottom")).getOrElse(Seq.empty) + val helpButton: Seq[Node] = helpText.map(tooltip(_, "top")).getOrElse(Seq.empty) @@ -309,9 +365,13 @@ private[spark] object UIUtils extends Logging { data: Iterable[T], fixedWidth: Boolean = false, id: Option[String] = None, + // When headerClasses is not empty, it should have the same length as headers parameter headerClasses: Seq[String] = Seq.empty, stripeRowsWithCss: Boolean = true, - sortable: Boolean = true): Seq[Node] = { + sortable: Boolean = true, + // The tooltip information could be None, which indicates header does not have a tooltip. 
+ // When tooltipHeaders is not empty, it should have the same length as headers parameter + tooltipHeaders: Seq[Option[String]] = Seq.empty): Seq[Node] = { val listingTableClass = { val _tableClass = if (stripeRowsWithCss) TABLE_CLASS_STRIPED else TABLE_CLASS_NOT_STRIPED @@ -332,6 +392,14 @@ private[spark] object UIUtils extends Logging { } } + def getTooltip(index: Int): Option[String] = { + if (index < tooltipHeaders.size) { + tooltipHeaders(index) + } else { + None + } + } + val newlinesInHeader = headers.exists(_.contains("\n")) def getHeaderContent(header: String): Seq[Node] = { if (newlinesInHeader) { @@ -345,7 +413,15 @@ private[spark] object UIUtils extends Logging { val headerRow: Seq[Node] = { headers.view.zipWithIndex.map { x => -
    + getTooltip(x._2) match { + case Some(tooltip) => + + case None => + } } }
    - Executor ID - AddressStatus - RDD BlocksExecutor IDAddressStatusRDD Blocks @@ -90,13 +87,13 @@

    Executors

    Off Heap Storage Memory
    Disk UsedCoresResourcesActive TasksFailed TasksComplete TasksTotal TasksDisk UsedCoresResourcesActive TasksFailed TasksComplete TasksTotal Tasks @@ -110,14 +107,11 @@

    Executors

    title="Total shuffle bytes and records read (includes both data read locally and data read from remote executors)."> Shuffle Read
    - - Shuffle WriteLogsThread DumpLogsThread Dump
    {formatResourcesAddresses(driver.resources)} {driver.desc.command.arguments(2)}{UIUtils.formatDuration(System.currentTimeMillis() - driver.startTime)}
    {getHeaderContent(x._1)} + + {getHeaderContent(x._1)} + + {getHeaderContent(x._1)}
    @@ -408,7 +484,7 @@ private[spark] object UIUtils extends Logging { class="expand-dag-viz" onclick={s"toggleDagViz($forJob);"}> + data-placement="top"> DAG Visualization @@ -552,4 +628,39 @@ private[spark] object UIUtils extends Logging { def buildErrorResponse(status: Response.Status, msg: String): Response = { Response.status(status).entity(msg).`type`(MediaType.TEXT_PLAIN).build() } + + /** + * There may be different duration labels in each batch. So we need to + * mark those missing duration label as '0d' to avoid UI rending error. + */ + def durationDataPadding( + values: Array[(Long, ju.Map[String, JLong])]): Array[(Long, Map[String, Double])] = { + val operationLabels = values.flatMap(_._2.keySet().asScala).toSet + values.map { case (xValue, yValue) => + val dataPadding = operationLabels.map { opLabel => + if (yValue.containsKey(opLabel)) { + (opLabel, yValue.get(opLabel).toDouble) + } else { + (opLabel, 0d) + } + } + (xValue, dataPadding.toMap) + } + } + + def detailsUINode(isMultiline: Boolean, message: String): Seq[Node] = { + if (isMultiline) { + // scalastyle:off + + +details + ++ + + // scalastyle:on + } else { + Seq.empty[Node] + } + } } diff --git a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala index 8845dcf48a844..ca111a8d00a64 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIWorkloadGenerator.scala @@ -37,7 +37,7 @@ private[spark] object UIWorkloadGenerator { val NUM_PARTITIONS = 100 val INTER_JOB_WAIT_MS = 5000 - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { if (args.length < 3) { // scalastyle:off println println( @@ -98,7 +98,7 @@ private[spark] object UIWorkloadGenerator { (1 to nJobSet).foreach { _ => for ((desc, job) <- jobs) { new Thread { - override def run() { + override def run(): Unit = { // scalastyle:off println try { setProperties(desc) 
diff --git a/core/src/main/scala/org/apache/spark/ui/WebUI.scala b/core/src/main/scala/org/apache/spark/ui/WebUI.scala index 1fe822a0e3b57..9faa3dcf2cdf2 100644 --- a/core/src/main/scala/org/apache/spark/ui/WebUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/WebUI.scala @@ -184,7 +184,7 @@ private[spark] abstract class WebUITab(parent: WebUI, val prefix: String) { val name = prefix.capitalize /** Attach a page to this tab. This prepends the page's prefix with the tab's own prefix. */ - def attachPage(page: WebUIPage) { + def attachPage(page: WebUIPage): Unit = { page.prefix = (prefix + "/" + page.prefix).stripSuffix("/") pages += page } @@ -236,4 +236,8 @@ private[spark] class DelegatingServletContextHandler(handler: ServletContextHand def filterCount(): Int = { handler.getServletHandler.getFilters.length } + + def getContextPath(): String = { + handler.getContextPath + } } diff --git a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala index 76537afd81ce0..c6eb461ad601c 100644 --- a/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/env/EnvironmentPage.scala @@ -39,15 +39,20 @@ private[ui] class EnvironmentPage( "Scala Version" -> appEnv.runtime.scalaVersion) val runtimeInformationTable = UIUtils.listingTable( - propertyHeader, jvmRow, jvmInformation.toSeq.sorted, fixedWidth = true) + propertyHeader, jvmRow, jvmInformation.toSeq.sorted, fixedWidth = true, + headerClasses = headerClasses) val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, - Utils.redact(conf, appEnv.sparkProperties.sorted), fixedWidth = true) + Utils.redact(conf, appEnv.sparkProperties.sorted), fixedWidth = true, + headerClasses = headerClasses) val hadoopPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, - Utils.redact(conf, appEnv.hadoopProperties.sorted), fixedWidth = true) + Utils.redact(conf, 
appEnv.hadoopProperties.sorted), fixedWidth = true, + headerClasses = headerClasses) val systemPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow, - Utils.redact(conf, appEnv.systemProperties.sorted), fixedWidth = true) + Utils.redact(conf, appEnv.systemProperties.sorted), fixedWidth = true, + headerClasses = headerClasses) val classpathEntriesTable = UIUtils.listingTable( - classPathHeaders, classPathRow, appEnv.classpathEntries.sorted, fixedWidth = true) + classPathHeader, classPathRow, appEnv.classpathEntries.sorted, fixedWidth = true, + headerClasses = headerClasses) val content = private def propertyRow(kv: (String, String)) = private def classPathRow(data: (String, String)) = diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala index a13037b5e24db..77564f48015f1 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorThreadDumpPage.scala @@ -89,7 +89,12 @@ private[ui] class ExecutorThreadDumpPage( - + {dumpRows}
    {kv._1}{kv._2}
    {kv._1}{kv._2}
    {data._1}{data._2}
    Thread ID Thread Name Thread StateThread Locks + + Thread Locks + +
    diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 11fcbf1c29c05..f53e67ff5cc98 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -71,7 +71,10 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We val jobId = job.jobId val status = job.status val (_, lastStageDescription) = lastStageNameAndDescription(store, job) - val jobDescription = UIUtils.makeDescription(lastStageDescription, "", plainText = true).text + val jobDescription = UIUtils.makeDescription( + job.description.getOrElse(lastStageDescription), + "", + plainText = true).text val submissionTime = job.submissionTime.get.getTime() val completionTime = job.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) @@ -123,7 +126,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We | 'group': 'executors', | 'start': new Date(${e.addTime.getTime()}), | 'content': '
    Executor ${e.id} added
    ' @@ -139,7 +142,7 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We | 'group': 'executors', | 'start': new Date(${removeTime.getTime()}), | 'content': '
    - + Event Timeline ++ @@ -449,7 +452,11 @@ private[ui] class JobDataSource( val formattedSubmissionTime = submissionTime.map(UIUtils.formatDate).getOrElse("Unknown") val (lastStageName, lastStageDescription) = lastStageNameAndDescription(store, jobData) - val jobDescription = UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) + val jobDescription = + UIUtils.makeDescription( + jobData.description.getOrElse(lastStageDescription), + basePath, + plainText = false) val detailUrl = "%s/jobs/job/?id=%s".format(basePath, jobData.jobId) @@ -541,12 +548,15 @@ private[ui] class JobPagedTable( override def headers: Seq[Node] = { // Information for each header: title, cssClass, and sortable - val jobHeadersAndCssClasses: Seq[(String, String, Boolean)] = + val jobHeadersAndCssClasses: Seq[(String, String, Boolean, Option[String])] = Seq( - (jobIdTitle, "", true), - ("Description", "", true), ("Submitted", "", true), ("Duration", "", true), - ("Stages: Succeeded/Total", "", false), - ("Tasks (for all stages): Succeeded/Total", "", false) + (jobIdTitle, "", true, None), + ("Description", "", true, None), + ("Submitted", "", true, None), + ("Duration", "", true, Some("Elapsed time since the job was submitted " + + "until execution completion of all its stages.")), + ("Stages: Succeeded/Total", "", false, None), + ("Tasks (for all stages): Succeeded/Total", "", false, None) ) if (!jobHeadersAndCssClasses.filter(_._3).map(_._1).contains(sortColumn)) { @@ -554,7 +564,7 @@ private[ui] class JobPagedTable( } val headerRow: Seq[Node] = { - jobHeadersAndCssClasses.map { case (header, cssClass, sortable) => + jobHeadersAndCssClasses.map { case (header, cssClass, sortable, tooltip) => if (header == sortColumn) { val headerLink = Unparsed( parameterPath + @@ -566,9 +576,17 @@ private[ui] class JobPagedTable( - {header} -  {Unparsed(arrow)} - + { + if (tooltip.nonEmpty) { + + {header} {Unparsed(arrow)} + + } else { + + {header} {Unparsed(arrow)} + + } + } } else { 
@@ -581,12 +599,32 @@ private[ui] class JobPagedTable( - {header} - + { + if (tooltip.nonEmpty) { + + {header} + + } else { + + {header} + + } + } + } else { - {header} + { + if (tooltip.nonEmpty) { + + {header} + + } else { + + {header} + + } + } } } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala index f672ce0ec6a68..d8a93adbbe90a 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala @@ -30,7 +30,6 @@ import org.apache.spark.ui.{UIUtils, WebUIPage} private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { private val sc = parent.sc private val subPath = "stages" - private def isFairScheduler = parent.isFairScheduler def render(request: HttpServletRequest): Seq[Node] = { // For now, pool information is only accessible in live UIs @@ -57,7 +56,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
    - val poolsDescription = if (sc.isDefined && isFairScheduler) { + val poolsDescription = if (parent.isFairScheduler) {

    diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index f7aca507d6f93..12f1aa25e8d2a 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -104,7 +104,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP | 'group': 'executors', | 'start': new Date(${e.addTime.getTime()}), | 'content': '
    Executor ${e.id} added
    ' @@ -120,7 +120,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP | 'group': 'executors', | 'start': new Date(${removeTime.getTime()}), | 'content': '
    - + Event Timeline ++ diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala index c2644a8eea157..dc3106400dd2b 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala @@ -32,7 +32,9 @@ private[ui] class JobsTab(parent: SparkUI, store: AppStatusStore) val sc = parent.sc val killEnabled = parent.killEnabled + // Show pool information for only live UI. def isFairScheduler: Boolean = { + sc.isDefined && store .environmentInfo() .sparkProperties diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala index 683cfa582877d..7b90baad6d8d3 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/PoolTable.scala @@ -34,11 +34,17 @@ private[ui] class PoolTable(pools: Map[Schedulable, PoolData], parent: StagesTab - - + + - + {pools.map { case (s, p) => poolRow(request, s, p) }} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index fce05e8a42fda..ccaa70b9daae0 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -288,10 +288,10 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val executorOverhead = serializationTime + deserializationTime val executorRunTime = if (taskInfo.duration.isDefined) { - totalExecutionTime - executorOverhead - gettingResultTime + math.max(totalExecutionTime - executorOverhead - gettingResultTime - schedulerDelay, 0) } else { metricsOpt.map(_.executorRunTime).getOrElse( - totalExecutionTime - executorOverhead - gettingResultTime) + math.max(totalExecutionTime - executorOverhead - gettingResultTime - schedulerDelay, 0)) } val 
executorComputingTime = executorRunTime - shuffleReadTime - shuffleWriteTime val executorComputingTimeProportion = @@ -721,19 +721,7 @@ private[ui] class TaskPagedTable( } else { error }) - val details = if (isMultiline) { - // scalastyle:off - - +details - ++ - - // scalastyle:on - } else { - "" - } + val details = UIUtils.detailsUINode(isMultiline, error) } } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala index e24b2f2ec36db..a7d38e9b04b70 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StageTable.scala @@ -179,18 +179,20 @@ private[ui] class StagePagedTable( // stageHeadersAndCssClasses has three parts: header title, tooltip information, and sortable. // The tooltip information could be None, which indicates it does not have a tooltip. // Otherwise, it has two parts: tooltip text, and position (true for left, false for default). 
- val stageHeadersAndCssClasses: Seq[(String, Option[(String, Boolean)], Boolean)] = - Seq(("Stage Id", None, true)) ++ - {if (isFairScheduler) {Seq(("Pool Name", None, true))} else Seq.empty} ++ + val stageHeadersAndCssClasses: Seq[(String, String, Boolean)] = + Seq(("Stage Id", null, true)) ++ + {if (isFairScheduler) {Seq(("Pool Name", null, true))} else Seq.empty} ++ Seq( - ("Description", None, true), ("Submitted", None, true), ("Duration", None, true), - ("Tasks: Succeeded/Total", None, false), - ("Input", Some((ToolTips.INPUT, false)), true), - ("Output", Some((ToolTips.OUTPUT, false)), true), - ("Shuffle Read", Some((ToolTips.SHUFFLE_READ, false)), true), - ("Shuffle Write", Some((ToolTips.SHUFFLE_WRITE, true)), true) + ("Description", null, true), + ("Submitted", null, true), + ("Duration", ToolTips.DURATION, true), + ("Tasks: Succeeded/Total", null, false), + ("Input", ToolTips.INPUT, true), + ("Output", ToolTips.OUTPUT, true), + ("Shuffle Read", ToolTips.SHUFFLE_READ, true), + ("Shuffle Write", ToolTips.SHUFFLE_WRITE, true) ) ++ - {if (isFailedStage) {Seq(("Failure Reason", None, false))} else Seq.empty} + {if (isFailedStage) {Seq(("Failure Reason", null, false))} else Seq.empty} if (!stageHeadersAndCssClasses.filter(_._3).map(_._1).contains(sortColumn)) { throw new IllegalArgumentException(s"Unknown column: $sortColumn") @@ -198,22 +200,13 @@ private[ui] class StagePagedTable( val headerRow: Seq[Node] = { stageHeadersAndCssClasses.map { case (header, tooltip, sortable) => - val headerSpan = tooltip.map { case (title, left) => - if (left) { - /* Place the shuffle write tooltip on the left (rather than the default position - of on top) because the shuffle write column is the last column on the right side and - the tooltip is wider than the column, so it doesn't fit on top. 
*/ - + val headerSpan = if (null != tooltip && !tooltip.isEmpty) { + {header} - } else { - - {header} - - } - }.getOrElse( + } else { {header} - ) + } if (header == sortColumn) { val headerLink = Unparsed( @@ -316,19 +309,7 @@ private[ui] class StagePagedTable( } else { failureReason }) - val details = if (isMultiline) { - // scalastyle:off - - +details - ++ - - // scalastyle:on - } else { - "" - } + val details = UIUtils.detailsUINode(isMultiline, failureReason) } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index 2d222b842be55..b59dd333da19e 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -37,7 +37,9 @@ private[ui] class StagesTab(val parent: SparkUI, val store: AppStatusStore) attachPage(new StagePage(this, store)) attachPage(new PoolPage(this)) + // Show pool information for only live UI. def isFairScheduler: Boolean = { + sc.isDefined && store .environmentInfo() .sparkProperties diff --git a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala index 2488197814ffd..fb43af357f7b8 100644 --- a/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/StoragePage.scala @@ -25,6 +25,7 @@ import scala.xml.Node import org.apache.spark.status.{AppStatusStore, StreamBlockData} import org.apache.spark.status.api.v1 import org.apache.spark.ui._ +import org.apache.spark.ui.storage.ToolTips._ import org.apache.spark.util.Utils /** Page showing list of RDD's currently stored in the cluster */ @@ -56,7 +57,8 @@ private[ui] class StoragePage(parent: SparkUITab, store: AppStatusStore) extends rddHeader, rddRow(request, _: v1.RDDStorageInfo), rdds, - id = Some("storage-by-rdd-table"))} + id = Some("storage-by-rdd-table"), + tooltipHeaders = tooltips)} } 
@@ -72,6 +74,16 @@ private[ui] class StoragePage(parent: SparkUITab, store: AppStatusStore) extends "Size in Memory", "Size on Disk") + /** Tooltips for header fields of the RDD table */ + val tooltips = Seq( + None, + Some(RDD_NAME), + Some(STORAGE_LEVEL), + Some(CACHED_PARTITIONS), + Some(FRACTION_CACHED), + Some(SIZE_IN_MEMORY), + Some(SIZE_ON_DISK)) + /** Render an HTML row representing an RDD */ private def rddRow(request: HttpServletRequest, rdd: v1.RDDStorageInfo): Seq[Node] = { // scalastyle:off diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/HiveTestUtils.scala b/core/src/main/scala/org/apache/spark/ui/storage/ToolTips.scala similarity index 58% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/test/HiveTestUtils.scala rename to core/src/main/scala/org/apache/spark/ui/storage/ToolTips.scala index 7631efedf46af..4677eba63c830 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/HiveTestUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/storage/ToolTips.scala @@ -15,18 +15,28 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive.test +package org.apache.spark.ui.storage -import java.io.File +private[ui] object ToolTips { -import org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax -import org.apache.hive.hcatalog.data.JsonSerDe + val RDD_NAME = + "Name of the persisted RDD" -object HiveTestUtils { + val STORAGE_LEVEL = + "StorageLevel displays where the persisted RDD is stored, " + + "format of the persisted RDD (serialized or de-serialized) and" + + "replication factor of the persisted RDD" - val getHiveContribJar: File = - new File(classOf[UDAFExampleMax].getProtectionDomain.getCodeSource.getLocation.getPath) + val CACHED_PARTITIONS = + "Number of partitions cached" - val getHiveHcatalogCoreJar: File = - new File(classOf[JsonSerDe].getProtectionDomain.getCodeSource.getLocation.getPath) + val FRACTION_CACHED = + "Fraction of total partitions cached" + + val SIZE_IN_MEMORY = + "Total size of partitions in memory" + + val SIZE_ON_DISK = + "Total size of partitions on the disk" } + diff --git a/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala b/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala index a5ee0ff16b5df..1383e1835028c 100644 --- a/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala +++ b/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala @@ -67,7 +67,7 @@ class ByteBufferInputStream(private var buffer: ByteBuffer) /** * Clean up the buffer, and potentially dispose of it using StorageUtils.dispose(). 
*/ - private def cleanUp() { + private def cleanUp(): Unit = { if (buffer != null) { buffer = null } diff --git a/core/src/main/scala/org/apache/spark/util/Clock.scala b/core/src/main/scala/org/apache/spark/util/Clock.scala index e92ed11bd165b..226f15d3d38c2 100644 --- a/core/src/main/scala/org/apache/spark/util/Clock.scala +++ b/core/src/main/scala/org/apache/spark/util/Clock.scala @@ -21,7 +21,37 @@ package org.apache.spark.util * An interface to represent clocks, so that they can be mocked out in unit tests. */ private[spark] trait Clock { + /** @return Current system time, in ms. */ def getTimeMillis(): Long + + // scalastyle:off line.size.limit + /** + * Current value of high resolution time source, in ns. + * + * This method abstracts the call to the JRE's `System.nanoTime()` call. As with that method, the + * value here is not guaranteed to be monotonically increasing, but rather a higher resolution + * time source for use in the calculation of time intervals. The characteristics of the values + * returned may very from JVM to JVM (or even the same JVM running on different OSes or CPUs), but + * in general it should be preferred over [[getTimeMillis()]] when calculating time differences. + * + * Specifically for Linux on x64 architecture, the following links provide useful information + * about the characteristics of the value returned: + * + * http://btorpey.github.io/blog/2014/02/18/clock-sources-in-linux/ + * https://stackoverflow.com/questions/10921210/cpu-tsc-fetch-operation-especially-in-multicore-multi-processor-environment + * + * TL;DR: on modern (2.6.32+) Linux kernels with modern (AMD K8+) CPUs, the values returned by + * `System.nanoTime()` are consistent across CPU cores *and* packages, and provide always + * increasing values (although it may not be completely monotonic when the system clock is + * adjusted by NTP daemons using time slew). 
+ */ + // scalastyle:on line.size.limit + def nanoTime(): Long + + /** + * Wait until the wall clock reaches at least the given time. Note this may not actually wait for + * the actual difference between the current and target times, since the wall clock may drift. + */ def waitTillTime(targetTime: Long): Long } @@ -36,15 +66,19 @@ private[spark] class SystemClock extends Clock { * @return the same time (milliseconds since the epoch) * as is reported by `System.currentTimeMillis()` */ - def getTimeMillis(): Long = System.currentTimeMillis() + override def getTimeMillis(): Long = System.currentTimeMillis() + + /** + * @return value reported by `System.nanoTime()`. + */ + override def nanoTime(): Long = System.nanoTime() /** * @param targetTime block until the current time is at least this value * @return current system time when wait has completed */ - def waitTillTime(targetTime: Long): Long = { - var currentTime = 0L - currentTime = System.currentTimeMillis() + override def waitTillTime(targetTime: Long): Long = { + var currentTime = System.currentTimeMillis() var waitTime = targetTime - currentTime if (waitTime <= 0) { diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index 6d6ef5a744204..d2ad14f2a1a96 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -387,7 +387,7 @@ private[spark] object ClosureCleaner extends Logging { } } - private def ensureSerializable(func: AnyRef) { + private def ensureSerializable(func: AnyRef): Unit = { try { if (SparkEnv.get != null) { SparkEnv.get.closureSerializer.newInstance().serialize(func) @@ -433,7 +433,7 @@ private class ReturnStatementFinder(targetMethodName: Option[String] = None) name == targetMethodName.get || name == targetMethodName.get.stripSuffix("$adapted") new MethodVisitor(ASM7) { - override def visitTypeInsn(op: Int, tp: String) { 
+ override def visitTypeInsn(op: Int, tp: String): Unit = { if (op == NEW && tp.contains("scala/runtime/NonLocalReturnControl") && isTargetMethod) { throw new ReturnStatementInClosureException } @@ -480,7 +480,7 @@ private[util] class FieldAccessFinder( } new MethodVisitor(ASM7) { - override def visitFieldInsn(op: Int, owner: String, name: String, desc: String) { + override def visitFieldInsn(op: Int, owner: String, name: String, desc: String): Unit = { if (op == GETFIELD) { for (cl <- fields.keys if cl.getName == owner.replace('/', '.')) { fields(cl) += name @@ -489,7 +489,7 @@ private[util] class FieldAccessFinder( } override def visitMethodInsn( - op: Int, owner: String, name: String, desc: String, itf: Boolean) { + op: Int, owner: String, name: String, desc: String, itf: Boolean): Unit = { for (cl <- fields.keys if cl.getName == owner.replace('/', '.')) { // Check for calls a getter method for a variable in an interpreter wrapper object. // This means that the corresponding field will be accessed, so we should save it. @@ -528,7 +528,7 @@ private class InnerClosureFinder(output: Set[Class[_]]) extends ClassVisitor(ASM // The second closure technically has two inner closures, but this finder only finds one override def visit(version: Int, access: Int, name: String, sig: String, - superName: String, interfaces: Array[String]) { + superName: String, interfaces: Array[String]): Unit = { myName = name } @@ -536,7 +536,7 @@ private class InnerClosureFinder(output: Set[Class[_]]) extends ClassVisitor(ASM sig: String, exceptions: Array[String]): MethodVisitor = { new MethodVisitor(ASM7) { override def visitMethodInsn( - op: Int, owner: String, name: String, desc: String, itf: Boolean) { + op: Int, owner: String, name: String, desc: String, itf: Boolean): Unit = { val argTypes = Type.getArgumentTypes(desc) if (op == INVOKESPECIAL && name == "" && argTypes.length > 0 && argTypes(0).toString.startsWith("L") // is it an object? 
diff --git a/core/src/main/scala/org/apache/spark/util/Distribution.scala b/core/src/main/scala/org/apache/spark/util/Distribution.scala index 240dcfbab60ac..550884c873297 100644 --- a/core/src/main/scala/org/apache/spark/util/Distribution.scala +++ b/core/src/main/scala/org/apache/spark/util/Distribution.scala @@ -65,7 +65,7 @@ private[spark] class Distribution(val data: Array[Double], val startIdx: Int, va * print a summary of this distribution to the given PrintStream. * @param out */ - def summary(out: PrintStream = System.out) { + def summary(out: PrintStream = System.out): Unit = { // scalastyle:off println out.println(statCounter) showQuantiles(out) @@ -83,7 +83,7 @@ private[spark] object Distribution { } } - def showQuantiles(out: PrintStream = System.out, quantiles: Iterable[Double]) { + def showQuantiles(out: PrintStream = System.out, quantiles: Iterable[Double]): Unit = { // scalastyle:off println out.println("min\t25%\t50%\t75%\tmax") quantiles.foreach{q => out.print(q + "\t")} diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 73ef80980e73f..53824735d2fc5 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -33,7 +33,7 @@ import org.apache.spark._ import org.apache.spark.executor._ import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope -import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.{ResourceInformation, ResourceProfile} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage._ @@ -391,6 +391,7 @@ private[spark] object JsonProtocol { ("Executor Deserialize CPU Time" -> taskMetrics.executorDeserializeCpuTime) ~ ("Executor Run Time" -> taskMetrics.executorRunTime) ~ ("Executor CPU Time" -> taskMetrics.executorCpuTime) ~ + ("Peak 
Execution Memory" -> taskMetrics.peakExecutionMemory) ~ ("Result Size" -> taskMetrics.resultSize) ~ ("JVM GC Time" -> taskMetrics.jvmGCTime) ~ ("Result Serialization Time" -> taskMetrics.resultSerializationTime) ~ @@ -420,6 +421,7 @@ private[spark] object JsonProtocol { ("Block Manager Address" -> blockManagerAddress) ~ ("Shuffle ID" -> fetchFailed.shuffleId) ~ ("Map ID" -> fetchFailed.mapId) ~ + ("Map Index" -> fetchFailed.mapIndex) ~ ("Reduce ID" -> fetchFailed.reduceId) ~ ("Message" -> fetchFailed.message) case exceptionFailure: ExceptionFailure => @@ -660,7 +662,8 @@ private[spark] object JsonProtocol { val stageInfos = jsonOption(json \ "Stage Infos") .map(_.extract[Seq[JValue]].map(stageInfoFromJson)).getOrElse { stageIds.map { id => - new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown") + new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } } SparkListenerJobStart(jobId, submissionTime, stageInfos, properties) @@ -801,7 +804,8 @@ private[spark] object JsonProtocol { } val stageInfo = new StageInfo( - stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details) + stageId, attemptId, stageName, numTasks, rddInfos, parentIds, details, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stageInfo.submissionTime = submissionTime stageInfo.completionTime = completionTime stageInfo.failureReason = failureReason @@ -893,6 +897,10 @@ private[spark] object JsonProtocol { case JNothing => 0 case x => x.extract[Long] }) + metrics.setPeakExecutionMemory((json \ "Peak Execution Memory") match { + case JNothing => 0 + case x => x.extract[Long] + }) metrics.setResultSize((json \ "Result Size").extract[Long]) metrics.setJvmGCTime((json \ "JVM GC Time").extract[Long]) metrics.setResultSerializationTime((json \ "Result Serialization Time").extract[Long]) @@ -974,10 +982,11 @@ private[spark] object JsonProtocol { case `fetchFailed` => val 
blockManagerAddress = blockManagerIdFromJson(json \ "Block Manager Address") val shuffleId = (json \ "Shuffle ID").extract[Int] - val mapId = (json \ "Map ID").extract[Int] + val mapId = (json \ "Map ID").extract[Long] + val mapIndex = (json \ "Map Index").extract[Int] val reduceId = (json \ "Reduce ID").extract[Int] val message = jsonOption(json \ "Message").map(_.extract[String]) - new FetchFailed(blockManagerAddress, shuffleId, mapId, reduceId, + new FetchFailed(blockManagerAddress, shuffleId, mapId, mapIndex, reduceId, message.getOrElse("Unknown reason")) case `exceptionFailure` => val className = (json \ "Class Name").extract[String] diff --git a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala index 2e517707ff774..51cd7d1284ff3 100644 --- a/core/src/main/scala/org/apache/spark/util/ListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/util/ListenerBus.scala @@ -25,7 +25,8 @@ import scala.util.control.NonFatal import com.codahale.metrics.Timer -import org.apache.spark.internal.Logging +import org.apache.spark.SparkEnv +import org.apache.spark.internal.{config, Logging} /** * An event bus which posts events to its listeners. @@ -37,6 +38,20 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { // Marked `private[spark]` for access in tests. private[spark] def listeners = listenersPlusTimers.asScala.map(_._1).asJava + private lazy val env = SparkEnv.get + + private lazy val logSlowEventEnabled = if (env != null) { + env.conf.get(config.LISTENER_BUS_LOG_SLOW_EVENT_ENABLED) + } else { + false + } + + private lazy val logSlowEventThreshold = if (env != null) { + env.conf.get(config.LISTENER_BUS_LOG_SLOW_EVENT_TIME_THRESHOLD) + } else { + Long.MaxValue + } + /** * Returns a CodaHale metrics Timer for measuring the listener's event processing time. * This method is intended to be overridden by subclasses. 
@@ -95,6 +110,7 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { } else { null } + lazy val listenerName = Utils.getFormattedClassName(listener) try { doPostEvent(listener, event) if (Thread.interrupted()) { @@ -104,14 +120,17 @@ private[spark] trait ListenerBus[L <: AnyRef, E] extends Logging { } } catch { case ie: InterruptedException => - logError(s"Interrupted while posting to ${Utils.getFormattedClassName(listener)}. " + - s"Removing that listener.", ie) + logError(s"Interrupted while posting to ${listenerName}. Removing that listener.", ie) removeListenerOnError(listener) case NonFatal(e) if !isIgnorableException(e) => - logError(s"Listener ${Utils.getFormattedClassName(listener)} threw an exception", e) + logError(s"Listener ${listenerName} threw an exception", e) } finally { if (maybeTimerContext != null) { - maybeTimerContext.stop() + val elapsed = maybeTimerContext.stop() + if (logSlowEventEnabled && elapsed > logSlowEventThreshold) { + logInfo(s"Process of event ${event} by listener ${listenerName} took " + + s"${elapsed / 1000000000d}s.") + } } } } diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index e7a65d74a440e..36d6820eba239 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -17,11 +17,16 @@ package org.apache.spark.util +import java.util.concurrent.TimeUnit + /** * A `Clock` whose time can be manually set and modified. Its reported time does not change * as time elapses, but only as its time is modified by callers. This is mainly useful for * testing. * + * For this implementation, `getTimeMillis()` and `nanoTime()` always return the same value + * (adjusted for the correct unit). 
+ * * @param time initial time (in milliseconds since the epoch) */ private[spark] class ManualClock(private var time: Long) extends Clock { @@ -31,10 +36,11 @@ private[spark] class ManualClock(private var time: Long) extends Clock { */ def this() = this(0L) - def getTimeMillis(): Long = - synchronized { - time - } + override def getTimeMillis(): Long = synchronized { + time + } + + override def nanoTime(): Long = TimeUnit.MILLISECONDS.toNanos(getTimeMillis()) /** * @param timeToSet new time (in milliseconds) that the clock should represent @@ -56,7 +62,7 @@ private[spark] class ManualClock(private var time: Long) extends Clock { * @param targetTime block until the clock time is set or advanced to at least this time * @return current time reported by the clock when waiting finishes */ - def waitTillTime(targetTime: Long): Long = synchronized { + override def waitTillTime(targetTime: Long): Long = synchronized { while (time < targetTime) { wait(10) } diff --git a/core/src/main/scala/org/apache/spark/util/NextIterator.scala b/core/src/main/scala/org/apache/spark/util/NextIterator.scala index 0b505a576768c..0e289025da110 100644 --- a/core/src/main/scala/org/apache/spark/util/NextIterator.scala +++ b/core/src/main/scala/org/apache/spark/util/NextIterator.scala @@ -50,7 +50,7 @@ private[spark] abstract class NextIterator[U] extends Iterator[U] { * Ideally you should have another try/catch, as in HadoopRDD, that * ensures any resources are closed should iteration fail. */ - protected def close() + protected def close(): Unit /** * Calls the subclass-defined close method, but only once. @@ -58,7 +58,7 @@ private[spark] abstract class NextIterator[U] extends Iterator[U] { * Usually calling `close` multiple times should be fine, but historically * there have been issues with some InputFormats throwing exceptions. 
*/ - def closeIfNeeded() { + def closeIfNeeded(): Unit = { if (!closed) { // Note: it's important that we set closed = true before calling close(), since setting it // afterwards would permit us to call close() multiple times if close() threw an exception. diff --git a/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala b/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala index c105f3229af09..f01645d82303e 100644 --- a/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala +++ b/core/src/main/scala/org/apache/spark/util/PeriodicCheckpointer.scala @@ -24,7 +24,6 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.internal.Logging -import org.apache.spark.storage.StorageLevel /** diff --git a/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala b/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala index 3354a923273ff..42d7f71404594 100644 --- a/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala +++ b/core/src/main/scala/org/apache/spark/util/SerializableConfiguration.scala @@ -20,7 +20,14 @@ import java.io.{ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.conf.Configuration -private[spark] +import org.apache.spark.annotation.{DeveloperApi, Unstable} + +/** + * Hadoop configuration but serializable. Use `value` to access the Hadoop configuration. 
+ * + * @param value Hadoop configuration + */ +@DeveloperApi @Unstable class SerializableConfiguration(@transient var value: Configuration) extends Serializable { private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() diff --git a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala index b702838fa257f..4f1311224bb95 100644 --- a/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala +++ b/core/src/main/scala/org/apache/spark/util/ShutdownHookManager.scala @@ -70,7 +70,7 @@ private[spark] object ShutdownHookManager extends Logging { } // Register the path to be deleted via shutdown hook - def registerShutdownDeleteDir(file: File) { + def registerShutdownDeleteDir(file: File): Unit = { val absolutePath = file.getAbsolutePath() shutdownDeletePaths.synchronized { shutdownDeletePaths += absolutePath @@ -78,7 +78,7 @@ private[spark] object ShutdownHookManager extends Logging { } // Remove the path to be deleted via shutdown hook - def removeShutdownDeleteDir(file: File) { + def removeShutdownDeleteDir(file: File): Unit = { val absolutePath = file.getAbsolutePath() shutdownDeletePaths.synchronized { shutdownDeletePaths.remove(absolutePath) @@ -120,7 +120,7 @@ private[spark] object ShutdownHookManager extends Logging { def inShutdown(): Boolean = { try { val hook = new Thread { - override def run() {} + override def run(): Unit = {} } // scalastyle:off runtimeaddshutdownhook Runtime.getRuntime.addShutdownHook(hook) diff --git a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala index 5a24965170cef..230195da2a121 100644 --- a/core/src/main/scala/org/apache/spark/util/SignalUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/SignalUtils.scala @@ -60,7 +60,7 @@ private[spark] object SignalUtils extends Logging { if (SystemUtils.IS_OS_UNIX) { try { 
val handler = handlers.getOrElseUpdate(signal, { - logInfo("Registered signal handler for " + signal) + logInfo("Registering signal handler for " + signal) new ActionHandler(new Signal(signal)) }) handler.register(action) diff --git a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala index 09c69f5c68b03..85e1119569ce2 100644 --- a/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala +++ b/core/src/main/scala/org/apache/spark/util/SizeEstimator.scala @@ -107,7 +107,7 @@ object SizeEstimator extends Logging { // Sets object size, pointer size based on architecture and CompressedOops settings // from the JVM. - private def initialize() { + private def initialize(): Unit = { val arch = System.getProperty("os.arch") is64bit = arch.contains("64") || arch.contains("s390x") isCompressedOops = getIsCompressedOops @@ -171,7 +171,7 @@ object SizeEstimator extends Logging { val stack = new ArrayBuffer[AnyRef] var size = 0L - def enqueue(obj: AnyRef) { + def enqueue(obj: AnyRef): Unit = { if (obj != null && !visited.containsKey(obj)) { visited.put(obj, null) stack += obj @@ -205,7 +205,7 @@ object SizeEstimator extends Logging { state.size } - private def visitSingleObject(obj: AnyRef, state: SearchState) { + private def visitSingleObject(obj: AnyRef, state: SearchState): Unit = { val cls = obj.getClass if (cls.isArray) { visitArray(obj, cls, state) @@ -234,7 +234,7 @@ object SizeEstimator extends Logging { private val ARRAY_SIZE_FOR_SAMPLING = 400 private val ARRAY_SAMPLE_SIZE = 100 // should be lower than ARRAY_SIZE_FOR_SAMPLING - private def visitArray(array: AnyRef, arrayClass: Class[_], state: SearchState) { + private def visitArray(array: AnyRef, arrayClass: Class[_], state: SearchState): Unit = { val length = ScalaRunTime.array_length(array) val elementClass = arrayClass.getComponentType() @@ -326,7 +326,7 @@ object SizeEstimator extends Logging { val parent = 
getClassInfo(cls.getSuperclass) var shellSize = parent.shellSize var pointerFields = parent.pointerFields - val sizeCount = Array.fill(fieldSizes.max + 1)(0) + val sizeCount = Array.ofDim[Int](fieldSizes.max + 1) // iterate through the fields of this class and gather information. for (field <- cls.getDeclaredFields) { diff --git a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala index 1b34fbde38cd6..e77128755363d 100644 --- a/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala +++ b/core/src/main/scala/org/apache/spark/util/SparkUncaughtExceptionHandler.scala @@ -28,7 +28,7 @@ import org.apache.spark.internal.Logging private[spark] class SparkUncaughtExceptionHandler(val exitOnUncaughtException: Boolean = true) extends Thread.UncaughtExceptionHandler with Logging { - override def uncaughtException(thread: Thread, exception: Throwable) { + override def uncaughtException(thread: Thread, exception: Throwable): Unit = { try { // Make it explicit that uncaught exceptions are thrown when container is shutting down. 
// It will help users when they analyze the executor logs @@ -48,15 +48,30 @@ private[spark] class SparkUncaughtExceptionHandler(val exitOnUncaughtException: System.exit(SparkExitCode.OOM) case _ if exitOnUncaughtException => System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) + case _ => + // SPARK-30310: Don't System.exit() when exitOnUncaughtException is false } } } catch { - case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) - case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) + case oom: OutOfMemoryError => + try { + logError(s"Uncaught OutOfMemoryError in thread $thread, process halted.", oom) + } catch { + // absorb any exception/error since we're halting the process + case _: Throwable => + } + Runtime.getRuntime.halt(SparkExitCode.OOM) + case t: Throwable => + try { + logError(s"Another uncaught exception in thread $thread, process halted.", t) + } catch { + case _: Throwable => + } + Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) } } - def uncaughtException(exception: Throwable) { + def uncaughtException(exception: Throwable): Unit = { uncaughtException(Thread.currentThread(), exception) } } diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 8df331251c749..de39e4b410f25 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -18,23 +18,97 @@ package org.apache.spark.util import java.util.concurrent._ +import java.util.concurrent.locks.ReentrantLock -import scala.collection.TraversableLike -import scala.collection.generic.CanBuildFrom -import scala.language.higherKinds - -import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor, Future} import scala.concurrent.duration.{Duration, FiniteDuration} +import 
scala.language.higherKinds import scala.util.control.NonFatal +import com.google.common.util.concurrent.ThreadFactoryBuilder + import org.apache.spark.SparkException import org.apache.spark.rpc.RpcAbortException private[spark] object ThreadUtils { private val sameThreadExecutionContext = - ExecutionContext.fromExecutorService(MoreExecutors.sameThreadExecutor()) + ExecutionContext.fromExecutorService(sameThreadExecutorService()) + + // Inspired by Guava MoreExecutors.sameThreadExecutor; inlined and converted + // to Scala here to avoid Guava version issues + def sameThreadExecutorService(): ExecutorService = new AbstractExecutorService { + private val lock = new ReentrantLock() + private val termination = lock.newCondition() + private var runningTasks = 0 + private var serviceIsShutdown = false + + override def shutdown(): Unit = { + lock.lock() + try { + serviceIsShutdown = true + } finally { + lock.unlock() + } + } + + override def shutdownNow(): java.util.List[Runnable] = { + shutdown() + java.util.Collections.emptyList() + } + + override def isShutdown: Boolean = { + lock.lock() + try { + serviceIsShutdown + } finally { + lock.unlock() + } + } + + override def isTerminated: Boolean = synchronized { + lock.lock() + try { + serviceIsShutdown && runningTasks == 0 + } finally { + lock.unlock() + } + } + + override def awaitTermination(timeout: Long, unit: TimeUnit): Boolean = { + var nanos = unit.toNanos(timeout) + lock.lock() + try { + while (nanos > 0 && !isTerminated()) { + nanos = termination.awaitNanos(nanos) + } + isTerminated() + } finally { + lock.unlock() + } + } + + override def execute(command: Runnable): Unit = { + lock.lock() + try { + if (isShutdown()) throw new RejectedExecutionException("Executor already shutdown") + runningTasks += 1 + } finally { + lock.unlock() + } + try { + command.run() + } finally { + lock.lock() + try { + runningTasks -= 1 + if (isTerminated()) termination.signalAll() + } finally { + lock.unlock() + } + } + } + } /** * An 
`ExecutionContextExecutor` that runs each task in the thread that invokes `execute/submit`. @@ -275,13 +349,7 @@ private[spark] object ThreadUtils { * @return new collection in which each element was given from the input collection `in` by * applying the lambda function `f`. */ - def parmap[I, O, Col[X] <: TraversableLike[X, Col[X]]] - (in: Col[I], prefix: String, maxThreads: Int) - (f: I => O) - (implicit - cbf: CanBuildFrom[Col[I], Future[O], Col[Future[O]]], // For in.map - cbf2: CanBuildFrom[Col[Future[O]], O, Col[O]] // for Future.sequence - ): Col[O] = { + def parmap[I, O](in: Seq[I], prefix: String, maxThreads: Int)(f: I => O): Seq[O] = { val pool = newForkJoinPool(prefix, maxThreads) try { implicit val ec = ExecutionContext.fromExecutor(pool) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 9c1f21fa236ba..297cc5e4cb100 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -28,7 +28,7 @@ import java.nio.channels.{Channels, FileChannel, WritableByteChannel} import java.nio.charset.StandardCharsets import java.nio.file.Files import java.security.SecureRandom -import java.util.{Locale, Properties, Random, UUID} +import java.util.{Arrays, Locale, Properties, Random, UUID} import java.util.concurrent._ import java.util.concurrent.TimeUnit.NANOSECONDS import java.util.zip.GZIPInputStream @@ -45,9 +45,9 @@ import scala.util.matching.Regex import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} -import com.google.common.hash.HashCodes import com.google.common.io.{ByteStreams, Files => GFiles} import com.google.common.net.InetAddresses +import org.apache.commons.codec.binary.Hex import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} @@ -95,7 +95,7 
@@ private[spark] object Utils extends Logging { */ val DEFAULT_DRIVER_MEM_MB = JavaUtils.DEFAULT_DRIVER_MEM_MB.toInt - private val MAX_DIR_CREATION_ATTEMPTS: Int = 10 + val MAX_DIR_CREATION_ATTEMPTS: Int = 10 @volatile private var localRootDirs: Array[String] = null /** Scheme used for files that are locally available on worker nodes in the cluster. */ @@ -731,7 +731,7 @@ private[spark] object Utils extends Logging { case "file" => // In the case of a local file, copy the local file to the target directory. // Note the difference between uri vs url. - val sourceFile = if (uri.isAbsolute) new File(uri) else new File(url) + val sourceFile = if (uri.isAbsolute) new File(uri) else new File(uri.getPath) copyFile(url, sourceFile, targetFile, fileOverwrite) case _ => val fs = getHadoopFileSystem(uri, hadoopConf) @@ -999,7 +999,7 @@ private[spark] object Utils extends Logging { * Allow setting a custom host name because when we run on Mesos we need to use the same * hostname it reports to the master. 
*/ - def setCustomHostname(hostname: String) { + def setCustomHostname(hostname: String): Unit = { // DEBUG code Utils.checkHost(hostname) customHostname = Some(hostname) @@ -1026,11 +1026,11 @@ private[spark] object Utils extends Logging { customHostname.getOrElse(InetAddresses.toUriString(localIpAddress)) } - def checkHost(host: String) { + def checkHost(host: String): Unit = { assert(host != null && host.indexOf(':') == -1, s"Expected hostname (not IP) but got $host") } - def checkHostPort(hostPort: String) { + def checkHostPort(hostPort: String): Unit = { assert(hostPort != null && hostPort.indexOf(':') != -1, s"Expected host and port but got $hostPort") } @@ -1280,7 +1280,7 @@ private[spark] object Utils extends Logging { inputStream: InputStream, processLine: String => Unit): Thread = { val t = new Thread(threadName) { - override def run() { + override def run(): Unit = { for (line <- Source.fromInputStream(inputStream).getLines()) { processLine(line) } @@ -1297,7 +1297,7 @@ private[spark] object Utils extends Logging { * * NOTE: This method is to be called by the spark-started JVM process. */ - def tryOrExit(block: => Unit) { + def tryOrExit(block: => Unit): Unit = { try { block } catch { @@ -1314,7 +1314,7 @@ private[spark] object Utils extends Logging { * user-started JVM process completely; in contrast, tryOrExit is to be called in the * spark-started JVM process . */ - def tryOrStopSparkContext(sc: SparkContext)(block: => Unit) { + def tryOrStopSparkContext(sc: SparkContext)(block: => Unit): Unit = { try { block } catch { @@ -1352,7 +1352,7 @@ private[spark] object Utils extends Logging { } /** Executes the given block. 
Log non-fatal errors if any, and only throw fatal errors */ - def tryLogNonFatalError(block: => Unit) { + def tryLogNonFatalError(block: => Unit): Unit = { try { block } catch { @@ -1671,7 +1671,7 @@ private[spark] object Utils extends Logging { var inSingleQuote = false var inDoubleQuote = false val curWord = new StringBuilder - def endWord() { + def endWord(): Unit = { buf += curWord.toString curWord.clear() } @@ -1744,34 +1744,6 @@ private[spark] object Utils extends Logging { hashAbs } - /** - * NaN-safe version of `java.lang.Double.compare()` which allows NaN values to be compared - * according to semantics where NaN == NaN and NaN is greater than any non-NaN double. - */ - def nanSafeCompareDoubles(x: Double, y: Double): Int = { - val xIsNan: Boolean = java.lang.Double.isNaN(x) - val yIsNan: Boolean = java.lang.Double.isNaN(y) - if ((xIsNan && yIsNan) || (x == y)) 0 - else if (xIsNan) 1 - else if (yIsNan) -1 - else if (x > y) 1 - else -1 - } - - /** - * NaN-safe version of `java.lang.Float.compare()` which allows NaN values to be compared - * according to semantics where NaN == NaN and NaN is greater than any non-NaN float. - */ - def nanSafeCompareFloats(x: Float, y: Float): Int = { - val xIsNan: Boolean = java.lang.Float.isNaN(x) - val yIsNan: Boolean = java.lang.Float.isNaN(y) - if ((xIsNan && yIsNan) || (x == y)) 0 - else if (xIsNan) 1 - else if (yIsNan) -1 - else if (x > y) 1 - else -1 - } - /** * Returns the system properties map that is thread-safe to iterator over. 
It gets the * properties which have been set explicitly, as well as those for which only a default value @@ -1840,14 +1812,14 @@ private[spark] object Utils extends Logging { * Generate a zipWithIndex iterator, avoid index value overflowing problem * in scala's zipWithIndex */ - def getIteratorZipWithIndex[T](iterator: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { + def getIteratorZipWithIndex[T](iter: Iterator[T], startIndex: Long): Iterator[(T, Long)] = { new Iterator[(T, Long)] { require(startIndex >= 0, "startIndex should be >= 0.") var index: Long = startIndex - 1L - def hasNext: Boolean = iterator.hasNext + def hasNext: Boolean = iter.hasNext def next(): (T, Long) = { index += 1L - (iterator.next(), index) + (iter.next(), index) } } } @@ -2342,7 +2314,7 @@ private[spark] object Utils extends Logging { /** * configure a new log4j level */ - def setLogLevel(l: org.apache.log4j.Level) { + def setLogLevel(l: org.apache.log4j.Level): Unit = { val rootLogger = org.apache.log4j.Logger.getRootLogger() rootLogger.setLevel(l) // Setting threshold to null as rootLevel will define log level for spark-shell @@ -2838,7 +2810,7 @@ private[spark] object Utils extends Logging { val rnd = new SecureRandom() val secretBytes = new Array[Byte](bits / JByte.SIZE) rnd.nextBytes(secretBytes) - HashCodes.fromBytes(secretBytes).toString() + Hex.encodeHexString(secretBytes) } /** @@ -2950,6 +2922,13 @@ private[spark] object Utils extends Logging { val codec = codecFactory.getCodec(path) codec == null || codec.isInstanceOf[SplittableCompressionCodec] } + + /** Create a new properties object with the same values as `props` */ + def cloneProperties(props: Properties): Properties = { + val resultProps = new Properties() + props.forEach((k, v) => resultProps.put(k, v)) + resultProps + } } private[util] object CallerContext extends Logging { @@ -3033,7 +3012,8 @@ private[spark] class CallerContext( if (CallerContext.callerContextSupported) { try { val callerContext = 
Utils.classForName("org.apache.hadoop.ipc.CallerContext") - val builder = Utils.classForName("org.apache.hadoop.ipc.CallerContext$Builder") + val builder: Class[AnyRef] = + Utils.classForName("org.apache.hadoop.ipc.CallerContext$Builder") val builderInst = builder.getConstructor(classOf[String]).newInstance(context) val hdfsContext = builder.getMethod("build").invoke(builderInst) callerContext.getMethod("setCurrent", callerContext).invoke(null, hdfsContext) @@ -3056,7 +3036,7 @@ private[spark] class RedirectThread( extends Thread(name) { setDaemon(true) - override def run() { + override def run(): Unit = { scala.util.control.Exception.ignoring(classOf[IOException]) { // FIXME: We copy the stream on the level of bytes to avoid encoding problems. Utils.tryWithSafeFinally { diff --git a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala index bcb95b416dd25..46e311d8b0476 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/AppendOnlyMap.scala @@ -198,7 +198,7 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) override def size: Int = curSize /** Increase table size by 1, rehashing if necessary */ - private def incrementSize() { + private def incrementSize(): Unit = { curSize += 1 if (curSize > growThreshold) { growTable() @@ -211,7 +211,7 @@ class AppendOnlyMap[K, V](initialCapacity: Int = 64) private def rehash(h: Int): Int = Hashing.murmur3_32().hashInt(h).asInt() /** Double the table's size and re-hash everything */ - protected def growTable() { + protected def growTable(): Unit = { // capacity < MAXIMUM_CAPACITY (2 ^ 29) so capacity * 2 won't overflow val newCapacity = capacity * 2 require(newCapacity <= MAXIMUM_CAPACITY, s"Can't contain more than ${growThreshold} elements") diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala 
b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala index e63e0e3e1f68f..098f389829ec5 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala @@ -150,12 +150,12 @@ class BitSet(numBits: Int) extends Serializable { * Sets the bit at the specified index to true. * @param index the bit index */ - def set(index: Int) { + def set(index: Int): Unit = { val bitmask = 1L << (index & 0x3f) // mod 64 and shift words(index >> 6) |= bitmask // div by 64 and mask } - def unset(index: Int) { + def unset(index: Int): Unit = { val bitmask = 1L << (index & 0x3f) // mod 64 and shift words(index >> 6) &= ~bitmask // div by 64 and mask } diff --git a/core/src/main/scala/org/apache/spark/util/collection/CompactBuffer.scala b/core/src/main/scala/org/apache/spark/util/collection/CompactBuffer.scala index 5d3693190cc1f..9d5f1aac3391b 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/CompactBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/CompactBuffer.scala @@ -112,8 +112,6 @@ private[spark] class CompactBuffer[T: ClassTag] extends Seq[T] with Serializable override def length: Int = curSize - override def size: Int = curSize - override def iterator: Iterator[T] = new Iterator[T] { private var pos = 0 override def hasNext: Boolean = pos < curSize diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 1ba3b7875f8dc..7f40b469a95e9 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -367,7 +367,7 @@ class ExternalAppendOnlyMap[K, V, C]( private def removeFromBuffer[T](buffer: ArrayBuffer[T], index: Int): T = { val elem = buffer(index) buffer(index) = buffer(buffer.size - 1) // This also 
works if index == buffer.size - 1 - buffer.reduceToSize(buffer.size - 1) + buffer.trimEnd(1) elem } @@ -549,7 +549,7 @@ class ExternalAppendOnlyMap[K, V, C]( item } - private def cleanup() { + private def cleanup(): Unit = { batchIndex = batchOffsets.length // Prevent reading any other batch if (deserializeStream != null) { deserializeStream.close() diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala index 7a822e137e556..cc97bbfa7201f 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala @@ -23,7 +23,7 @@ import java.util.Comparator import scala.collection.mutable import scala.collection.mutable.ArrayBuffer -import com.google.common.io.{ByteStreams, Closeables} +import com.google.common.io.ByteStreams import org.apache.spark._ import org.apache.spark.executor.ShuffleWriteMetrics @@ -534,7 +534,7 @@ private[spark] class ExternalSorter[K, V, C]( * Update partitionId if we have reached the end of our current partition, possibly skipping * empty partitions on the way. 
*/ - private def skipToNextPartition() { + private def skipToNextPartition(): Unit = { while (partitionId < numPartitions && indexInPartition == spill.elementsPerPartition(partitionId)) { partitionId += 1 @@ -605,7 +605,7 @@ private[spark] class ExternalSorter[K, V, C]( } // Clean up our open streams and put us in a state where we can't read any more data - def cleanup() { + def cleanup(): Unit = { batchId = batchOffsets.length // Prevent reading any other batch val ds = deserializeStream deserializeStream = null @@ -727,7 +727,7 @@ private[spark] class ExternalSorter[K, V, C]( */ def writePartitionedMapOutput( shuffleId: Int, - mapId: Int, + mapId: Long, mapOutputWriter: ShuffleMapOutputWriter): Unit = { var nextPartitionId = 0 if (spills.isEmpty) { diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala index 10ab0b3f89964..1200ac001cce7 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashMap.scala @@ -76,7 +76,7 @@ class OpenHashMap[K : ClassTag, @specialized(Long, Int, Double) V: ClassTag]( } /** Set the value for a key */ - def update(k: K, v: V) { + def update(k: K, v: V): Unit = { if (k == null) { haveNullValue = true nullValue = v diff --git a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala index 8883e17bf3164..6815e47a198d9 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/OpenHashSet.scala @@ -113,7 +113,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( * Add an element to the set. If the set is over capacity after the insertion, grow the set * and rehash all elements. 
*/ - def add(k: T) { + def add(k: T): Unit = { addWithoutResize(k) rehashIfNeeded(k, grow, move) } @@ -166,7 +166,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( * @param moveFunc Callback invoked when we move the key from one position (in the old data array) * to a new position (in the new data array). */ - def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) { + def rehashIfNeeded(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit): Unit = { if (_size > _growThreshold) { rehash(k, allocateFunc, moveFunc) } @@ -227,7 +227,7 @@ class OpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag]( * @param moveFunc Callback invoked when we move the key from one position (in the old data array) * to a new position (in the new data array). */ - private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit) { + private def rehash(k: T, allocateFunc: (Int) => Unit, moveFunc: (Int, Int) => Unit): Unit = { val newCapacity = _capacity * 2 require(newCapacity > 0 && newCapacity <= OpenHashSet.MAX_CAPACITY, s"Can't contain more than ${(loadFactor * OpenHashSet.MAX_CAPACITY).toInt} elements") @@ -320,8 +320,8 @@ object OpenHashSet { override def hash(o: Float): Int = java.lang.Float.floatToIntBits(o) } - private def grow1(newSize: Int) {} - private def move1(oldPos: Int, newPos: Int) { } + private def grow1(newSize: Int): Unit = {} + private def move1(oldPos: Int, newPos: Int): Unit = { } private val grow = grow1 _ private val move = move1 _ diff --git a/core/src/main/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMap.scala b/core/src/main/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMap.scala index b4ec4ea521253..7a50d851941ee 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/PrimitiveKeyOpenHashMap.scala @@ -66,7 +66,7 @@ class 
PrimitiveKeyOpenHashMap[@specialized(Long, Int) K: ClassTag, } /** Set the value for a key */ - def update(k: K, v: V) { + def update(k: K, v: V): Unit = { val pos = _keySet.addWithoutResize(k) & OpenHashSet.POSITION_MASK _values(pos) = v _keySet.rehashIfNeeded(k, grow, move) diff --git a/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala b/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala index 9a7a5a4e74868..582bd124b5116 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/SortDataFormat.scala @@ -87,7 +87,7 @@ class KVArraySortDataFormat[K, T <: AnyRef : ClassTag] extends SortDataFormat[K, override def getKey(data: Array[T], pos: Int): K = data(2 * pos).asInstanceOf[K] - override def swap(data: Array[T], pos0: Int, pos1: Int) { + override def swap(data: Array[T], pos0: Int, pos1: Int): Unit = { val tmpKey = data(2 * pos0) val tmpVal = data(2 * pos0 + 1) data(2 * pos0) = data(2 * pos1) @@ -96,12 +96,13 @@ class KVArraySortDataFormat[K, T <: AnyRef : ClassTag] extends SortDataFormat[K, data(2 * pos1 + 1) = tmpVal } - override def copyElement(src: Array[T], srcPos: Int, dst: Array[T], dstPos: Int) { + override def copyElement(src: Array[T], srcPos: Int, dst: Array[T], dstPos: Int): Unit = { dst(2 * dstPos) = src(2 * srcPos) dst(2 * dstPos + 1) = src(2 * srcPos + 1) } - override def copyRange(src: Array[T], srcPos: Int, dst: Array[T], dstPos: Int, length: Int) { + override def copyRange(src: Array[T], srcPos: Int, + dst: Array[T], dstPos: Int, length: Int): Unit = { System.arraycopy(src, 2 * srcPos, dst, 2 * dstPos, 2 * length) } diff --git a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala index bfc0face5d8e5..1983b0002853d 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala +++ 
b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala @@ -141,7 +141,7 @@ private[spark] abstract class Spillable[C](taskMemoryManager: TaskMemoryManager) * * @param size number of bytes spilled */ - @inline private def logSpillage(size: Long) { + @inline private def logSpillage(size: Long): Unit = { val threadId = Thread.currentThread().getId logInfo("Thread %d spilling in-memory map of %s to disk (%d time%s so far)" .format(threadId, org.apache.spark.util.Utils.bytesToString(size), diff --git a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala index da8d58d05b6b9..9624b02cb407c 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/WritablePartitionedPairCollection.scala @@ -19,7 +19,6 @@ package org.apache.spark.util.collection import java.util.Comparator -import org.apache.spark.storage.DiskBlockObjectWriter /** * A common interface for size-tracking collections of key-value pairs that diff --git a/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala b/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala index c4540433bce97..4c1b49762ace3 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/DriverLogger.scala @@ -18,15 +18,18 @@ package org.apache.spark.util.logging import java.io._ +import java.util.EnumSet import java.util.concurrent.{ScheduledExecutorService, TimeUnit} import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} import org.apache.hadoop.fs.permission.FsPermission +import org.apache.hadoop.hdfs.client.HdfsDataOutputStream import org.apache.log4j.{FileAppender => Log4jFileAppender, 
_} import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.JavaUtils @@ -111,7 +114,8 @@ private[spark] class DriverLogger(conf: SparkConf) extends Logging { + DriverLogger.DRIVER_LOG_FILE_SUFFIX).getAbsolutePath() try { inStream = new BufferedInputStream(new FileInputStream(localLogFile)) - outputStream = fileSystem.create(new Path(dfsLogFile), true) + outputStream = SparkHadoopUtil.createFile(fileSystem, new Path(dfsLogFile), + conf.get(DRIVER_LOG_ALLOW_EC)) fileSystem.setPermission(new Path(dfsLogFile), LOG_FILE_PERMISSIONS) } catch { case e: Exception => @@ -131,12 +135,20 @@ private[spark] class DriverLogger(conf: SparkConf) extends Logging { } try { var remaining = inStream.available() + val hadData = remaining > 0 while (remaining > 0) { val read = inStream.read(tmpBuffer, 0, math.min(remaining, UPLOAD_CHUNK_SIZE)) outputStream.write(tmpBuffer, 0, read) remaining -= read } - outputStream.hflush() + if (hadData) { + outputStream match { + case hdfsStream: HdfsDataOutputStream => + hdfsStream.hsync(EnumSet.allOf(classOf[HdfsDataOutputStream.SyncFlag])) + case other => + other.hflush() + } + } } catch { case e: Exception => logError("Failed writing driver logs to dfs", e) } diff --git a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala index 3188e0bd2b70d..7107be25eb505 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/FileAppender.scala @@ -34,7 +34,7 @@ private[spark] class FileAppender(inputStream: InputStream, file: File, bufferSi // Thread that reads the input stream and writes to file private val writingThread = new Thread("File appending thread for " + file) { setDaemon(true) - override def run() { + override def run(): Unit = { 
Utils.logUncaughtExceptions { appendStreamToFile() } @@ -46,17 +46,17 @@ private[spark] class FileAppender(inputStream: InputStream, file: File, bufferSi * Wait for the appender to stop appending, either because input stream is closed * or because of any error in appending */ - def awaitTermination() { + def awaitTermination(): Unit = { writingThread.join() } /** Stop the appender */ - def stop() { + def stop(): Unit = { markedForStop = true } /** Continuously read chunks from the input stream and append to the file */ - protected def appendStreamToFile() { + protected def appendStreamToFile(): Unit = { try { logDebug("Started appending thread") Utils.tryWithSafeFinally { @@ -85,7 +85,7 @@ private[spark] class FileAppender(inputStream: InputStream, file: File, bufferSi } /** Append bytes to the file output stream */ - protected def appendToFile(bytes: Array[Byte], len: Int) { + protected def appendToFile(bytes: Array[Byte], len: Int): Unit = { if (outputStream == null) { openFile() } @@ -93,13 +93,13 @@ private[spark] class FileAppender(inputStream: InputStream, file: File, bufferSi } /** Open the file output stream */ - protected def openFile() { + protected def openFile(): Unit = { outputStream = new FileOutputStream(file, true) logDebug(s"Opened file $file") } /** Close the file output stream */ - protected def closeFile() { + protected def closeFile(): Unit = { outputStream.flush() outputStream.close() logDebug(s"Closed file $file") diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala index 59439b68792e5..b73f422649312 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingFileAppender.scala @@ -49,12 +49,12 @@ private[spark] class RollingFileAppender( private val enableCompression = conf.get(config.EXECUTOR_LOGS_ROLLING_ENABLE_COMPRESSION) /** Stop the 
appender */ - override def stop() { + override def stop(): Unit = { super.stop() } /** Append bytes to file after rolling over is necessary */ - override protected def appendToFile(bytes: Array[Byte], len: Int) { + override protected def appendToFile(bytes: Array[Byte], len: Int): Unit = { if (rollingPolicy.shouldRollover(len)) { rollover() rollingPolicy.rolledOver() @@ -64,7 +64,7 @@ private[spark] class RollingFileAppender( } /** Rollover the file, by closing the output stream and moving it over */ - private def rollover() { + private def rollover(): Unit = { try { closeFile() moveFile() @@ -106,7 +106,7 @@ private[spark] class RollingFileAppender( } /** Move the active log file to a new rollover file */ - private def moveFile() { + private def moveFile(): Unit = { val rolloverSuffix = rollingPolicy.generateRolledOverFileSuffix() val rolloverFile = new File( activeFile.getParentFile, activeFile.getName + rolloverSuffix).getAbsoluteFile @@ -138,7 +138,7 @@ private[spark] class RollingFileAppender( } /** Retain only last few files */ - private[util] def deleteOldFiles() { + private[util] def deleteOldFiles(): Unit = { try { val rolledoverFiles = activeFile.getParentFile.listFiles(new FileFilter { def accept(f: File): Boolean = { diff --git a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala index 1f263df57c857..5327ecd3e56a9 100644 --- a/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala +++ b/core/src/main/scala/org/apache/spark/util/logging/RollingPolicy.scala @@ -67,12 +67,12 @@ private[spark] class TimeBasedRollingPolicy( } /** Rollover has occurred, so find the next time to rollover */ - def rolledOver() { + def rolledOver(): Unit = { nextRolloverTime = calculateNextRolloverTime() logDebug(s"Current time: ${System.currentTimeMillis}, next rollover time: " + nextRolloverTime) } - def bytesWritten(bytes: Long) { } // nothing to do + def 
bytesWritten(bytes: Long): Unit = { } // nothing to do private def calculateNextRolloverTime(): Long = { val now = System.currentTimeMillis() @@ -118,12 +118,12 @@ private[spark] class SizeBasedRollingPolicy( } /** Rollover has occurred, so reset the counter */ - def rolledOver() { + def rolledOver(): Unit = { bytesWrittenSinceRollover = 0 } /** Increment the bytes that have been written in the current file */ - def bytesWritten(bytes: Long) { + def bytesWritten(bytes: Long): Unit = { bytesWrittenSinceRollover += bytes } diff --git a/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala b/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala index 70554f1d03067..6dd2beebbb3dc 100644 --- a/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala +++ b/core/src/main/scala/org/apache/spark/util/random/RandomSampler.scala @@ -201,7 +201,7 @@ class PoissonSampler[T]( private val rng = new PoissonDistribution(if (fraction > 0.0) fraction else 1.0) private val rngGap = RandomSampler.newDefaultRNG - override def setSeed(seed: Long) { + override def setSeed(seed: Long): Unit = { rng.reseedRandomGenerator(seed) rngGap.setSeed(seed) } diff --git a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala index af09e50a157ae..313569a81646d 100644 --- a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala +++ b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala @@ -49,7 +49,7 @@ private[spark] class XORShiftRandom(init: Long) extends JavaRandom(init) { (nextSeed & ((1L << bits) -1)).asInstanceOf[Int] } - override def setSeed(s: Long) { + override def setSeed(s: Long): Unit = { seed = XORShiftRandom.hashSeed(s) } } @@ -60,7 +60,7 @@ private[spark] object XORShiftRandom { /** Hash seeds to have 0/1 bits throughout. 
*/ private[random] def hashSeed(seed: Long): Long = { val bytes = ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array() - val lowBits = MurmurHash3.bytesHash(bytes) + val lowBits = MurmurHash3.bytesHash(bytes, MurmurHash3.arraySeed) val highBits = MurmurHash3.bytesHash(bytes, lowBits) (highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL) } diff --git a/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java b/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java deleted file mode 100644 index 80cd70282a51d..0000000000000 --- a/core/src/test/java/org/apache/spark/ExecutorPluginSuite.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark; - -import org.apache.spark.api.java.JavaSparkContext; - -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.*; - -public class ExecutorPluginSuite { - private static final String EXECUTOR_PLUGIN_CONF_NAME = "spark.executor.plugins"; - private static final String testBadPluginName = TestBadShutdownPlugin.class.getName(); - private static final String testPluginName = TestExecutorPlugin.class.getName(); - private static final String testSecondPluginName = TestSecondPlugin.class.getName(); - - // Static value modified by testing plugins to ensure plugins loaded correctly. - public static int numSuccessfulPlugins = 0; - - // Static value modified by testing plugins to verify plugins shut down properly. - public static int numSuccessfulTerminations = 0; - - private JavaSparkContext sc; - - @Before - public void setUp() { - sc = null; - numSuccessfulPlugins = 0; - numSuccessfulTerminations = 0; - } - - @After - public void tearDown() { - if (sc != null) { - sc.stop(); - sc = null; - } - } - - private SparkConf initializeSparkConf(String pluginNames) { - return new SparkConf() - .setMaster("local") - .setAppName("test") - .set(EXECUTOR_PLUGIN_CONF_NAME, pluginNames); - } - - @Test - public void testPluginClassDoesNotExist() { - SparkConf conf = initializeSparkConf("nonexistent.plugin"); - try { - sc = new JavaSparkContext(conf); - fail("No exception thrown for nonexistent plugin"); - } catch (Exception e) { - // We cannot catch ClassNotFoundException directly because Java doesn't think it'll be thrown - assertTrue(e.toString().startsWith("java.lang.ClassNotFoundException")); - } - } - - @Test - public void testAddPlugin() throws InterruptedException { - // Load the sample TestExecutorPlugin, which will change the value of numSuccessfulPlugins - SparkConf conf = initializeSparkConf(testPluginName); - sc = new JavaSparkContext(conf); - assertEquals(1, numSuccessfulPlugins); - 
sc.stop(); - sc = null; - assertEquals(1, numSuccessfulTerminations); - } - - @Test - public void testAddMultiplePlugins() throws InterruptedException { - // Load two plugins and verify they both execute. - SparkConf conf = initializeSparkConf(testPluginName + "," + testSecondPluginName); - sc = new JavaSparkContext(conf); - assertEquals(2, numSuccessfulPlugins); - sc.stop(); - sc = null; - assertEquals(2, numSuccessfulTerminations); - } - - @Test - public void testPluginShutdownWithException() { - // Verify an exception in one plugin shutdown does not affect the others - String pluginNames = testPluginName + "," + testBadPluginName + "," + testPluginName; - SparkConf conf = initializeSparkConf(pluginNames); - sc = new JavaSparkContext(conf); - assertEquals(3, numSuccessfulPlugins); - sc.stop(); - sc = null; - assertEquals(2, numSuccessfulTerminations); - } - - public static class TestExecutorPlugin implements ExecutorPlugin { - public void init() { - ExecutorPluginSuite.numSuccessfulPlugins++; - } - - public void shutdown() { - ExecutorPluginSuite.numSuccessfulTerminations++; - } - } - - public static class TestSecondPlugin implements ExecutorPlugin { - public void init() { - ExecutorPluginSuite.numSuccessfulPlugins++; - } - - public void shutdown() { - ExecutorPluginSuite.numSuccessfulTerminations++; - } - } - - public static class TestBadShutdownPlugin implements ExecutorPlugin { - public void init() { - ExecutorPluginSuite.numSuccessfulPlugins++; - } - - public void shutdown() { - throw new RuntimeException("This plugin will fail to cleanly shut down"); - } - } -} diff --git a/core/src/test/java/org/apache/spark/io/GenericFileInputStreamSuite.java b/core/src/test/java/org/apache/spark/io/GenericFileInputStreamSuite.java index 22db3592ecc96..8ff787975eaae 100644 --- a/core/src/test/java/org/apache/spark/io/GenericFileInputStreamSuite.java +++ b/core/src/test/java/org/apache/spark/io/GenericFileInputStreamSuite.java @@ -48,8 +48,12 @@ public void setUp() throws 
IOException { } @After - public void tearDown() { + public void tearDown() throws IOException { inputFile.delete(); + + for (InputStream is : inputStreams) { + is.close(); + } } @Test @@ -141,4 +145,15 @@ public void testBytesSkippedAfterEOF() throws IOException { assertEquals(-1, inputStream.read()); } } + + @Test + public void testReadPastEOF() throws IOException { + InputStream is = inputStreams[0]; + byte[] buf = new byte[1024]; + int read; + while ((read = is.read(buf, 0, buf.length)) != -1); + + int readAfterEOF = is.read(buf, 0, buf.length); + assertEquals(-1, readAfterEOF); + } } diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index 773c390175b6d..fb8523856da6f 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -323,7 +323,7 @@ public static class InProcessTestApp { public static void main(String[] args) throws Exception { assertNotEquals(0, args.length); - assertEquals(args[0], "hello"); + assertEquals("hello", args[0]); new SparkContext().stop(); synchronized (LOCK) { @@ -340,7 +340,7 @@ public static class ErrorInProcessTestApp { public static void main(String[] args) { assertNotEquals(0, args.length); - assertEquals(args[0], "hello"); + assertEquals("hello", args[0]); throw DUMMY_EXCEPTION; } } diff --git a/core/src/test/java/org/apache/spark/resource/JavaResourceProfileSuite.java b/core/src/test/java/org/apache/spark/resource/JavaResourceProfileSuite.java new file mode 100644 index 0000000000000..bb413c00fb972 --- /dev/null +++ b/core/src/test/java/org/apache/spark/resource/JavaResourceProfileSuite.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource; + +import java.util.Map; + +import static org.junit.Assert.*; +import org.junit.Test; + +// Test the ResourceProfile and Request api's from Java +public class JavaResourceProfileSuite { + + String GpuResource = "resource.gpu"; + String FPGAResource = "resource.fpga"; + + @Test + public void testResourceProfileAccessFromJava() throws Exception { + ExecutorResourceRequests execReqGpu = + new ExecutorResourceRequests().resource(GpuResource, 2,"myscript", ""); + ExecutorResourceRequests execReqFpga = + new ExecutorResourceRequests().resource(FPGAResource, 3, "myfpgascript", "nvidia"); + + ResourceProfileBuilder rprof = new ResourceProfileBuilder(); + rprof.require(execReqGpu); + rprof.require(execReqFpga); + TaskResourceRequests taskReq1 = new TaskResourceRequests().resource(GpuResource, 1); + rprof.require(taskReq1); + + assertEquals(rprof.executorResources().size(), 2); + Map eresources = rprof.executorResourcesJMap(); + assert(eresources.containsKey(GpuResource)); + ExecutorResourceRequest gpuReq = eresources.get(GpuResource); + assertEquals(gpuReq.amount(), 2); + assertEquals(gpuReq.discoveryScript(), "myscript"); + assertEquals(gpuReq.vendor(), ""); + + assert(eresources.containsKey(FPGAResource)); + ExecutorResourceRequest fpgaReq = eresources.get(FPGAResource); + assertEquals(fpgaReq.amount(), 3); + assertEquals(fpgaReq.discoveryScript(), 
"myfpgascript"); + assertEquals(fpgaReq.vendor(), "nvidia"); + + assertEquals(rprof.taskResources().size(), 1); + Map tresources = rprof.taskResourcesJMap(); + assert(tresources.containsKey(GpuResource)); + TaskResourceRequest taskReq = tresources.get(GpuResource); + assertEquals(taskReq.amount(), 1.0, 0); + assertEquals(taskReq.resourceName(), GpuResource); + } +} + diff --git a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java index 6b83a984f037c..ee8e38c24b47f 100644 --- a/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java +++ b/core/src/test/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriterSuite.java @@ -19,8 +19,10 @@ import java.io.*; import java.nio.ByteBuffer; +import java.nio.file.Files; import java.util.*; +import org.mockito.stubbing.Answer; import scala.Option; import scala.Product2; import scala.Tuple2; @@ -28,7 +30,6 @@ import scala.collection.Iterator; import com.google.common.collect.HashMultiset; -import com.google.common.collect.Iterators; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -53,6 +54,7 @@ import org.apache.spark.security.CryptoStreamUtils; import org.apache.spark.serializer.*; import org.apache.spark.shuffle.IndexShuffleBlockResolver; +import org.apache.spark.shuffle.sort.io.LocalDiskShuffleExecutorComponents; import org.apache.spark.storage.*; import org.apache.spark.util.Utils; @@ -65,6 +67,7 @@ public class UnsafeShuffleWriterSuite { + static final int DEFAULT_INITIAL_SORT_BUFFER_SIZE = 4096; static final int NUM_PARTITITONS = 4; TestMemoryManager memoryManager; TaskMemoryManager taskMemoryManager; @@ -131,15 +134,29 @@ public void setUp() throws IOException { ); }); - when(shuffleBlockResolver.getDataFile(anyInt(), anyInt())).thenReturn(mergedOutputFile); - doAnswer(invocationOnMock -> { + when(shuffleBlockResolver.getDataFile(anyInt(), 
anyLong())).thenReturn(mergedOutputFile); + + Answer renameTempAnswer = invocationOnMock -> { partitionSizesInMergedFile = (long[]) invocationOnMock.getArguments()[2]; File tmp = (File) invocationOnMock.getArguments()[3]; - mergedOutputFile.delete(); - tmp.renameTo(mergedOutputFile); + if (!mergedOutputFile.delete()) { + throw new RuntimeException("Failed to delete old merged output file."); + } + if (tmp != null) { + Files.move(tmp.toPath(), mergedOutputFile.toPath()); + } else if (!mergedOutputFile.createNewFile()) { + throw new RuntimeException("Failed to create empty merged output file."); + } return null; - }).when(shuffleBlockResolver) - .writeIndexFileAndCommit(anyInt(), anyInt(), any(long[].class), any(File.class)); + }; + + doAnswer(renameTempAnswer) + .when(shuffleBlockResolver) + .writeIndexFileAndCommit(anyInt(), anyLong(), any(long[].class), any(File.class)); + + doAnswer(renameTempAnswer) + .when(shuffleBlockResolver) + .writeIndexFileAndCommit(anyInt(), anyLong(), any(long[].class), eq(null)); when(diskBlockManager.createTempShuffleBlock()).thenAnswer(invocationOnMock -> { TempShuffleBlockId blockId = new TempShuffleBlockId(UUID.randomUUID()); @@ -151,21 +168,20 @@ public void setUp() throws IOException { when(taskContext.taskMetrics()).thenReturn(taskMetrics); when(shuffleDep.serializer()).thenReturn(serializer); when(shuffleDep.partitioner()).thenReturn(hashPartitioner); + when(taskContext.taskMemoryManager()).thenReturn(taskMemoryManager); } - private UnsafeShuffleWriter createWriter( - boolean transferToEnabled) throws IOException { + private UnsafeShuffleWriter createWriter(boolean transferToEnabled) { conf.set("spark.file.transferTo", String.valueOf(transferToEnabled)); return new UnsafeShuffleWriter<>( blockManager, - shuffleBlockResolver, taskMemoryManager, - new SerializedShuffleHandle<>(0, 1, shuffleDep), - 0, // map id + new SerializedShuffleHandle<>(0, shuffleDep), + 0L, // map id taskContext, conf, - 
taskContext.taskMetrics().shuffleWriteMetrics() - ); + taskContext.taskMetrics().shuffleWriteMetrics(), + new LocalDiskShuffleExecutorComponents(conf, blockManager, shuffleBlockResolver)); } private void assertSpillFilesWereCleanedUp() { @@ -232,7 +248,7 @@ class BadRecords extends scala.collection.AbstractIterator writer = createWriter(true); - writer.write(Iterators.emptyIterator()); + writer.write(new ArrayList>().iterator()); final Option mapStatus = writer.stop(true); assertTrue(mapStatus.isDefined()); assertTrue(mergedOutputFile.exists()); @@ -391,7 +407,7 @@ public void mergeSpillsWithFileStreamAndCompressionAndEncryption() throws Except @Test public void mergeSpillsWithCompressionAndEncryptionSlowPath() throws Exception { - conf.set(package$.MODULE$.SHUFFLE_UNDAFE_FAST_MERGE_ENABLE(), false); + conf.set(package$.MODULE$.SHUFFLE_UNSAFE_FAST_MERGE_ENABLE(), false); testMergingSpills(false, LZ4CompressionCodec.class.getName(), true); } @@ -444,10 +460,10 @@ public void writeEnoughRecordsToTriggerSortBufferExpansionAndSpillRadixOn() thro } private void writeEnoughRecordsToTriggerSortBufferExpansionAndSpill() throws Exception { - memoryManager.limit(UnsafeShuffleWriter.DEFAULT_INITIAL_SORT_BUFFER_SIZE * 16); + memoryManager.limit(DEFAULT_INITIAL_SORT_BUFFER_SIZE * 16); final UnsafeShuffleWriter writer = createWriter(false); final ArrayList> dataToWrite = new ArrayList<>(); - for (int i = 0; i < UnsafeShuffleWriter.DEFAULT_INITIAL_SORT_BUFFER_SIZE + 1; i++) { + for (int i = 0; i < DEFAULT_INITIAL_SORT_BUFFER_SIZE + 1; i++) { dataToWrite.add(new Tuple2<>(i, i)); } writer.write(dataToWrite.iterator()); @@ -516,16 +532,15 @@ public void testPeakMemoryUsed() throws Exception { final long numRecordsPerPage = pageSizeBytes / recordLengthBytes; taskMemoryManager = spy(taskMemoryManager); when(taskMemoryManager.pageSizeBytes()).thenReturn(pageSizeBytes); - final UnsafeShuffleWriter writer = - new UnsafeShuffleWriter<>( + final UnsafeShuffleWriter writer = new 
UnsafeShuffleWriter<>( blockManager, - shuffleBlockResolver, taskMemoryManager, - new SerializedShuffleHandle<>(0, 1, shuffleDep), - 0, // map id + new SerializedShuffleHandle<>(0, shuffleDep), + 0L, // map id taskContext, conf, - taskContext.taskMetrics().shuffleWriteMetrics()); + taskContext.taskMetrics().shuffleWriteMetrics(), + new LocalDiskShuffleExecutorComponents(conf, blockManager, shuffleBlockResolver)); // Peak memory should be monotonically increasing. More specifically, every time // we allocate a new page it should increase by exactly the size of the page. diff --git a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java index 8d03c6778e18b..6e995a3929a75 100644 --- a/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java +++ b/core/src/test/java/org/apache/spark/unsafe/map/AbstractBytesToBytesMapSuite.java @@ -34,6 +34,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.memory.MemoryMode; +import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TestMemoryConsumer; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.memory.TestMemoryManager; @@ -691,13 +692,11 @@ public void avoidDeadlock() throws InterruptedException { Thread thread = new Thread(() -> { int i = 0; - long used = 0; while (i < 10) { c1.use(10000000); - used += 10000000; i++; } - c1.free(used); + c1.free(c1.getUsed()); }); try { @@ -726,4 +725,22 @@ public void avoidDeadlock() throws InterruptedException { } } + @Test + public void freeAfterFailedReset() { + // SPARK-29244: BytesToBytesMap.free after a OOM reset operation should not cause failure. + memoryManager.limit(5000); + BytesToBytesMap map = + new BytesToBytesMap(taskMemoryManager, blockManager, serializerManager, 256, 0.5, 4000); + // Force OOM on next memory allocation. 
+ memoryManager.markExecutionAsOutOfMemoryOnce(); + try { + map.reset(); + Assert.fail("Expected SparkOutOfMemoryError to be thrown"); + } catch (SparkOutOfMemoryError e) { + // Expected exception; do nothing. + } finally { + map.free(); + } + } + } diff --git a/core/src/test/java/org/apache/spark/util/SerializableConfigurationSuite.java b/core/src/test/java/org/apache/spark/util/SerializableConfigurationSuite.java new file mode 100644 index 0000000000000..28d038a524c88 --- /dev/null +++ b/core/src/test/java/org/apache/spark/util/SerializableConfigurationSuite.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.util; + +import java.util.Arrays; + +import org.apache.hadoop.conf.Configuration; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import static org.junit.Assert.assertEquals; + + +public class SerializableConfigurationSuite { + private transient JavaSparkContext sc; + + @Before + public void setUp() { + sc = new JavaSparkContext("local", "SerializableConfigurationSuite"); + } + + @After + public void tearDown() { + sc.stop(); + sc = null; + } + + @Test + public void testSerializableConfiguration() { + JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2); + Configuration hadoopConfiguration = new Configuration(false); + hadoopConfiguration.set("test.property", "value"); + SerializableConfiguration scs = new SerializableConfiguration(hadoopConfiguration); + SerializableConfiguration actual = rdd.map(val -> scs).collect().get(0); + assertEquals("value", actual.value().get("test.property")); + } +} diff --git a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java index c6aa623560d57..43977717f6c97 100644 --- a/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java +++ b/core/src/test/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorterSuite.java @@ -222,7 +222,7 @@ public void testSortingEmptyArrays() throws Exception { public void testSortTimeMetric() throws Exception { final UnsafeExternalSorter sorter = newSorter(); long prevSortTime = sorter.getSortTimeNanos(); - assertEquals(prevSortTime, 0); + assertEquals(0, prevSortTime); sorter.insertRecord(null, 0, 0, 0, false); sorter.spill(); @@ -230,11 +230,14 @@ public void testSortTimeMetric() throws Exception { prevSortTime = sorter.getSortTimeNanos(); 
sorter.spill(); // no sort needed - assertEquals(sorter.getSortTimeNanos(), prevSortTime); + assertEquals(prevSortTime, sorter.getSortTimeNanos()); sorter.insertRecord(null, 0, 0, 0, false); UnsafeSorterIterator iter = sorter.getSortedIterator(); assertThat(sorter.getSortTimeNanos(), greaterThan(prevSortTime)); + + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); } @Test @@ -510,6 +513,8 @@ public void testGetIterator() throws Exception { verifyIntIterator(sorter.getIterator(79), 79, 300); verifyIntIterator(sorter.getIterator(139), 139, 300); verifyIntIterator(sorter.getIterator(279), 279, 300); + sorter.cleanupResources(); + assertSpillFilesWereCleanedUp(); } @Test diff --git a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java index 62a0b85915efc..5ce7937c03de2 100644 --- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java +++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java @@ -17,7 +17,10 @@ package test.org.apache.spark; +import java.util.Map; + import org.apache.spark.TaskContext; +import org.apache.spark.resource.ResourceInformation; import org.apache.spark.util.TaskCompletionListener; import org.apache.spark.util.TaskFailureListener; @@ -40,7 +43,9 @@ public static void test() { tc.stageId(); tc.stageAttemptNumber(); tc.taskAttemptId(); + // this returns a scala Map, so make sure the JMap version give a java type back tc.resources(); + Map resources = tc.resourcesJMap(); tc.taskMetrics(); tc.taskMemoryManager(); tc.getLocalProperties(); diff --git a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala index 435665d8a1ce2..a75cf3f0381df 100644 --- a/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/AccumulatorSuite.scala @@ -126,7 +126,7 @@ private[spark] object AccumulatorSuite 
{ sc.addSparkListener(listener) testBody // wait until all events have been processed before proceeding to assert things - sc.listenerBus.waitUntilEmpty(10 * 1000) + sc.listenerBus.waitUntilEmpty() val accums = listener.getCompletedStageInfos.flatMap(_.accumulables.values) val isSet = accums.exists { a => a.name == Some(PEAK_EXECUTION_MEMORY) && a.value.exists(_.asInstanceOf[Long] > 0L) diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 3a43f1a033da1..6a108a55045ee 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -24,6 +24,7 @@ import scala.reflect.ClassTag import com.google.common.io.ByteStreams import org.apache.hadoop.fs.Path +import org.apache.spark.internal.config.CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME import org.apache.spark.internal.config.UI._ import org.apache.spark.io.CompressionCodec import org.apache.spark.rdd._ @@ -584,7 +585,7 @@ object CheckpointSuite { } } -class CheckpointCompressionSuite extends SparkFunSuite with LocalSparkContext { +class CheckpointStorageSuite extends SparkFunSuite with LocalSparkContext { test("checkpoint compression") { withTempDir { checkpointDir => @@ -618,4 +619,27 @@ class CheckpointCompressionSuite extends SparkFunSuite with LocalSparkContext { assert(rdd.collect().toSeq === (1 to 20)) } } + + test("cache checkpoint preferred location") { + withTempDir { checkpointDir => + val conf = new SparkConf() + .set(CACHE_CHECKPOINT_PREFERRED_LOCS_EXPIRE_TIME.key, "10") + .set(UI_ENABLED.key, "false") + sc = new SparkContext("local", "test", conf) + sc.setCheckpointDir(checkpointDir.toString) + val rdd = sc.makeRDD(1 to 20, numSlices = 1) + rdd.checkpoint() + assert(rdd.collect().toSeq === (1 to 20)) + + // Verify that RDD is checkpointed + assert(rdd.firstParent.isInstanceOf[ReliableCheckpointRDD[_]]) + val checkpointedRDD = 
rdd.firstParent.asInstanceOf[ReliableCheckpointRDD[_]] + val partiton = checkpointedRDD.partitions(0) + assert(!checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) + + val preferredLoc = checkpointedRDD.preferredLocations(partiton) + assert(checkpointedRDD.cachedPreferredLocations.asMap.containsKey(partiton)) + assert(preferredLoc == checkpointedRDD.cachedPreferredLocations.get(partiton)) + } + } } diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala index 6a30a1d32f8c6..92ed24408384f 100644 --- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala @@ -97,7 +97,7 @@ abstract class ContextCleanerSuiteBase(val shuffleManager: Class[_] = classOf[So } /** Run GC and make sure it actually has run */ - protected def runGC() { + protected def runGC(): Unit = { val weakRef = new WeakReference(new Object()) val startTimeNs = System.nanoTime() System.gc() // Make a best effort to run the garbage collection. It *usually* runs GC. @@ -406,7 +406,7 @@ class CleanerTester( sc.cleaner.get.attachListener(cleanerListener) /** Assert that all the stuff has been cleaned up */ - def assertCleanup()(implicit waitTimeout: PatienceConfiguration.Timeout) { + def assertCleanup()(implicit waitTimeout: PatienceConfiguration.Timeout): Unit = { try { eventually(waitTimeout, interval(100.milliseconds)) { assert(isAllCleanedUp, @@ -419,7 +419,7 @@ class CleanerTester( } /** Verify that RDDs, shuffles, etc. occupy resources */ - private def preCleanupValidate() { + private def preCleanupValidate(): Unit = { assert(rddIds.nonEmpty || shuffleIds.nonEmpty || broadcastIds.nonEmpty || checkpointIds.nonEmpty, "Nothing to cleanup") @@ -465,7 +465,7 @@ class CleanerTester( * Verify that RDDs, shuffles, etc. do not occupy resources. 
Tests multiple times as there is * as there is not guarantee on how long it will take clean up the resources. */ - private def postCleanupValidate() { + private def postCleanupValidate(): Unit = { // Verify the RDDs have been persisted and blocks are present rddIds.foreach { rddId => assert( diff --git a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala index a5bdc95790722..1d3e28b39548f 100644 --- a/core/src/test/scala/org/apache/spark/DebugFilesystem.scala +++ b/core/src/test/scala/org/apache/spark/DebugFilesystem.scala @@ -21,7 +21,6 @@ import java.io.{FileDescriptor, InputStream} import java.lang import java.nio.ByteBuffer -import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.hadoop.fs._ diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index aad20545bafbe..3f309819065be 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark +import org.scalatest.Assertions._ import org.scalatest.Matchers import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.time.{Millis, Span} @@ -77,7 +78,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex test("simple groupByKey") { sc = new SparkContext(clusterUrl, "test") - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 5) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 5) val groups = pairs.groupByKey(5).collect() assert(groups.size === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 @@ -339,6 +340,21 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex } } + test("reference partitions inside a task") { + // Run a simple job which just makes sure there is no failure if we touch 
rdd.partitions + // inside a task. This requires the stateLock to be serializable. This is very convoluted + // use case, it's just a check for backwards-compatibility after the fix for SPARK-28917. + sc = new SparkContext("local-cluster[1,1,1024]", "test") + val rdd1 = sc.parallelize(1 to 10, 1) + val rdd2 = rdd1.map { x => x + 1} + // ensure we can force computation of rdd2.dependencies inside a task. Just touching + // it will force computation and touching the stateLock. The check for null is to just + // to make sure that we've setup our test correctly, and haven't precomputed dependencies + // in the driver + val dependencyComputeCount = rdd1.map { x => if (rdd2.dependencies == null) 1 else 0}.sum() + assert(dependencyComputeCount > 0) + } + } object DistributedSuite { diff --git a/core/src/test/scala/org/apache/spark/DriverSuite.scala b/core/src/test/scala/org/apache/spark/DriverSuite.scala index 182f28c5cce54..f58777584d0ae 100644 --- a/core/src/test/scala/org/apache/spark/DriverSuite.scala +++ b/core/src/test/scala/org/apache/spark/DriverSuite.scala @@ -50,7 +50,7 @@ class DriverSuite extends SparkFunSuite with TimeLimits { * sys.exit() after finishing. 
*/ object DriverWithoutCleanup { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf val sc = new SparkContext(args(0), "DriverWithoutCleanup", conf) diff --git a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala index 07fb323cfc355..8fa33f4915ea4 100644 --- a/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExecutorAllocationManagerSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark +import java.util.concurrent.TimeUnit + import scala.collection.mutable import org.mockito.ArgumentMatchers.{any, eq => meq} @@ -27,6 +29,8 @@ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config import org.apache.spark.internal.config.Tests.TEST_SCHEDULE_INTERVAL import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceProfile, ResourceProfileBuilder, ResourceProfileManager, TaskResourceRequests} +import org.apache.spark.resource.ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.{Clock, ManualClock, SystemClock} @@ -42,6 +46,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { private val managers = new mutable.ListBuffer[ExecutorAllocationManager]() private var listenerBus: LiveListenerBus = _ private var client: ExecutorAllocationClient = _ + private val clock = new SystemClock() + private var rpManager: ResourceProfileManager = _ + override def beforeEach(): Unit = { super.beforeEach() @@ -64,7 +71,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { private def post(event: SparkListenerEvent): Unit = { listenerBus.post(event) - listenerBus.waitUntilEmpty(1000) + listenerBus.waitUntilEmpty() } 
test("initialize dynamic allocation in SparkContext") { @@ -105,65 +112,257 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("starting state") { val manager = createManager(createConf()) - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) assert(executorsPendingToRemove(manager).isEmpty) assert(addTime(manager) === ExecutorAllocationManager.NOT_SET) } - test("add executors") { + test("add executors default profile") { val manager = createManager(createConf(1, 10, 1)) post(SparkListenerStageSubmitted(createStageInfo(0, 1000))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + // Keep adding until the limit is reached + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 8) + // reached the limit of 10 + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) 
+ assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + + // Register previously requested executors + onExecutorAddedDefaultProfile(manager, "first") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + onExecutorAddedDefaultProfile(manager, "first") // duplicates should not count + onExecutorAddedDefaultProfile(manager, "second") + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + + // Try adding again + // This should still fail because the number pending + running is still at the limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + } + + test("add executors multiple profiles") { + val manager = createManager(createConf(1, 10, 1)) + post(SparkListenerStageSubmitted(createStageInfo(0, 1000, rp = defaultProfile))) + val rp1 = new ResourceProfileBuilder() + val execReqs = new ExecutorResourceRequests().cores(4).resource("gpu", 4) + val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1) + rp1.require(execReqs).require(taskReqs) + val rprof1 = rp1.build + rpManager.addResourceProfile(rprof1) + 
post(SparkListenerStageSubmitted(createStageInfo(1, 1000, rp = rprof1))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + // Keep adding until the limit is reached - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 2) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 4) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 4) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 8) - assert(addExecutors(manager) === 2) // reached the limit of 10 - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 1) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + assert(numExecutorsToAdd(manager, rprof1) === 2) + assert(numExecutorsTarget(manager, rprof1.id) === 2) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + 
assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + assert(numExecutorsToAdd(manager, rprof1) === 4) + assert(numExecutorsTarget(manager, rprof1.id) === 4) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 8) + // reached the limit of 10 + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + assert(numExecutorsToAdd(manager, rprof1) === 8) + assert(numExecutorsTarget(manager, rprof1.id) === 8) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) // Register previously requested executors - onExecutorAdded(manager, "first") - assert(numExecutorsTarget(manager) === 10) - onExecutorAdded(manager, "second") - onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - assert(numExecutorsTarget(manager) === 10) - onExecutorAdded(manager, "first") // duplicates should not count - onExecutorAdded(manager, "second") - assert(numExecutorsTarget(manager) === 10) + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAdded(manager, "firstrp1", 
rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAdded(manager, "secondrp1", rprof1) + onExecutorAdded(manager, "thirdrp1", rprof1) + onExecutorAdded(manager, "fourthrp1", rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + onExecutorAddedDefaultProfile(manager, "first") // duplicates should not count + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAdded(manager, "firstrp1", rprof1) + onExecutorAdded(manager, "secondrp1", rprof1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsTarget(manager, rprof1.id) === 10) // Try adding again // This should still fail because the number pending + running is still at the limit - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + assert(addExecutorsToTarget(manager, updatesNeeded, rprof1) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + 
assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(numExecutorsToAdd(manager, rprof1) === 1) + assert(numExecutorsTarget(manager, rprof1.id) === 10) + } + + test("remove executors multiple profiles") { + val manager = createManager(createConf(5, 10, 5)) + val rp1 = new ResourceProfileBuilder() + val execReqs = new ExecutorResourceRequests().cores(4).resource("gpu", 4) + val taskReqs = new TaskResourceRequests().cpus(1).resource("gpu", 1) + rp1.require(execReqs).require(taskReqs) + val rprof1 = rp1.build + val rp2 = new ResourceProfileBuilder() + val execReqs2 = new ExecutorResourceRequests().cores(1) + val taskReqs2 = new TaskResourceRequests().cpus(1) + rp2.require(execReqs2).require(taskReqs2) + val rprof2 = rp2.build + rpManager.addResourceProfile(rprof1) + rpManager.addResourceProfile(rprof2) + post(SparkListenerStageSubmitted(createStageInfo(1, 10, rp = rprof1))) + post(SparkListenerStageSubmitted(createStageInfo(2, 10, rp = rprof2))) + + (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id, rprof1) } + (11 to 20).map(_.toString).foreach { id => onExecutorAdded(manager, id, rprof2) } + (21 to 30).map(_.toString).foreach { id => onExecutorAdded(manager, id, defaultProfile) } + + // Keep removing until the limit is reached + assert(executorsPendingToRemove(manager).isEmpty) + assert(removeExecutor(manager, "1", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 1) + assert(executorsPendingToRemove(manager).contains("1")) + assert(removeExecutor(manager, "11", rprof2.id)) + assert(removeExecutor(manager, "2", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 3) + assert(executorsPendingToRemove(manager).contains("2")) + assert(executorsPendingToRemove(manager).contains("11")) + assert(removeExecutor(manager, "21", defaultProfile.id)) + assert(removeExecutor(manager, "3", rprof1.id)) + assert(removeExecutor(manager, "4", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 6) + 
assert(executorsPendingToRemove(manager).contains("21")) + assert(executorsPendingToRemove(manager).contains("3")) + assert(executorsPendingToRemove(manager).contains("4")) + assert(removeExecutor(manager, "5", rprof1.id)) + assert(!removeExecutor(manager, "6", rprof1.id)) // reached the limit of 5 + assert(executorsPendingToRemove(manager).size === 7) + assert(executorsPendingToRemove(manager).contains("5")) + assert(!executorsPendingToRemove(manager).contains("6")) + + // Kill executors previously requested to remove + onExecutorRemoved(manager, "1") + assert(executorsPendingToRemove(manager).size === 6) + assert(!executorsPendingToRemove(manager).contains("1")) + onExecutorRemoved(manager, "2") + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 4) + assert(!executorsPendingToRemove(manager).contains("2")) + assert(!executorsPendingToRemove(manager).contains("3")) + onExecutorRemoved(manager, "2") // duplicates should not count + onExecutorRemoved(manager, "3") + assert(executorsPendingToRemove(manager).size === 4) + onExecutorRemoved(manager, "4") + onExecutorRemoved(manager, "5") + assert(executorsPendingToRemove(manager).size === 2) + assert(executorsPendingToRemove(manager).contains("11")) + assert(executorsPendingToRemove(manager).contains("21")) + + // Try removing again + // This should still fail because the number pending + running is still at the limit + assert(!removeExecutor(manager, "7", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 2) + assert(!removeExecutor(manager, "8", rprof1.id)) + assert(executorsPendingToRemove(manager).size === 2) + + // make sure rprof2 has the same min limit of 5 + assert(removeExecutor(manager, "12", rprof2.id)) + assert(removeExecutor(manager, "13", rprof2.id)) + assert(removeExecutor(manager, "14", rprof2.id)) + assert(removeExecutor(manager, "15", rprof2.id)) + assert(!removeExecutor(manager, "16", rprof2.id)) // reached the limit of 5 + 
assert(executorsPendingToRemove(manager).size === 6) + assert(!executorsPendingToRemove(manager).contains("16")) + onExecutorRemoved(manager, "11") + onExecutorRemoved(manager, "12") + onExecutorRemoved(manager, "13") + onExecutorRemoved(manager, "14") + onExecutorRemoved(manager, "15") + assert(executorsPendingToRemove(manager).size === 1) } def testAllocationRatio(cores: Int, divisor: Double, expected: Int): Unit = { + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] val conf = createConf(3, 15) .set(config.DYN_ALLOCATION_EXECUTOR_ALLOCATION_RATIO, divisor) .set(config.EXECUTOR_CORES, cores) val manager = createManager(conf) post(SparkListenerStageSubmitted(createStageInfo(0, 20))) for (i <- 0 to 5) { - addExecutors(manager) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) } - assert(numExecutorsTarget(manager) === expected) + assert(numExecutorsTargetForDefaultProfileId(manager) === expected) } test("executionAllocationRatio is correctly handled") { @@ -182,83 +381,249 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(0, 10, 0)) post(SparkListenerStageSubmitted(createStageInfo(0, 5))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + // Verify that we're capped at number of tasks in the stage - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 5) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + 
assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task doesn't affect the target post(SparkListenerStageSubmitted(createStageInfo(1, 3))) post(SparkListenerExecutorAdded( 0L, "executor-1", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) post(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 6) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 8) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 6) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + + 
doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that re-running a task doesn't blow things up post(SparkListenerStageSubmitted(createStageInfo(2, 3))) post(SparkListenerTaskStart(2, 0, createTaskInfo(0, 0, "executor-1"))) post(SparkListenerTaskStart(2, 0, createTaskInfo(1, 0, "executor-1"))) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 9) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 10) - assert(numExecutorsToAdd(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 9) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task once we're at our limit doesn't blow things up post(SparkListenerTaskStart(2, 0, createTaskInfo(0, 1, "executor-1"))) - assert(addExecutors(manager) === 0) - assert(numExecutorsTarget(manager) === 10) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 10) } test("add executors when speculative 
tasks added") { val manager = createManager(createConf(0, 10, 0)) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + post(SparkListenerStageSubmitted(createStageInfo(1, 2))) // Verify that we're capped at number of tasks including the speculative ones in the stage post(SparkListenerSpeculativeTaskSubmitted(1)) - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) post(SparkListenerSpeculativeTaskSubmitted(1)) post(SparkListenerSpeculativeTaskSubmitted(1)) - post(SparkListenerStageSubmitted(createStageInfo(1, 2))) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 5) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a task doesn't affect the target 
post(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0, "executor-1"))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) // Verify that running a speculative task doesn't affect the target post(SparkListenerTaskStart(1, 0, createTaskInfo(1, 0, "executor-2", true))) - assert(numExecutorsTarget(manager) === 5) - assert(addExecutors(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + } + + test("SPARK-30511 remove executors when speculative tasks end") { + val clock = new ManualClock() + val stage = createStageInfo(0, 40) + val conf = createConf(0, 10, 0).set(config.EXECUTOR_CORES, 4) + val manager = createManager(conf, clock = clock) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + post(SparkListenerStageSubmitted(stage)) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 3) + doUpdateRequest(manager, updatesNeeded.toMap, 
clock.getTimeMillis()) + + (0 to 9).foreach(execId => onExecutorAddedDefaultProfile(manager, execId.toString)) + (0 to 39).map { i => createTaskInfo(i, i, executorId = s"${i / 4}")}.foreach { + info => post(SparkListenerTaskStart(0, 0, info)) + } + assert(numExecutorsTarget(manager, defaultProfile.id) === 10) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 10) + + // 30 tasks (0 - 29) finished + (0 to 29).map { i => createTaskInfo(i, i, executorId = s"${i / 4}")}.foreach { + info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null)) } + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 3) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 3) + (0 to 6).foreach { i => assert(removeExecutorDefaultProfile(manager, i.toString))} + (0 to 6).foreach { i => onExecutorRemoved(manager, i.toString)} + + // 10 speculative tasks (30 - 39) launch for the remaining tasks + (30 to 39).foreach { _ => post(SparkListenerSpeculativeTaskSubmitted(0))} + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTarget(manager, defaultProfile.id) == 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) + (10 to 12).foreach(execId => onExecutorAddedDefaultProfile(manager, execId.toString)) + (40 to 49).map { i => + createTaskInfo(taskId = i, taskIndex = i - 10, executorId = s"${i / 4}", speculative = true)} + .foreach { info => post(SparkListenerTaskStart(0, 0, info))} + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + // At this point, we still have 6 executors running + 
assert(numExecutorsTarget(manager, defaultProfile.id) == 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) + + // 6 speculative tasks (40 - 45) finish before the original tasks, with 4 speculative remaining + (40 to 45).map { i => + createTaskInfo(taskId = i, taskIndex = i - 10, executorId = s"${i / 4}", speculative = true)} + .foreach { + info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null))} + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 4) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 4) + assert(removeExecutorDefaultProfile(manager, "10")) + onExecutorRemoved(manager, "10") + // At this point, we still have 5 executors running: ["7", "8", "9", "11", "12"] + + // 6 original tasks (30 - 35) are intentionally killed + (30 to 35).map { i => + createTaskInfo(i, i, executorId = s"${i / 4}")} + .foreach { info => post( + SparkListenerTaskEnd(0, 0, null, TaskKilled("test"), info, new ExecutorMetrics, null))} + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 2) + (7 to 8).foreach { i => assert(removeExecutorDefaultProfile(manager, i.toString))} + (7 to 8).foreach { i => onExecutorRemoved(manager, i.toString)} + // At this point, we still have 3 executors running: ["9", "11", "12"] + + // Task 36 finishes before the speculative task 46, task 46 killed + post(SparkListenerTaskEnd(0, 0, null, Success, + createTaskInfo(36, 36, executorId = "9"), new ExecutorMetrics, null)) + post(SparkListenerTaskEnd(0, 0, null, TaskKilled("test"), + createTaskInfo(46, 36, executorId = "11", speculative = true), new ExecutorMetrics, null)) + + // We should have 3 original tasks (index 37, 38, 39) 
running, with corresponding 3 speculative + // tasks running. Target lowers to 2, but still hold 3 executors ["9", "11", "12"] + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 2) + // At this point, we still have 3 executors running: ["9", "11", "12"] + + // Task 37 and 47 succeed at the same time + post(SparkListenerTaskEnd(0, 0, null, Success, + createTaskInfo(37, 37, executorId = "9"), new ExecutorMetrics, null)) + post(SparkListenerTaskEnd(0, 0, null, Success, + createTaskInfo(47, 37, executorId = "11", speculative = true), new ExecutorMetrics, null)) + + // We should have 2 original tasks (index 38, 39) running, with corresponding 2 speculative + // tasks running + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 1) + assert(removeExecutorDefaultProfile(manager, "11")) + onExecutorRemoved(manager, "11") + // At this point, we still have 2 executors running: ["9", "12"] + + // Task 38 fails and task 49 fails, new speculative task 50 is submitted to speculate on task 39 + post(SparkListenerTaskEnd(0, 0, null, UnknownReason, + createTaskInfo(38, 38, executorId = "9"), new ExecutorMetrics, null)) + post(SparkListenerTaskEnd(0, 0, null, UnknownReason, + createTaskInfo(49, 39, executorId = "12", speculative = true), new ExecutorMetrics, null)) + post(SparkListenerSpeculativeTaskSubmitted(0)) + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + // maxNeeded = 1, allocate one more to satisfy speculation locality requirement + assert(numExecutorsTarget(manager, defaultProfile.id) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, 
defaultProfile) == 2) + post(SparkListenerTaskStart(0, 0, + createTaskInfo(50, 39, executorId = "12", speculative = true))) + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 1) + + // Task 39 and 48 succeed, task 50 killed + post(SparkListenerTaskEnd(0, 0, null, Success, + createTaskInfo(39, 39, executorId = "9"), new ExecutorMetrics, null)) + post(SparkListenerTaskEnd(0, 0, null, Success, + createTaskInfo(48, 38, executorId = "12", speculative = true), new ExecutorMetrics, null)) + post(SparkListenerTaskEnd(0, 0, null, TaskKilled("test"), + createTaskInfo(50, 39, executorId = "12", speculative = true), new ExecutorMetrics, null)) + post(SparkListenerStageCompleted(stage)) + clock.advance(1000) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTarget(manager, defaultProfile.id) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 0) + assert(removeExecutorDefaultProfile(manager, "9")) + onExecutorRemoved(manager, "9") + assert(removeExecutorDefaultProfile(manager, "12")) + onExecutorRemoved(manager, "12") } test("properly handle task end events from completed stages") { @@ -279,43 +644,49 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { post(SparkListenerStageCompleted(stage)) // There are still two tasks that belong to the zombie stage running. - assert(totalRunningTasks(manager) === 2) + assert(totalRunningTasksPerResourceProfile(manager) === 2) // submit another attempt for the stage. 
We count completions from the first zombie attempt val stageAttempt1 = createStageInfo(stage.stageId, 5, attemptId = 1) post(SparkListenerStageSubmitted(stageAttempt1)) post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo1, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 1) + assert(totalRunningTasksPerResourceProfile(manager) === 1) val attemptTaskInfo1 = createTaskInfo(3, 0, "executor-1") val attemptTaskInfo2 = createTaskInfo(4, 1, "executor-1") post(SparkListenerTaskStart(0, 1, attemptTaskInfo1)) post(SparkListenerTaskStart(0, 1, attemptTaskInfo2)) - assert(totalRunningTasks(manager) === 3) + assert(totalRunningTasksPerResourceProfile(manager) === 3) post(SparkListenerTaskEnd(0, 1, null, Success, attemptTaskInfo1, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 2) + assert(totalRunningTasksPerResourceProfile(manager) === 2) post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo2, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 1) + assert(totalRunningTasksPerResourceProfile(manager) === 1) post(SparkListenerTaskEnd(0, 1, null, Success, attemptTaskInfo2, new ExecutorMetrics, null)) - assert(totalRunningTasks(manager) === 0) + assert(totalRunningTasksPerResourceProfile(manager) === 0) } testRetry("cancel pending executors when no longer needed") { val manager = createManager(createConf(0, 10, 0)) post(SparkListenerStageSubmitted(createStageInfo(2, 5))) - assert(numExecutorsTarget(manager) === 0) - assert(numExecutorsToAdd(manager) === 1) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 3) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + 
assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) val task1Info = createTaskInfo(0, 0, "executor-1") post(SparkListenerTaskStart(2, 0, task1Info)) - assert(numExecutorsToAdd(manager) === 4) - assert(addExecutors(manager) === 2) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) val task2Info = createTaskInfo(1, 0, "executor-1") post(SparkListenerTaskStart(2, 0, task2Info)) @@ -331,22 +702,21 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("remove executors") { val manager = createManager(createConf(5, 10, 5)) - (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) } + (1 to 10).map(_.toString).foreach { id => onExecutorAddedDefaultProfile(manager, id) } // Keep removing until the limit is reached assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutor(manager, "1")) + assert(removeExecutorDefaultProfile(manager, "1")) assert(executorsPendingToRemove(manager).size === 1) assert(executorsPendingToRemove(manager).contains("1")) - assert(removeExecutor(manager, "2")) - assert(removeExecutor(manager, "3")) + assert(removeExecutorDefaultProfile(manager, "2")) + assert(removeExecutorDefaultProfile(manager, "3")) assert(executorsPendingToRemove(manager).size === 3) assert(executorsPendingToRemove(manager).contains("2")) assert(executorsPendingToRemove(manager).contains("3")) - assert(executorsPendingToRemove(manager).size === 3) - 
assert(removeExecutor(manager, "4")) - assert(removeExecutor(manager, "5")) - assert(!removeExecutor(manager, "6")) // reached the limit of 5 + assert(removeExecutorDefaultProfile(manager, "4")) + assert(removeExecutorDefaultProfile(manager, "5")) + assert(!removeExecutorDefaultProfile(manager, "6")) // reached the limit of 5 assert(executorsPendingToRemove(manager).size === 5) assert(executorsPendingToRemove(manager).contains("4")) assert(executorsPendingToRemove(manager).contains("5")) @@ -370,29 +740,29 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // Try removing again // This should still fail because the number pending + running is still at the limit - assert(!removeExecutor(manager, "7")) + assert(!removeExecutorDefaultProfile(manager, "7")) assert(executorsPendingToRemove(manager).isEmpty) - assert(!removeExecutor(manager, "8")) + assert(!removeExecutorDefaultProfile(manager, "8")) assert(executorsPendingToRemove(manager).isEmpty) } test("remove multiple executors") { val manager = createManager(createConf(5, 10, 5)) - (1 to 10).map(_.toString).foreach { id => onExecutorAdded(manager, id) } + (1 to 10).map(_.toString).foreach { id => onExecutorAddedDefaultProfile(manager, id) } // Keep removing until the limit is reached assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutors(manager, Seq("1")) === Seq("1")) + assert(removeExecutorsDefaultProfile(manager, Seq("1")) === Seq("1")) assert(executorsPendingToRemove(manager).size === 1) assert(executorsPendingToRemove(manager).contains("1")) - assert(removeExecutors(manager, Seq("2", "3")) === Seq("2", "3")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3")) === Seq("2", "3")) assert(executorsPendingToRemove(manager).size === 3) assert(executorsPendingToRemove(manager).contains("2")) assert(executorsPendingToRemove(manager).contains("3")) assert(executorsPendingToRemove(manager).size === 3) - assert(removeExecutor(manager, "4")) - assert(removeExecutors(manager, 
Seq("5")) === Seq("5")) - assert(!removeExecutor(manager, "6")) // reached the limit of 5 + assert(removeExecutorDefaultProfile(manager, "4")) + assert(removeExecutorsDefaultProfile(manager, Seq("5")) === Seq("5")) + assert(!removeExecutorDefaultProfile(manager, "6")) // reached the limit of 5 assert(executorsPendingToRemove(manager).size === 5) assert(executorsPendingToRemove(manager).contains("4")) assert(executorsPendingToRemove(manager).contains("5")) @@ -416,87 +786,100 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // Try removing again // This should still fail because the number pending + running is still at the limit - assert(!removeExecutor(manager, "7")) + assert(!removeExecutorDefaultProfile(manager, "7")) assert(executorsPendingToRemove(manager).isEmpty) - assert(removeExecutors(manager, Seq("8")) !== Seq("8")) + assert(removeExecutorsDefaultProfile(manager, Seq("8")) !== Seq("8")) assert(executorsPendingToRemove(manager).isEmpty) } - test ("Removing with various numExecutorsTarget condition") { + test ("Removing with various numExecutorsTargetForDefaultProfileId condition") { val manager = createManager(createConf(5, 12, 5)) post(SparkListenerStageSubmitted(createStageInfo(0, 8))) - // Remove when numExecutorsTarget is the same as the current number of executors - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - (1 to 8).foreach(execId => onExecutorAdded(manager, execId.toString)) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + // Remove when numExecutorsTargetForDefaultProfileId is the same as the current + // number of executors + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + (1 to 8).foreach(execId 
=> onExecutorAddedDefaultProfile(manager, execId.toString)) (1 to 8).map { i => createTaskInfo(i, i, s"$i") }.foreach { info => post(SparkListenerTaskStart(0, 0, info)) } assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 8) - assert(maxNumExecutorsNeeded(manager) == 8) - assert(!removeExecutor(manager, "1")) // won't work since numExecutorsTarget == numExecutors + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 8) + // won't work since numExecutorsTargetForDefaultProfileId == numExecutors + assert(!removeExecutorDefaultProfile(manager, "1")) - // Remove executors when numExecutorsTarget is lower than current number of executors + // Remove executors when numExecutorsTargetForDefaultProfileId is lower than + // current number of executors (1 to 3).map { i => createTaskInfo(i, i, s"$i") }.foreach { info => post(SparkListenerTaskEnd(0, 0, null, Success, info, new ExecutorMetrics, null)) } adjustRequestedExecutors(manager) assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 5) - assert(maxNumExecutorsNeeded(manager) == 5) - assert(removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3"))=== Seq("2", "3")) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 5) + assert(removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3"))=== Seq("2", "3")) onExecutorRemoved(manager, "1") onExecutorRemoved(manager, "2") onExecutorRemoved(manager, "3") - // numExecutorsTarget is lower than minNumExecutors + // numExecutorsTargetForDefaultProfileId is lower than minNumExecutors post(SparkListenerTaskEnd(0, 0, null, Success, createTaskInfo(4, 4, "4"), new ExecutorMetrics, null)) assert(manager.executorMonitor.executorCount === 5) - 
assert(numExecutorsTarget(manager) === 5) - assert(maxNumExecutorsNeeded(manager) == 4) - assert(!removeExecutor(manager, "4")) // lower limit - assert(addExecutors(manager) === 0) // upper limit + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) == 4) + assert(!removeExecutorDefaultProfile(manager, "4")) // lower limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) // upper limit } test ("interleaving add and remove") { val manager = createManager(createConf(5, 12, 5)) post(SparkListenerStageSubmitted(createStageInfo(0, 1000))) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + // Add a few executors - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - onExecutorAdded(manager, "1") - onExecutorAdded(manager, "2") - onExecutorAdded(manager, "3") - onExecutorAdded(manager, "4") - onExecutorAdded(manager, "5") - onExecutorAdded(manager, "6") - onExecutorAdded(manager, "7") - onExecutorAdded(manager, "8") + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "1") + onExecutorAddedDefaultProfile(manager, "2") + onExecutorAddedDefaultProfile(manager, "3") + onExecutorAddedDefaultProfile(manager, "4") + onExecutorAddedDefaultProfile(manager, "5") + onExecutorAddedDefaultProfile(manager, "6") + onExecutorAddedDefaultProfile(manager, "7") + onExecutorAddedDefaultProfile(manager, "8") assert(manager.executorMonitor.executorCount === 8) - assert(numExecutorsTarget(manager) === 8) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) // Remove when numTargetExecutors is equal to the current 
number of executors - assert(!removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3")) !== Seq("2", "3")) + assert(!removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3")) !== Seq("2", "3")) // Remove until limit - onExecutorAdded(manager, "9") - onExecutorAdded(manager, "10") - onExecutorAdded(manager, "11") - onExecutorAdded(manager, "12") + onExecutorAddedDefaultProfile(manager, "9") + onExecutorAddedDefaultProfile(manager, "10") + onExecutorAddedDefaultProfile(manager, "11") + onExecutorAddedDefaultProfile(manager, "12") assert(manager.executorMonitor.executorCount === 12) - assert(numExecutorsTarget(manager) === 8) + assert(numExecutorsTargetForDefaultProfileId(manager) === 8) - assert(removeExecutor(manager, "1")) - assert(removeExecutors(manager, Seq("2", "3", "4")) === Seq("2", "3", "4")) - assert(!removeExecutor(manager, "5")) // lower limit reached - assert(!removeExecutor(manager, "6")) + assert(removeExecutorDefaultProfile(manager, "1")) + assert(removeExecutorsDefaultProfile(manager, Seq("2", "3", "4")) === Seq("2", "3", "4")) + assert(!removeExecutorDefaultProfile(manager, "5")) // lower limit reached + assert(!removeExecutorDefaultProfile(manager, "6")) onExecutorRemoved(manager, "1") onExecutorRemoved(manager, "2") onExecutorRemoved(manager, "3") @@ -504,33 +887,36 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(manager.executorMonitor.executorCount === 8) // Add until limit - assert(!removeExecutor(manager, "7")) // still at lower limit + assert(!removeExecutorDefaultProfile(manager, "7")) // still at lower limit assert((manager, Seq("8")) !== Seq("8")) - onExecutorAdded(manager, "13") - onExecutorAdded(manager, "14") - onExecutorAdded(manager, "15") - onExecutorAdded(manager, "16") + onExecutorAddedDefaultProfile(manager, "13") + onExecutorAddedDefaultProfile(manager, "14") + onExecutorAddedDefaultProfile(manager, "15") + 
onExecutorAddedDefaultProfile(manager, "16") assert(manager.executorMonitor.executorCount === 12) // Remove succeeds again, now that we are no longer at the lower limit - assert(removeExecutors(manager, Seq("5", "6", "7")) === Seq("5", "6", "7")) - assert(removeExecutor(manager, "8")) + assert(removeExecutorsDefaultProfile(manager, Seq("5", "6", "7")) === Seq("5", "6", "7")) + assert(removeExecutorDefaultProfile(manager, "8")) assert(manager.executorMonitor.executorCount === 12) onExecutorRemoved(manager, "5") onExecutorRemoved(manager, "6") assert(manager.executorMonitor.executorCount === 10) - assert(numExecutorsToAdd(manager) === 4) + assert(numExecutorsToAddForDefaultProfile(manager) === 4) onExecutorRemoved(manager, "9") onExecutorRemoved(manager, "10") - assert(addExecutors(manager) === 4) // at upper limit - onExecutorAdded(manager, "17") - onExecutorAdded(manager, "18") + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) // at upper limit + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "17") + onExecutorAddedDefaultProfile(manager, "18") assert(manager.executorMonitor.executorCount === 10) - assert(addExecutors(manager) === 0) // still at upper limit - onExecutorAdded(manager, "19") - onExecutorAdded(manager, "20") + // still at upper limit + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 0) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + onExecutorAddedDefaultProfile(manager, "19") + onExecutorAddedDefaultProfile(manager, "20") assert(manager.executorMonitor.executorCount === 12) - assert(numExecutorsTarget(manager) === 12) + assert(numExecutorsTargetForDefaultProfileId(manager) === 12) } test("starting/canceling add timer") { @@ -541,7 +927,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(addTime(manager) === NOT_SET) onSchedulerBacklogged(manager) val firstAddTime = addTime(manager) - 
assert(firstAddTime === clock.getTimeMillis + schedulerBacklogTimeout * 1000) + assert(firstAddTime === clock.nanoTime() + TimeUnit.SECONDS.toNanos(schedulerBacklogTimeout)) clock.advance(100L) onSchedulerBacklogged(manager) assert(addTime(manager) === firstAddTime) // timer is already started @@ -555,7 +941,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(addTime(manager) === NOT_SET) onSchedulerBacklogged(manager) val secondAddTime = addTime(manager) - assert(secondAddTime === clock.getTimeMillis + schedulerBacklogTimeout * 1000) + assert(secondAddTime === clock.nanoTime() + TimeUnit.SECONDS.toNanos(schedulerBacklogTimeout)) clock.advance(100L) onSchedulerBacklogged(manager) assert(addTime(manager) === secondAddTime) // timer is already started @@ -568,22 +954,22 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(0, 20, 0), clock = clock) // No events - we should not be adding or removing - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(100L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(1000L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) clock.advance(10000L) schedule(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) assert(executorsPendingToRemove(manager).isEmpty) } @@ -596,43 +982,43 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { 
onSchedulerBacklogged(manager) clock.advance(schedulerBacklogTimeout * 1000 / 2) schedule(manager) - assert(numExecutorsTarget(manager) === 0) // timer not exceeded yet + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) // timer not exceeded yet clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 1) // first timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // first timer exceeded clock.advance(sustainedSchedulerBacklogTimeout * 1000 / 2) schedule(manager) - assert(numExecutorsTarget(manager) === 1) // second timer not exceeded yet + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // second timer not exceeded yet clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 1 + 2) // second timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1 + 2) // second timer exceeded clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 1 + 2 + 4) // third timer exceeded + assert(numExecutorsTargetForDefaultProfileId(manager) === 1 + 2 + 4) // third timer exceeded // Scheduler queue drained onSchedulerQueueEmpty(manager) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7) // timer is canceled + assert(numExecutorsTargetForDefaultProfileId(manager) === 7) // timer is canceled clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7) // Scheduler queue backlogged again onSchedulerBacklogged(manager) clock.advance(schedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1) // timer restarted + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1) // timer restarted clock.advance(sustainedSchedulerBacklogTimeout 
* 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1 + 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1 + 2) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 7 + 1 + 2 + 4) + assert(numExecutorsTargetForDefaultProfileId(manager) === 7 + 1 + 2 + 4) clock.advance(sustainedSchedulerBacklogTimeout * 1000) schedule(manager) - assert(numExecutorsTarget(manager) === 20) // limit reached + assert(numExecutorsTargetForDefaultProfileId(manager) === 20) // limit reached } test("mock polling loop remove behavior") { @@ -640,9 +1026,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(1, 20, 1), clock = clock) // Remove idle executors on timeout - onExecutorAdded(manager, "executor-1") - onExecutorAdded(manager, "executor-2") - onExecutorAdded(manager, "executor-3") + onExecutorAddedDefaultProfile(manager, "executor-1") + onExecutorAddedDefaultProfile(manager, "executor-2") + onExecutorAddedDefaultProfile(manager, "executor-3") assert(executorsPendingToRemove(manager).isEmpty) // idle threshold not reached yet @@ -658,10 +1044,10 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { assert(executorsPendingToRemove(manager).size === 2) // limit reached (1 executor remaining) // Mark a subset as busy - only idle executors should be removed - onExecutorAdded(manager, "executor-4") - onExecutorAdded(manager, "executor-5") - onExecutorAdded(manager, "executor-6") - onExecutorAdded(manager, "executor-7") + onExecutorAddedDefaultProfile(manager, "executor-4") + onExecutorAddedDefaultProfile(manager, "executor-5") + onExecutorAddedDefaultProfile(manager, "executor-6") + onExecutorAddedDefaultProfile(manager, "executor-7") assert(manager.executorMonitor.executorCount === 7) assert(executorsPendingToRemove(manager).size === 2) // 2 pending to be removed onExecutorBusy(manager, "executor-4") @@ -726,23 +1112,31 @@ class 
ExecutorAllocationManagerSuite extends SparkFunSuite { val stage1 = createStageInfo(0, 1000) post(SparkListenerStageSubmitted(stage1)) - assert(addExecutors(manager) === 1) - assert(addExecutors(manager) === 2) - assert(addExecutors(manager) === 4) - assert(addExecutors(manager) === 8) - assert(numExecutorsTarget(manager) === 15) + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] + + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 4) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 8) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 15) (0 until 15).foreach { i => - onExecutorAdded(manager, s"executor-$i") + onExecutorAddedDefaultProfile(manager, s"executor-$i") } assert(manager.executorMonitor.executorCount === 15) post(SparkListenerStageCompleted(stage1)) adjustRequestedExecutors(manager) - assert(numExecutorsTarget(manager) === 0) + assert(numExecutorsTargetForDefaultProfileId(manager) === 0) post(SparkListenerStageSubmitted(createStageInfo(1, 1000))) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 16) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 16) } test("avoid ramp down initial executors until first job is submitted") { @@ -750,19 +1144,19 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(2, 5, 
3), clock = clock) // Verify the initial number of executors - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) schedule(manager) // Verify whether the initial number of executors is kept with no pending tasks - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) post(SparkListenerStageSubmitted(createStageInfo(1, 2))) clock.advance(100L) - assert(maxNumExecutorsNeeded(manager) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 2) schedule(manager) // Verify that current number of executors should be ramp down when first job is submitted - assert(numExecutorsTarget(manager) === 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) } test("avoid ramp down initial executors until idle executor is timeout") { @@ -770,20 +1164,20 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val manager = createManager(createConf(2, 5, 3), clock = clock) // Verify the initial number of executors - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) schedule(manager) // Verify the initial number of executors is kept when no pending tasks - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) (0 until 3).foreach { i => - onExecutorAdded(manager, s"executor-$i") + onExecutorAddedDefaultProfile(manager, s"executor-$i") } clock.advance(executorIdleTimeout * 1000) - assert(maxNumExecutorsNeeded(manager) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 0) schedule(manager) - // Verify executor is timeout,numExecutorsTarget is recalculated - assert(numExecutorsTarget(manager) === 2) + // Verify executor is timeout,numExecutorsTargetForDefaultProfileId is recalculated + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) } test("get pending task number and related locality 
preference") { @@ -799,7 +1193,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val stageInfo1 = createStageInfo(1, 5, localityPreferences1) post(SparkListenerStageSubmitted(stageInfo1)) - assert(localityAwareTasks(manager) === 3) + assert(localityAwareTasksForDefaultProfile(manager) === 3) + val hostToLocal = hostToLocalTaskCount(manager) assert(hostToLocalTaskCount(manager) === Map("host1" -> 2, "host2" -> 3, "host3" -> 2, "host4" -> 2)) @@ -811,67 +1206,76 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { val stageInfo2 = createStageInfo(2, 3, localityPreferences2) post(SparkListenerStageSubmitted(stageInfo2)) - assert(localityAwareTasks(manager) === 5) + assert(localityAwareTasksForDefaultProfile(manager) === 5) assert(hostToLocalTaskCount(manager) === Map("host1" -> 2, "host2" -> 4, "host3" -> 4, "host4" -> 3, "host5" -> 2)) post(SparkListenerStageCompleted(stageInfo1)) - assert(localityAwareTasks(manager) === 2) + assert(localityAwareTasksForDefaultProfile(manager) === 2) assert(hostToLocalTaskCount(manager) === Map("host2" -> 1, "host3" -> 2, "host4" -> 1, "host5" -> 2)) } - test("SPARK-8366: maxNumExecutorsNeeded should properly handle failed tasks") { + test("SPARK-8366: maxNumExecutorsNeededPerResourceProfile should properly handle failed tasks") { val manager = createManager(createConf()) - assert(maxNumExecutorsNeeded(manager) === 0) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 0) post(SparkListenerStageSubmitted(createStageInfo(0, 1))) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) val taskInfo = createTaskInfo(1, 1, "executor-1") post(SparkListenerTaskStart(0, 0, taskInfo)) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) // If the task is failed, we expect it to be resubmitted later. 
val taskEndReason = ExceptionFailure(null, null, null, null, None) post(SparkListenerTaskEnd(0, 0, null, taskEndReason, taskInfo, new ExecutorMetrics, null)) - assert(maxNumExecutorsNeeded(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) } test("reset the state of allocation manager") { val manager = createManager(createConf()) - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) + + val updatesNeeded = + new mutable.HashMap[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates] // Allocation manager is reset when adding executor requests are sent without reporting back // executor added. post(SparkListenerStageSubmitted(createStageInfo(0, 10))) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 2) - assert(addExecutors(manager) === 2) - assert(numExecutorsTarget(manager) === 4) - assert(addExecutors(manager) === 1) - assert(numExecutorsTarget(manager) === 5) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 2) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 4) + assert(addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) === 1) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) 
assert(manager.executorMonitor.executorCount === 0) // Allocation manager is reset when executors are added. post(SparkListenerStageSubmitted(createStageInfo(0, 10))) - addExecutors(manager) - addExecutors(manager) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 5) - - onExecutorAdded(manager, "first") - onExecutorAdded(manager, "second") - onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - onExecutorAdded(manager, "fifth") + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAddedDefaultProfile(manager, "fifth") assert(manager.executorMonitor.executorCount === 5) // Cluster manager lost will make all the live executors lost, so here simulate this behavior @@ -882,28 +1286,31 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { onExecutorRemoved(manager, "fifth") manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) assert(manager.executorMonitor.executorCount === 0) // Allocation manager is reset when executors are pending to remove - addExecutors(manager) - addExecutors(manager) - addExecutors(manager) - assert(numExecutorsTarget(manager) === 5) - - onExecutorAdded(manager, "first") - onExecutorAdded(manager, "second") - 
onExecutorAdded(manager, "third") - onExecutorAdded(manager, "fourth") - onExecutorAdded(manager, "fifth") - onExecutorAdded(manager, "sixth") - onExecutorAdded(manager, "seventh") - onExecutorAdded(manager, "eighth") + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + addExecutorsToTargetForDefaultProfile(manager, updatesNeeded) + doUpdateRequest(manager, updatesNeeded.toMap, clock.getTimeMillis()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 5) + + onExecutorAddedDefaultProfile(manager, "first") + onExecutorAddedDefaultProfile(manager, "second") + onExecutorAddedDefaultProfile(manager, "third") + onExecutorAddedDefaultProfile(manager, "fourth") + onExecutorAddedDefaultProfile(manager, "fifth") + onExecutorAddedDefaultProfile(manager, "sixth") + onExecutorAddedDefaultProfile(manager, "seventh") + onExecutorAddedDefaultProfile(manager, "eighth") assert(manager.executorMonitor.executorCount === 8) - removeExecutor(manager, "first") - removeExecutors(manager, Seq("second", "third")) + removeExecutorDefaultProfile(manager, "first") + removeExecutorsDefaultProfile(manager, Seq("second", "third")) assert(executorsPendingToRemove(manager) === Set("first", "second", "third")) assert(manager.executorMonitor.executorCount === 8) @@ -917,8 +1324,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { manager.reset() - assert(numExecutorsTarget(manager) === 1) - assert(numExecutorsToAdd(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) + assert(numExecutorsToAddForDefaultProfile(manager) === 1) assert(executorsPendingToRemove(manager) === Set.empty) assert(manager.executorMonitor.executorCount === 0) } @@ -929,31 +1336,31 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { createConf(1, 2, 
1).set(config.DYN_ALLOCATION_TESTING, false), clock = clock) - when(client.requestTotalExecutors(meq(2), any(), any())).thenReturn(true) + when(client.requestTotalExecutors(any(), any(), any())).thenReturn(true) // test setup -- job with 2 tasks, scale up to two executors - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-1", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) post(SparkListenerStageSubmitted(createStageInfo(0, 2))) clock.advance(1000) - manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.getTimeMillis()) - assert(numExecutorsTarget(manager) === 2) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) val taskInfo0 = createTaskInfo(0, 0, "executor-1") post(SparkListenerTaskStart(0, 0, taskInfo0)) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-2", new ExecutorInfo("host1", 1, Map.empty, Map.empty))) val taskInfo1 = createTaskInfo(1, 1, "executor-2") post(SparkListenerTaskStart(0, 0, taskInfo1)) - assert(numExecutorsTarget(manager) === 2) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) // have one task finish -- we should adjust the target number of executors down // but we should *not* kill any executors yet post(SparkListenerTaskEnd(0, 0, null, Success, taskInfo0, new ExecutorMetrics, null)) - assert(maxNumExecutorsNeeded(manager) === 1) - assert(numExecutorsTarget(manager) === 2) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 2) clock.advance(1000) - manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.getTimeMillis()) - assert(numExecutorsTarget(manager) === 1) + manager invokePrivate _updateAndSyncNumExecutorsTarget(clock.nanoTime()) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) 
verify(client, never).killExecutors(any(), any(), any(), any()) // now we cross the idle timeout for executor-1, so we kill it. the really important @@ -963,8 +1370,8 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { .thenReturn(Seq("executor-1")) clock.advance(3000) schedule(manager) - assert(maxNumExecutorsNeeded(manager) === 1) - assert(numExecutorsTarget(manager) === 1) + assert(maxNumExecutorsNeededPerResourceProfile(manager, defaultProfile) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) // here's the important verify -- we did kill the executors, but did not adjust the target count verify(client).killExecutors(Seq("executor-1"), false, false, false) } @@ -972,7 +1379,7 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { test("SPARK-26758 check executor target number after idle time out ") { val clock = new ManualClock(10000L) val manager = createManager(createConf(1, 5, 3), clock = clock) - assert(numExecutorsTarget(manager) === 3) + assert(numExecutorsTargetForDefaultProfileId(manager) === 3) post(SparkListenerExecutorAdded( clock.getTimeMillis(), "executor-1", new ExecutorInfo("host1", 1, Map.empty))) post(SparkListenerExecutorAdded( @@ -983,14 +1390,14 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { clock.advance(executorIdleTimeout * 1000) schedule(manager) // once the schedule is run target executor number should be 1 - assert(numExecutorsTarget(manager) === 1) + assert(numExecutorsTargetForDefaultProfileId(manager) === 1) } private def createConf( minExecutors: Int = 1, maxExecutors: Int = 5, initialExecutors: Int = 1): SparkConf = { - new SparkConf() + val sparkConf = new SparkConf() .set(config.DYN_ALLOCATION_ENABLED, true) .set(config.DYN_ALLOCATION_MIN_EXECUTORS, minExecutors) .set(config.DYN_ALLOCATION_MAX_EXECUTORS, maxExecutors) @@ -1005,19 +1412,37 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { // SPARK-22864: effectively disable the allocation schedule by setting 
the period to a // really long value. .set(TEST_SCHEDULE_INTERVAL, 10000L) + sparkConf } private def createManager( conf: SparkConf, clock: Clock = new SystemClock()): ExecutorAllocationManager = { - val manager = new ExecutorAllocationManager(client, listenerBus, conf, clock = clock) + ResourceProfile.reInitDefaultProfile(conf) + rpManager = new ResourceProfileManager(conf) + val manager = new ExecutorAllocationManager(client, listenerBus, conf, clock = clock, + resourceProfileManager = rpManager) managers += manager manager.start() manager } - private def onExecutorAdded(manager: ExecutorAllocationManager, id: String): Unit = { - post(SparkListenerExecutorAdded(0L, id, null)) + private val execInfo = new ExecutorInfo("host1", 1, Map.empty, + Map.empty, Map.empty, DEFAULT_RESOURCE_PROFILE_ID) + + private def onExecutorAddedDefaultProfile( + manager: ExecutorAllocationManager, + id: String): Unit = { + post(SparkListenerExecutorAdded(0L, id, execInfo)) + } + + private def onExecutorAdded( + manager: ExecutorAllocationManager, + id: String, + rp: ResourceProfile): Unit = { + val cores = rp.getExecutorCores.getOrElse(1) + val execInfo = new ExecutorInfo("host1", cores, Map.empty, Map.empty, Map.empty, rp.id) + post(SparkListenerExecutorAdded(0L, id, execInfo)) } private def onExecutorRemoved(manager: ExecutorAllocationManager, id: String): Unit = { @@ -1035,8 +1460,18 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite { post(SparkListenerTaskEnd(1, 1, "foo", Success, info, new ExecutorMetrics, null)) } - private def removeExecutor(manager: ExecutorAllocationManager, executorId: String): Boolean = { - val executorsRemoved = removeExecutors(manager, Seq(executorId)) + private def removeExecutorDefaultProfile( + manager: ExecutorAllocationManager, + executorId: String): Boolean = { + val executorsRemoved = removeExecutorsDefaultProfile(manager, Seq(executorId)) + executorsRemoved.nonEmpty && executorsRemoved(0) == executorId + } + + private def 
removeExecutor( + manager: ExecutorAllocationManager, + executorId: String, + rpId: Int): Boolean = { + val executorsRemoved = removeExecutors(manager, Seq((executorId, rpId))) executorsRemoved.nonEmpty && executorsRemoved(0) == executorId } @@ -1058,10 +1493,11 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { stageId: Int, numTasks: Int, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty, - attemptId: Int = 0 + attemptId: Int = 0, + rp: ResourceProfile = defaultProfile ): StageInfo = { new StageInfo(stageId, attemptId, "name", numTasks, Seq.empty, Seq.empty, "no details", - taskLocalityPreferences = taskLocalityPreferences) + taskLocalityPreferences = taskLocalityPreferences, resourceProfileId = rp.id) } private def createTaskInfo( @@ -1076,52 +1512,117 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { | Helper methods for accessing private methods and fields | * ------------------------------------------------------- */ - private val _numExecutorsToAdd = PrivateMethod[Int]('numExecutorsToAdd) - private val _numExecutorsTarget = PrivateMethod[Int]('numExecutorsTarget) - private val _maxNumExecutorsNeeded = PrivateMethod[Int]('maxNumExecutorsNeeded) - private val _addTime = PrivateMethod[Long]('addTime) - private val _schedule = PrivateMethod[Unit]('schedule) - private val _addExecutors = PrivateMethod[Int]('addExecutors) + private val _numExecutorsToAddPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]]( + Symbol("numExecutorsToAddPerResourceProfileId")) + private val _numExecutorsTargetPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]]( + Symbol("numExecutorsTargetPerResourceProfileId")) + private val _maxNumExecutorsNeededPerResourceProfile = + PrivateMethod[Int](Symbol("maxNumExecutorsNeededPerResourceProfile")) + private val _addTime = PrivateMethod[Long](Symbol("addTime")) + private val _schedule = PrivateMethod[Unit](Symbol("schedule")) + private val 
_doUpdateRequest = PrivateMethod[Unit](Symbol("doUpdateRequest")) private val _updateAndSyncNumExecutorsTarget = - PrivateMethod[Int]('updateAndSyncNumExecutorsTarget) - private val _removeExecutors = PrivateMethod[Seq[String]]('removeExecutors) - private val _onSchedulerBacklogged = PrivateMethod[Unit]('onSchedulerBacklogged) - private val _onSchedulerQueueEmpty = PrivateMethod[Unit]('onSchedulerQueueEmpty) - private val _localityAwareTasks = PrivateMethod[Int]('localityAwareTasks) - private val _hostToLocalTaskCount = PrivateMethod[Map[String, Int]]('hostToLocalTaskCount) - private val _onSpeculativeTaskSubmitted = PrivateMethod[Unit]('onSpeculativeTaskSubmitted) - private val _totalRunningTasks = PrivateMethod[Int]('totalRunningTasks) + PrivateMethod[Int](Symbol("updateAndSyncNumExecutorsTarget")) + private val _addExecutorsToTarget = PrivateMethod[Int](Symbol("addExecutorsToTarget")) + private val _removeExecutors = PrivateMethod[Seq[String]](Symbol("removeExecutors")) + private val _onSchedulerBacklogged = PrivateMethod[Unit](Symbol("onSchedulerBacklogged")) + private val _onSchedulerQueueEmpty = PrivateMethod[Unit](Symbol("onSchedulerQueueEmpty")) + private val _localityAwareTasksPerResourceProfileId = + PrivateMethod[mutable.HashMap[Int, Int]](Symbol("numLocalityAwareTasksPerResourceProfileId")) + private val _rpIdToHostToLocalTaskCount = + PrivateMethod[Map[Int, Map[String, Int]]](Symbol("rpIdToHostToLocalTaskCount")) + private val _onSpeculativeTaskSubmitted = + PrivateMethod[Unit](Symbol("onSpeculativeTaskSubmitted")) + private val _totalRunningTasksPerResourceProfile = + PrivateMethod[Int](Symbol("totalRunningTasksPerResourceProfile")) + + private val defaultProfile = ResourceProfile.getOrCreateDefaultProfile(new SparkConf) + + private def numExecutorsToAddForDefaultProfile(manager: ExecutorAllocationManager): Int = { + numExecutorsToAdd(manager, defaultProfile) + } + + private def numExecutorsToAdd( + manager: ExecutorAllocationManager, + rp: 
ResourceProfile): Int = { + val nmap = manager invokePrivate _numExecutorsToAddPerResourceProfileId() + nmap(rp.id) + } + + private def updateAndSyncNumExecutorsTarget( + manager: ExecutorAllocationManager, + now: Long): Unit = { + manager invokePrivate _updateAndSyncNumExecutorsTarget(now) + } - private def numExecutorsToAdd(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _numExecutorsToAdd() + private def numExecutorsTargetForDefaultProfileId(manager: ExecutorAllocationManager): Int = { + numExecutorsTarget(manager, defaultProfile.id) } - private def numExecutorsTarget(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _numExecutorsTarget() + private def numExecutorsTarget( + manager: ExecutorAllocationManager, + rpId: Int): Int = { + val numMap = manager invokePrivate _numExecutorsTargetPerResourceProfileId() + numMap(rpId) + } + + private def addExecutorsToTargetForDefaultProfile( + manager: ExecutorAllocationManager, + updatesNeeded: mutable.HashMap[ResourceProfile, + ExecutorAllocationManager.TargetNumUpdates] + ): Int = { + addExecutorsToTarget(manager, updatesNeeded, defaultProfile) + } + + private def addExecutorsToTarget( + manager: ExecutorAllocationManager, + updatesNeeded: mutable.HashMap[ResourceProfile, + ExecutorAllocationManager.TargetNumUpdates], + rp: ResourceProfile + ): Int = { + val maxNumExecutorsNeeded = + manager invokePrivate _maxNumExecutorsNeededPerResourceProfile(rp.id) + manager invokePrivate + _addExecutorsToTarget(maxNumExecutorsNeeded, rp.id, updatesNeeded) } private def addTime(manager: ExecutorAllocationManager): Long = { manager invokePrivate _addTime() } - private def schedule(manager: ExecutorAllocationManager): Unit = { - manager invokePrivate _schedule() + private def doUpdateRequest( + manager: ExecutorAllocationManager, + updates: Map[ResourceProfile, ExecutorAllocationManager.TargetNumUpdates], + now: Long): Unit = { + manager invokePrivate _doUpdateRequest(updates, now) } - private 
def maxNumExecutorsNeeded(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _maxNumExecutorsNeeded() + private def schedule(manager: ExecutorAllocationManager): Unit = { + manager invokePrivate _schedule() } - private def addExecutors(manager: ExecutorAllocationManager): Int = { - val maxNumExecutorsNeeded = manager invokePrivate _maxNumExecutorsNeeded() - manager invokePrivate _addExecutors(maxNumExecutorsNeeded) + private def maxNumExecutorsNeededPerResourceProfile( + manager: ExecutorAllocationManager, + rp: ResourceProfile): Int = { + manager invokePrivate _maxNumExecutorsNeededPerResourceProfile(rp.id) } private def adjustRequestedExecutors(manager: ExecutorAllocationManager): Int = { manager invokePrivate _updateAndSyncNumExecutorsTarget(0L) } - private def removeExecutors(manager: ExecutorAllocationManager, ids: Seq[String]): Seq[String] = { + private def removeExecutorsDefaultProfile( + manager: ExecutorAllocationManager, + ids: Seq[String]): Seq[String] = { + val idsAndProfileIds = ids.map((_, defaultProfile.id)) + manager invokePrivate _removeExecutors(idsAndProfileIds) + } + + private def removeExecutors( + manager: ExecutorAllocationManager, + ids: Seq[(String, Int)]): Seq[String] = { manager invokePrivate _removeExecutors(ids) } @@ -1137,15 +1638,22 @@ private object ExecutorAllocationManagerSuite extends PrivateMethodTester { manager invokePrivate _onSpeculativeTaskSubmitted(id) } - private def localityAwareTasks(manager: ExecutorAllocationManager): Int = { - manager invokePrivate _localityAwareTasks() + private def localityAwareTasksForDefaultProfile(manager: ExecutorAllocationManager): Int = { + val localMap = manager invokePrivate _localityAwareTasksPerResourceProfileId() + localMap(defaultProfile.id) + } + + private def totalRunningTasksPerResourceProfile(manager: ExecutorAllocationManager): Int = { + manager invokePrivate _totalRunningTasksPerResourceProfile(defaultProfile.id) } - private def totalRunningTasks(manager: 
ExecutorAllocationManager): Int = { - manager invokePrivate _totalRunningTasks() + private def hostToLocalTaskCount( + manager: ExecutorAllocationManager): Map[String, Int] = { + val rpIdToHostLocal = manager invokePrivate _rpIdToHostToLocalTaskCount() + rpIdToHostLocal(defaultProfile.id) } - private def hostToLocalTaskCount(manager: ExecutorAllocationManager): Map[String, Int] = { - manager invokePrivate _hostToLocalTaskCount() + private def getResourceProfileIdOfExecutor(manager: ExecutorAllocationManager): Int = { + defaultProfile.id } } diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala index 7f7f3db65d6ca..c217419f4092e 100644 --- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala @@ -40,7 +40,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll wi var transportContext: TransportContext = _ var rpcHandler: ExternalBlockHandler = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle", numUsableCores = 2) rpcHandler = new ExternalBlockHandler(transportConf, null) @@ -52,7 +52,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll wi conf.set(config.SHUFFLE_SERVICE_PORT, server.getPort) } - override def afterAll() { + override def afterAll(): Unit = { Utils.tryLogNonFatalError{ server.close() } @@ -68,6 +68,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll wi // This test ensures that the external shuffle service is actually in use for the other tests. 
test("using external shuffle service") { sc = new SparkContext("local-cluster[2,1,1024]", "test", conf) + sc.getConf.get(config.SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED) should equal(false) sc.env.blockManager.externalShuffleServiceEnabled should equal(true) sc.env.blockManager.blockStoreClient.getClass should equal(classOf[ExternalBlockStoreClient]) @@ -79,7 +80,9 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll wi // Therefore, we should wait until all slaves are up TestUtils.waitUntilExecutorsUp(sc, 2, 60000) - val rdd = sc.parallelize(0 until 1000, 10).map(i => (i, 1)).reduceByKey(_ + _) + val rdd = sc.parallelize(0 until 1000, 10) + .map { i => (i, 1) } + .reduceByKey(_ + _) rdd.count() rdd.count() @@ -96,6 +99,50 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll wi e.getMessage should include ("Fetch failure will not retry stage due to testing config") } + test("SPARK-27651: read host local shuffle blocks from disk and avoid network remote fetches") { + val confWithHostLocalRead = + conf.clone.set(config.SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED, true) + confWithHostLocalRead.set(config.STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE, 5) + sc = new SparkContext("local-cluster[2,1,1024]", "test", confWithHostLocalRead) + sc.getConf.get(config.SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED) should equal(true) + sc.env.blockManager.externalShuffleServiceEnabled should equal(true) + sc.env.blockManager.hostLocalDirManager.isDefined should equal(true) + sc.env.blockManager.blockStoreClient.getClass should equal(classOf[ExternalBlockStoreClient]) + + // In a slow machine, one slave may register hundreds of milliseconds ahead of the other one. + // If we don't wait for all slaves, it's possible that only one executor runs all jobs. 
Then + // all shuffle blocks will be in this executor, ShuffleBlockFetcherIterator will directly fetch + // local blocks from the local BlockManager and won't send requests to ExternalShuffleService. + // In this case, we won't receive FetchFailed. And it will make this test fail. + // Therefore, we should wait until all slaves are up + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + + val rdd = sc.parallelize(0 until 1000, 10) + .map { i => (i, 1) } + .reduceByKey(_ + _) + + rdd.count() + rdd.count() + + val cachedExecutors = rdd.mapPartitions { _ => + SparkEnv.get.blockManager.hostLocalDirManager.map { localDirManager => + localDirManager.getCachedHostLocalDirs().keySet.iterator + }.getOrElse(Iterator.empty) + }.collect().toSet + + // both executors are caching the dirs of the other one + cachedExecutors should equal(sc.getExecutorIds().toSet) + + // Invalidate the registered executors, disallowing access to their shuffle blocks (without + // deleting the actual shuffle files, so we could access them without the shuffle service). + // As directories are already cached there is no request to external shuffle service. 
+ rpcHandler.applicationRemoved(sc.conf.getAppId, false /* cleanupLocalDirs */) + + // Now Spark will not receive FetchFailed as host local blocks are read from the cached local + // disk directly + rdd.collect().map(_._2).sum should equal(1000) + } + test("SPARK-25888: using external shuffle service fetching disk persisted blocks") { val confWithRddFetchEnabled = conf.clone.set(config.SHUFFLE_SERVICE_FETCH_RDD_ENABLED, true) sc = new SparkContext("local-cluster[1,1,1024]", "test", confWithRddFetchEnabled) diff --git a/core/src/test/scala/org/apache/spark/FailureSuite.scala b/core/src/test/scala/org/apache/spark/FailureSuite.scala index 5f79b526a419b..8b75c3a0ba653 100644 --- a/core/src/test/scala/org/apache/spark/FailureSuite.scala +++ b/core/src/test/scala/org/apache/spark/FailureSuite.scala @@ -31,7 +31,7 @@ object FailureSuiteState { var tasksRun = 0 var tasksFailed = 0 - def clear() { + def clear(): Unit = { synchronized { tasksRun = 0 tasksFailed = 0 diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 6651e38f7ed62..e9ee6b5dfb665 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -36,18 +36,19 @@ import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutput import org.apache.spark.internal.config._ import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD} +import org.apache.spark.serializer.KryoSerializer import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils class FileSuite extends SparkFunSuite with LocalSparkContext { var tempDir: File = _ - override def beforeEach() { + override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() } - override def afterEach() { + override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { @@ -372,7 +373,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { test 
("prevent user from overwriting the empty directory (old Hadoop API)") { sc = new SparkContext("local", "test") - val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) + val randomRDD = sc.parallelize(Seq((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) intercept[FileAlreadyExistsException] { randomRDD.saveAsTextFile(tempDir.getPath) } @@ -380,7 +381,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { test ("prevent user from overwriting the non-empty directory (old Hadoop API)") { sc = new SparkContext("local", "test") - val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) + val randomRDD = sc.parallelize(Seq((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) randomRDD.saveAsTextFile(tempDir.getPath + "/output") assert(new File(tempDir.getPath + "/output/part-00000").exists()) intercept[FileAlreadyExistsException] { @@ -392,7 +393,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { val conf = new SparkConf() conf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") sc = new SparkContext(conf) - val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) + val randomRDD = sc.parallelize(Seq((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1) randomRDD.saveAsTextFile(tempDir.getPath + "/output") assert(new File(tempDir.getPath + "/output/part-00000").exists()) randomRDD.saveAsTextFile(tempDir.getPath + "/output") @@ -402,7 +403,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { test ("prevent user from overwriting the empty directory (new Hadoop API)") { sc = new SparkContext("local", "test") val randomRDD = sc.parallelize( - Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) + Seq(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) intercept[FileAlreadyExistsException] { randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](tempDir.getPath) } @@ -411,7 +412,7 @@ class FileSuite 
extends SparkFunSuite with LocalSparkContext { test ("prevent user from overwriting the non-empty directory (new Hadoop API)") { sc = new SparkContext("local", "test") val randomRDD = sc.parallelize( - Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) + Seq(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]]( tempDir.getPath + "/output") assert(new File(tempDir.getPath + "/output/part-r-00000").exists()) @@ -425,7 +426,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { conf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false") sc = new SparkContext(conf) val randomRDD = sc.parallelize( - Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) + Seq(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]]( tempDir.getPath + "/output") assert(new File(tempDir.getPath + "/output/part-r-00000").exists()) @@ -437,7 +438,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { test ("save Hadoop Dataset through old Hadoop API") { sc = new SparkContext("local", "test") val randomRDD = sc.parallelize( - Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) + Seq(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) val job = new JobConf() job.setOutputKeyClass(classOf[String]) job.setOutputValueClass(classOf[String]) @@ -450,7 +451,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { test ("save Hadoop Dataset through new Hadoop API") { sc = new SparkContext("local", "test") val randomRDD = sc.parallelize( - Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) + Seq(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1) val job = Job.getInstance(sc.hadoopConfiguration) job.setOutputKeyClass(classOf[String]) job.setOutputValueClass(classOf[String]) @@ -559,7 
+560,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { sc = new SparkContext(conf) def testIgnoreEmptySplits( - data: Array[Tuple2[String, String]], + data: Seq[Tuple2[String, String]], actualPartitionNum: Int, expectedPartitionNum: Int): Unit = { val output = new File(tempDir, "output") @@ -581,13 +582,13 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { // Ensure that if no split is empty, we don't lose any splits testIgnoreEmptySplits( - data = Array(("key1", "a"), ("key2", "a"), ("key3", "b")), + data = Seq(("key1", "a"), ("key2", "a"), ("key3", "b")), actualPartitionNum = 2, expectedPartitionNum = 2) // Ensure that if part of the splits are empty, we remove the splits correctly testIgnoreEmptySplits( - data = Array(("key1", "a"), ("key2", "a")), + data = Seq(("key1", "a"), ("key2", "a")), actualPartitionNum = 5, expectedPartitionNum = 2) } @@ -600,7 +601,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { sc = new SparkContext(conf) def testIgnoreEmptySplits( - data: Array[Tuple2[String, String]], + data: Seq[Tuple2[String, String]], actualPartitionNum: Int, expectedPartitionNum: Int): Unit = { val output = new File(tempDir, "output") @@ -624,13 +625,13 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { // Ensure that if no split is empty, we don't lose any splits testIgnoreEmptySplits( - data = Array(("1", "a"), ("2", "a"), ("3", "b")), + data = Seq(("1", "a"), ("2", "a"), ("3", "b")), actualPartitionNum = 2, expectedPartitionNum = 2) // Ensure that if part of the splits are empty, we remove the splits correctly testIgnoreEmptySplits( - data = Array(("1", "a"), ("2", "b")), + data = Seq(("1", "a"), ("2", "b")), actualPartitionNum = 5, expectedPartitionNum = 2) } @@ -700,4 +701,40 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { assert(collectRDDAndDeleteFileBeforeCompute(true).isEmpty) } + + test("SPARK-25100: Support commit tasks when Kyro registration is required") { + // 
Prepare the input file + val inputFilePath = new File(tempDir, "/input").getAbsolutePath + Utils.tryWithResource(new PrintWriter(new File(inputFilePath))) { writer => + for (i <- 1 to 3) { + writer.print(i) + writer.write('\n') + } + } + + // Start a new SparkContext + val conf = new SparkConf(false) + .setMaster("local") + .setAppName("test") + .set("spark.kryo.registrationRequired", "true") + .set("spark.serializer", classOf[KryoSerializer].getName) + sc = new SparkContext(conf) + + // Prepare the input RDD + val pairRDD = sc.textFile(inputFilePath).map(x => (x, x)) + + // Test saveAsTextFile() + val outputFilePath1 = new File(tempDir, "/out1").getAbsolutePath + pairRDD.saveAsTextFile(outputFilePath1) + assert(sc.textFile(outputFilePath1).collect() === Array("(1,1)", "(2,2)", "(3,3)")) + + // Test saveAsNewAPIHadoopDataset() + val outputFilePath2 = new File(tempDir, "/out2").getAbsolutePath + val jobConf = new JobConf() + jobConf.setOutputKeyClass(classOf[IntWritable]) + jobConf.setOutputValueClass(classOf[IntWritable]) + jobConf.set("mapred.output.dir", outputFilePath2) + pairRDD.saveAsNewAPIHadoopDataset(jobConf) + assert(sc.textFile(outputFilePath2).collect() === Array("1\t1", "2\t2", "3\t3")) + } } diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala index dfe33b1e52695..a9296955d18b4 100644 --- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala +++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala @@ -26,9 +26,11 @@ import scala.concurrent.duration._ import org.mockito.ArgumentMatchers.{any, eq => meq} import org.mockito.Mockito.{mock, spy, verify, when} import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} +import org.scalatest.concurrent.Eventually._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.config.DYN_ALLOCATION_TESTING +import 
org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ @@ -55,9 +57,10 @@ class HeartbeatReceiverSuite private var heartbeatReceiverClock: ManualClock = null // Helper private method accessors for HeartbeatReceiver - private val _executorLastSeen = PrivateMethod[collection.Map[String, Long]]('executorLastSeen) - private val _executorTimeoutMs = PrivateMethod[Long]('executorTimeoutMs) - private val _killExecutorThread = PrivateMethod[ExecutorService]('killExecutorThread) + private val _executorLastSeen = + PrivateMethod[collection.Map[String, Long]](Symbol("executorLastSeen")) + private val _executorTimeoutMs = PrivateMethod[Long](Symbol("executorTimeoutMs")) + private val _killExecutorThread = PrivateMethod[ExecutorService](Symbol("killExecutorThread")) /** * Before each test, set up the SparkContext and a custom [[HeartbeatReceiver]] @@ -73,6 +76,7 @@ class HeartbeatReceiverSuite scheduler = mock(classOf[TaskSchedulerImpl]) when(sc.taskScheduler).thenReturn(scheduler) when(scheduler.nodeBlacklist).thenReturn(Predef.Set[String]()) + when(scheduler.resourcesReqsPerTask).thenReturn(Seq.empty) when(scheduler.sc).thenReturn(sc) heartbeatReceiverClock = new ManualClock heartbeatReceiver = new HeartbeatReceiver(sc, heartbeatReceiverClock) @@ -151,7 +155,6 @@ class HeartbeatReceiverSuite heartbeatReceiverClock.advance(executorTimeout) heartbeatReceiverRef.askSync[Boolean](ExpireDeadHosts) // Only the second executor should be expired as a dead host - verify(scheduler).executorLost(meq(executorId2), any()) val trackedExecutors = getTrackedExecutors assert(trackedExecutors.size === 1) assert(trackedExecutors.contains(executorId1)) @@ -175,10 +178,10 @@ class HeartbeatReceiverSuite val dummyExecutorEndpointRef2 = rpcEnv.setupEndpoint("fake-executor-2", dummyExecutorEndpoint2) 
fakeSchedulerBackend.driverEndpoint.askSync[Boolean]( RegisterExecutor(executorId1, dummyExecutorEndpointRef1, "1.2.3.4", 0, Map.empty, Map.empty, - Map.empty)) + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) fakeSchedulerBackend.driverEndpoint.askSync[Boolean]( RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "1.2.3.5", 0, Map.empty, Map.empty, - Map.empty)) + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) heartbeatReceiverRef.askSync[Boolean](TaskSchedulerIsSet) addExecutorAndVerify(executorId1) addExecutorAndVerify(executorId2) @@ -207,6 +210,12 @@ class HeartbeatReceiverSuite // explicitly request new executors. For more detail, see SPARK-8119. assert(fakeClusterManager.getTargetNumExecutors === 2) assert(fakeClusterManager.getExecutorIdsToKill === Set(executorId1, executorId2)) + // [SPARK-27348] HeartbeatReceiver should remove lost executor from scheduler backend + eventually(timeout(5.seconds)) { + assert(!fakeSchedulerBackend.getExecutorIds().contains(executorId1)) + assert(!fakeSchedulerBackend.getExecutorIds().contains(executorId2)) + } + fakeSchedulerBackend.stop() } /** Manually send a heartbeat and return the response. 
*/ @@ -276,9 +285,14 @@ private class FakeSchedulerBackend( clusterManagerEndpoint: RpcEndpointRef) extends CoarseGrainedSchedulerBackend(scheduler, rpcEnv) { - protected override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = { + protected override def doRequestTotalExecutors( + resourceProfileToTotalExecs: Map[ResourceProfile, Int]): Future[Boolean] = { clusterManagerEndpoint.ask[Boolean]( - RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount, Set.empty)) + RequestExecutors( + resourceProfileToTotalExecs(ResourceProfile.getOrCreateDefaultProfile(conf)), + numLocalityAwareTasksPerResourceProfileId(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + rpHostToLocalTaskCount(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + Set.empty)) } protected override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = { diff --git a/core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala b/core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala index e7eef8ec5150c..5399d868f46f1 100644 --- a/core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/InternalAccumulatorSuite.scala @@ -90,7 +90,7 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext { TaskContext.get().taskMetrics().testAccum.get.add(1) iter } - .reduceByKey { case (x, y) => x + y } + .reduceByKey { (x, y) => x + y } .mapPartitions { iter => TaskContext.get().taskMetrics().testAccum.get.add(10) iter @@ -142,6 +142,7 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext { sid, taskContext.partitionId(), taskContext.partitionId(), + taskContext.partitionId(), "simulated fetch failure") } else { iter @@ -210,7 +211,8 @@ class InternalAccumulatorSuite extends SparkFunSuite with LocalSparkContext { /** * A special [[ContextCleaner]] that saves the IDs of the accumulators registered for cleanup. 
*/ - private class SaveAccumContextCleaner(sc: SparkContext) extends ContextCleaner(sc) { + private class SaveAccumContextCleaner(sc: SparkContext) extends + ContextCleaner(sc, null) { private val accumsRegistered = new ArrayBuffer[Long] override def registerAccumulatorForCleanup(a: AccumulatorV2[_, _]): Unit = { diff --git a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala index b533304287cf6..94ad8d8880027 100644 --- a/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala +++ b/core/src/test/scala/org/apache/spark/JobCancellationSuite.scala @@ -40,7 +40,7 @@ import org.apache.spark.util.ThreadUtils class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAfter with LocalSparkContext { - override def afterEach() { + override def afterEach(): Unit = { try { resetSparkContext() JobCancellationSuite.taskStartedSemaphore.drainPermits() @@ -127,7 +127,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -157,7 +157,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -192,7 +192,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. 
val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -225,7 +225,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -264,7 +264,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -301,7 +301,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft sc = new SparkContext("local[2]", "test") sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem1.release() } }) @@ -391,7 +391,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft assert(executionOfInterruptibleCounter.get() < numElements) } - def testCount() { + def testCount(): Unit = { // Cancel before launching any tasks { val f = sc.parallelize(1 to 10000, 2).map { i => Thread.sleep(10); i }.countAsync() @@ -405,7 +405,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. 
val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) @@ -421,7 +421,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft } } - def testTake() { + def testTake(): Unit = { // Cancel before launching any tasks { val f = sc.parallelize(1 to 10000, 2).map { i => Thread.sleep(10); i }.takeAsync(5000) @@ -435,7 +435,7 @@ class JobCancellationSuite extends SparkFunSuite with Matchers with BeforeAndAft // Add a listener to release the semaphore once any tasks are launched. val sem = new Semaphore(0) sc.addSparkListener(new SparkListener { - override def onTaskStart(taskStart: SparkListenerTaskStart) { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { sem.release() } }) diff --git a/core/src/test/scala/org/apache/spark/JsonTestUtils.scala b/core/src/test/scala/org/apache/spark/JsonTestUtils.scala index ba367cd476146..8aa7f3c7cb1bf 100644 --- a/core/src/test/scala/org/apache/spark/JsonTestUtils.scala +++ b/core/src/test/scala/org/apache/spark/JsonTestUtils.scala @@ -20,7 +20,7 @@ import org.json4s._ import org.json4s.jackson.JsonMethods trait JsonTestUtils { - def assertValidDataInJson(validateJson: JValue, expectedJson: JValue) { + def assertValidDataInJson(validateJson: JValue, expectedJson: JValue): Unit = { val Diff(c, a, d) = validateJson.diff(expectedJson) val validatePretty = JsonMethods.pretty(validateJson) val expectedPretty = JsonMethods.pretty(expectedJson) diff --git a/core/src/test/scala/org/apache/spark/LocalSparkContext.scala b/core/src/test/scala/org/apache/spark/LocalSparkContext.scala index 05aaaa11451b4..599ea8955491f 100644 --- a/core/src/test/scala/org/apache/spark/LocalSparkContext.scala +++ b/core/src/test/scala/org/apache/spark/LocalSparkContext.scala @@ -22,17 +22,19 @@ import org.scalatest.BeforeAndAfterAll import 
org.scalatest.BeforeAndAfterEach import org.scalatest.Suite +import org.apache.spark.resource.ResourceProfile + /** Manages a local `sc` `SparkContext` variable, correctly stopping it after each test. */ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE) } - override def afterEach() { + override def afterEach(): Unit = { try { resetSparkContext() } finally { @@ -42,13 +44,14 @@ trait LocalSparkContext extends BeforeAndAfterEach with BeforeAndAfterAll { self def resetSparkContext(): Unit = { LocalSparkContext.stop(sc) + ResourceProfile.clearDefaultProfile() sc = null } } object LocalSparkContext { - def stop(sc: SparkContext) { + def stop(sc: SparkContext): Unit = { if (sc != null) { sc.stop() } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index d86975964b558..d5ee19bde8edf 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -64,14 +64,15 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) val size10000 = MapStatus.decompressSize(MapStatus.compressSize(10000L)) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(1000L, 10000L))) + Array(1000L, 10000L), 5)) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(10000L, 1000L))) + Array(10000L, 1000L), 6)) val statuses = tracker.getMapSizesByExecutorId(10, 0) assert(statuses.toSet === - Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000))), - (BlockManagerId("b", "hostB", 1000), ArrayBuffer((ShuffleBlockId(10, 1, 0), 
size10000)))) - .toSet) + Seq((BlockManagerId("a", "hostA", 1000), + ArrayBuffer((ShuffleBlockId(10, 5, 0), size1000, 0))), + (BlockManagerId("b", "hostB", 1000), + ArrayBuffer((ShuffleBlockId(10, 6, 0), size10000, 1)))).toSet) assert(0 == tracker.getNumCachedSerializedBroadcast) tracker.stop() rpcEnv.shutdown() @@ -86,9 +87,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val compressedSize1000 = MapStatus.compressSize(1000L) val compressedSize10000 = MapStatus.compressSize(10000L) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(compressedSize1000, compressedSize10000))) + Array(compressedSize1000, compressedSize10000), 5)) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(compressedSize10000, compressedSize1000))) + Array(compressedSize10000, compressedSize1000), 6)) assert(tracker.containsShuffle(10)) assert(tracker.getMapSizesByExecutorId(10, 0).nonEmpty) assert(0 == tracker.getNumCachedSerializedBroadcast) @@ -109,9 +110,9 @@ class MapOutputTrackerSuite extends SparkFunSuite { val compressedSize1000 = MapStatus.compressSize(1000L) val compressedSize10000 = MapStatus.compressSize(10000L) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(compressedSize1000, compressedSize1000, compressedSize1000))) + Array(compressedSize1000, compressedSize1000, compressedSize1000), 5)) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(compressedSize10000, compressedSize1000, compressedSize1000))) + Array(compressedSize10000, compressedSize1000, compressedSize1000), 6)) assert(0 == tracker.getNumCachedSerializedBroadcast) // As if we had two simultaneous fetch failures @@ -147,10 +148,11 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) masterTracker.registerMapOutput(10, 0, MapStatus( - BlockManagerId("a", "hostA", 1000), Array(1000L))) + 
BlockManagerId("a", "hostA", 1000), Array(1000L), 5)) slaveTracker.updateEpoch(masterTracker.getEpoch) assert(slaveTracker.getMapSizesByExecutorId(10, 0).toSeq === - Seq((BlockManagerId("a", "hostA", 1000), ArrayBuffer((ShuffleBlockId(10, 0, 0), size1000))))) + Seq((BlockManagerId("a", "hostA", 1000), + ArrayBuffer((ShuffleBlockId(10, 5, 0), size1000, 0))))) assert(0 == masterTracker.getNumCachedSerializedBroadcast) val masterTrackerEpochBeforeLossOfMapOutput = masterTracker.getEpoch @@ -184,7 +186,7 @@ class MapOutputTrackerSuite extends SparkFunSuite { // Message size should be ~123B, and no exception should be thrown masterTracker.registerShuffle(10, 1) masterTracker.registerMapOutput(10, 0, MapStatus( - BlockManagerId("88", "mph", 1000), Array.fill[Long](10)(0))) + BlockManagerId("88", "mph", 1000), Array.fill[Long](10)(0), 5)) val senderAddress = RpcAddress("localhost", 12345) val rpcCallContext = mock(classOf[RpcCallContext]) when(rpcCallContext.senderAddress).thenReturn(senderAddress) @@ -218,11 +220,11 @@ class MapOutputTrackerSuite extends SparkFunSuite { // on hostB with output size 3 tracker.registerShuffle(10, 3) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(2L))) + Array(2L), 5)) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(2L))) + Array(2L), 6)) tracker.registerMapOutput(10, 2, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(3L))) + Array(3L), 7)) // When the threshold is 50%, only host A should be returned as a preferred location // as it has 4 out of 7 bytes of output. 
@@ -262,7 +264,7 @@ class MapOutputTrackerSuite extends SparkFunSuite { masterTracker.registerShuffle(20, 100) (0 until 100).foreach { i => masterTracker.registerMapOutput(20, i, new CompressedMapStatus( - BlockManagerId("999", "mps", 1000), Array.fill[Long](4000000)(0))) + BlockManagerId("999", "mps", 1000), Array.fill[Long](4000000)(0), 5)) } val senderAddress = RpcAddress("localhost", 12345) val rpcCallContext = mock(classOf[RpcCallContext]) @@ -311,16 +313,18 @@ class MapOutputTrackerSuite extends SparkFunSuite { val size1000 = MapStatus.decompressSize(MapStatus.compressSize(1000L)) val size10000 = MapStatus.decompressSize(MapStatus.compressSize(10000L)) tracker.registerMapOutput(10, 0, MapStatus(BlockManagerId("a", "hostA", 1000), - Array(size0, size1000, size0, size10000))) + Array(size0, size1000, size0, size10000), 5)) tracker.registerMapOutput(10, 1, MapStatus(BlockManagerId("b", "hostB", 1000), - Array(size10000, size0, size1000, size0))) + Array(size10000, size0, size1000, size0), 6)) assert(tracker.containsShuffle(10)) assert(tracker.getMapSizesByExecutorId(10, 0, 4).toSeq === Seq( (BlockManagerId("a", "hostA", 1000), - Seq((ShuffleBlockId(10, 0, 1), size1000), (ShuffleBlockId(10, 0, 3), size10000))), + Seq((ShuffleBlockId(10, 5, 1), size1000, 0), + (ShuffleBlockId(10, 5, 3), size10000, 0))), (BlockManagerId("b", "hostB", 1000), - Seq((ShuffleBlockId(10, 1, 0), size10000), (ShuffleBlockId(10, 1, 2), size1000))) + Seq((ShuffleBlockId(10, 6, 0), size10000, 1), + (ShuffleBlockId(10, 6, 2), size1000, 1))) ) ) diff --git a/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala b/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala new file mode 100644 index 0000000000000..78f1246295bf8 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/MapStatusesSerDeserBenchmark.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import org.scalatest.Assertions._ + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.benchmark.BenchmarkBase +import org.apache.spark.scheduler.CompressedMapStatus +import org.apache.spark.storage.BlockManagerId + +/** + * Benchmark for MapStatuses serialization & deserialization performance. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class --jars + * 2. build/sbt "core/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "core/test:runMain " + * Results will be written to "benchmarks/MapStatusesSerDeserBenchmark-results.txt". 
+ * }}} + */ +object MapStatusesSerDeserBenchmark extends BenchmarkBase { + + var sc: SparkContext = null + var tracker: MapOutputTrackerMaster = null + + def serDeserBenchmark(numMaps: Int, blockSize: Int, enableBroadcast: Boolean): Unit = { + val minBroadcastSize = if (enableBroadcast) { + 0 + } else { + Int.MaxValue + } + + val benchmark = new Benchmark(s"$numMaps MapOutputs, $blockSize blocks " + { + if (enableBroadcast) "w/ " else "w/o " + } + "broadcast", numMaps, output = output) + + val shuffleId = 10 + + tracker.registerShuffle(shuffleId, numMaps) + val r = new scala.util.Random(912) + (0 until numMaps).foreach { i => + tracker.registerMapOutput(shuffleId, i, + new CompressedMapStatus(BlockManagerId(s"node$i", s"node$i.spark.apache.org", 1000), + Array.fill(blockSize) { + // Creating block size ranging from 0byte to 1GB + (r.nextDouble() * 1024 * 1024 * 1024).toLong + }, i)) + } + + val shuffleStatus = tracker.shuffleStatuses.get(shuffleId).head + + var serializedMapStatusSizes = 0 + var serializedBroadcastSizes = 0 + + val (serializedMapStatus, serializedBroadcast) = MapOutputTracker.serializeMapStatuses( + shuffleStatus.mapStatuses, tracker.broadcastManager, tracker.isLocal, minBroadcastSize, + sc.getConf) + serializedMapStatusSizes = serializedMapStatus.length + if (serializedBroadcast != null) { + serializedBroadcastSizes = serializedBroadcast.value.length + } + + benchmark.addCase("Serialization") { _ => + MapOutputTracker.serializeMapStatuses(shuffleStatus.mapStatuses, tracker.broadcastManager, + tracker.isLocal, minBroadcastSize, sc.getConf) + } + + benchmark.addCase("Deserialization") { _ => + val result = MapOutputTracker.deserializeMapStatuses(serializedMapStatus, sc.getConf) + assert(result.length == numMaps) + } + + benchmark.run() + // scalastyle:off + import org.apache.commons.io.FileUtils + benchmark.out.println("Compressed Serialized MapStatus sizes: " + + FileUtils.byteCountToDisplaySize(serializedMapStatusSizes)) + 
benchmark.out.println("Compressed Serialized Broadcast MapStatus sizes: " + + FileUtils.byteCountToDisplaySize(serializedBroadcastSizes) + "\n\n") + // scalastyle:on + + tracker.unregisterShuffle(shuffleId) + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + createSparkContext() + tracker = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + val rpcEnv = sc.env.rpcEnv + val masterEndpoint = new MapOutputTrackerMasterEndpoint(rpcEnv, tracker, sc.getConf) + rpcEnv.stop(tracker.trackerEndpoint) + rpcEnv.setupEndpoint(MapOutputTracker.ENDPOINT_NAME, masterEndpoint) + + serDeserBenchmark(200000, 10, true) + serDeserBenchmark(200000, 10, false) + + serDeserBenchmark(200000, 100, true) + serDeserBenchmark(200000, 100, false) + + serDeserBenchmark(200000, 1000, true) + serDeserBenchmark(200000, 1000, false) + } + + def createSparkContext(): Unit = { + val conf = new SparkConf() + if (sc != null) { + sc.stop() + } + sc = new SparkContext("local", "MapStatusesSerializationBenchmark", conf) + } + + override def afterAll(): Unit = { + tracker.stop() + if (sc != null) { + sc.stop() + } + } +} diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index 9206b5debf4f3..1a3259c707025 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -70,7 +70,7 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva // 1000 partitions. 
val partitionSizes = List(1, 2, 10, 100, 500, 1000, 1500) val partitioners = partitionSizes.map(p => (p, new RangePartitioner(p, rdd))) - val decoratedRangeBounds = PrivateMethod[Array[Int]]('rangeBounds) + val decoratedRangeBounds = PrivateMethod[Array[Int]](Symbol("rangeBounds")) partitioners.foreach { case (numPartitions, partitioner) => val rangeBounds = partitioner.invokePrivate(decoratedRangeBounds()) for (element <- 1 to 1000) { @@ -262,11 +262,11 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva test("defaultPartitioner") { val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150) - val rdd2 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd2 = sc.parallelize(Seq((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(10)) - val rdd3 = sc.parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) + val rdd3 = sc.parallelize(Seq((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) .partitionBy(new HashPartitioner(100)) - val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd4 = sc.parallelize(Seq((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(9)) val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11) @@ -289,14 +289,14 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva sc.conf.set("spark.default.parallelism", "4") val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150) - val rdd2 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd2 = sc.parallelize(Seq((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(10)) - val rdd3 = sc.parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) + val rdd3 = sc.parallelize(Seq((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) .partitionBy(new HashPartitioner(100)) - val rdd4 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd4 = sc.parallelize(Seq((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(9)) val rdd5 = sc.parallelize((1 
to 10).map(x => (x, x)), 11) - val rdd6 = sc.parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + val rdd6 = sc.parallelize(Seq((1, 2), (2, 3), (2, 4), (3, 4))) .partitionBy(new HashPartitioner(3)) val partitioner1 = Partitioner.defaultPartitioner(rdd1, rdd2) diff --git a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala index 1aa1c421d792e..bdeb631878350 100644 --- a/core/src/test/scala/org/apache/spark/SharedSparkContext.scala +++ b/core/src/test/scala/org/apache/spark/SharedSparkContext.scala @@ -43,12 +43,12 @@ trait SharedSparkContext extends BeforeAndAfterAll with BeforeAndAfterEach { sel } } - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() initializeContext() } - override def afterAll() { + override def afterAll(): Unit = { try { LocalSparkContext.stop(_sc) _sc = null diff --git a/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala b/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala index 73638d9b131ea..378a361845139 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala @@ -23,7 +23,7 @@ class ShuffleNettySuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with Netty shuffle mode. - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() conf.set("spark.shuffle.blockTransferService", "netty") } diff --git a/core/src/test/scala/org/apache/spark/ShuffleOldFetchProtocolSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleOldFetchProtocolSuite.scala new file mode 100644 index 0000000000000..a878593ba601a --- /dev/null +++ b/core/src/test/scala/org/apache/spark/ShuffleOldFetchProtocolSuite.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import org.scalatest.BeforeAndAfterAll + +class ShuffleOldFetchProtocolSuite extends ShuffleSuite with BeforeAndAfterAll { + + // This test suite should run all tests by setting spark.shuffle.useOldFetchProtocol=true. + override def beforeAll(): Unit = { + super.beforeAll() + conf.set("spark.shuffle.useOldFetchProtocol", "true") + } +} diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 923c9c90447fd..9e39271bdf9ee 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -31,7 +31,7 @@ import org.apache.spark.scheduler.{MapStatus, MyRDD, SparkListener, SparkListene import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.ShuffleWriter import org.apache.spark.storage.{ShuffleBlockId, ShuffleDataBlockId, ShuffleIndexBlockId} -import org.apache.spark.util.{MutablePair, Utils} +import org.apache.spark.util.MutablePair abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkContext { @@ -44,7 +44,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC test("groupByKey without compression") { val myConf = 
conf.clone().set(config.SHUFFLE_COMPRESS, false) sc = new SparkContext("local", "test", myConf) - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1)), 4) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 4) val groups = pairs.groupByKey(4).collect() assert(groups.size === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 @@ -360,7 +360,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC val metricsSystem = sc.env.metricsSystem val shuffleMapRdd = new MyRDD(sc, 1, Nil) val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(1)) - val shuffleHandle = manager.registerShuffle(0, 1, shuffleDep) + val shuffleHandle = manager.registerShuffle(0, shuffleDep) mapTrackerMaster.registerShuffle(0, 1) // first attempt -- its successful @@ -487,7 +487,7 @@ object ShuffleSuite { @volatile var bytesWritten: Long = 0 @volatile var bytesRead: Long = 0 val listener = new SparkListener { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { recordsWritten += taskEnd.taskMetrics.shuffleWriteMetrics.recordsWritten bytesWritten += taskEnd.taskMetrics.shuffleWriteMetrics.bytesWritten recordsRead += taskEnd.taskMetrics.shuffleReadMetrics.recordsRead @@ -498,7 +498,7 @@ object ShuffleSuite { job - sc.listenerBus.waitUntilEmpty(500) + sc.listenerBus.waitUntilEmpty() AggregatedShuffleMetrics(recordsWritten, recordsRead, bytesWritten, bytesRead) } } diff --git a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala index 1aceda498d7c7..1a563621a5179 100644 --- a/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/SortShuffleSuite.scala @@ -37,7 +37,7 @@ class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { private var tempDir: File = _ - override def beforeAll() { + override def beforeAll(): Unit = { 
super.beforeAll() // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared // before/after a test, it could return the same directory even if this property diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index 9f00131c8dc20..3bc2061c4f2ad 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -449,13 +449,77 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst conf.remove(TASK_FPGA_ID.amountConf) // Ignore invalid prefix - conf.set(ResourceID("spark.invalid.prefix", FPGA).amountConf, "1") + conf.set(new ResourceID("spark.invalid.prefix", FPGA).amountConf, "1") taskResourceRequirement = parseResourceRequirements(conf, SPARK_TASK_PREFIX) .map(req => (req.resourceName, req.amount)).toMap assert(taskResourceRequirement.size == 1) assert(taskResourceRequirement.get(FPGA).isEmpty) } + + test("test task resource requirement with 0 amount") { + val conf = new SparkConf() + conf.set(TASK_GPU_ID.amountConf, "2") + conf.set(TASK_FPGA_ID.amountConf, "0") + var taskResourceRequirement = + parseResourceRequirements(conf, SPARK_TASK_PREFIX) + .map(req => (req.resourceName, req.amount)).toMap + + assert(taskResourceRequirement.size == 1) + assert(taskResourceRequirement(GPU) == 2) + } + + + test("Ensure that we can configure fractional resources for a task") { + val ratioSlots = Seq( + (0.10, 10), (0.11, 9), (0.125, 8), (0.14, 7), (0.16, 6), + (0.20, 5), (0.25, 4), (0.33, 3), (0.5, 2), (1.0, 1), + // if the amount is fractional greater than 0.5 and less than 1.0 we throw + (0.51, 1), (0.9, 1), + // if the amount is greater than one is not whole, we throw + (1.5, 0), (2.5, 0), + // it's ok if the amount is whole, and greater than 1 + // parts are 1 because we get a whole part of a resource + (2.0, 1), (3.0, 1), (4.0, 1)) + ratioSlots.foreach { + case (ratio, slots) 
=> + val conf = new SparkConf() + conf.set(TASK_GPU_ID.amountConf, ratio.toString) + if (ratio > 0.5 && ratio % 1 != 0) { + assertThrows[SparkException] { + parseResourceRequirements(conf, SPARK_TASK_PREFIX) + } + } else { + val reqs = parseResourceRequirements(conf, SPARK_TASK_PREFIX) + assert(reqs.size == 1) + assert(reqs.head.amount == Math.ceil(ratio).toInt) + assert(reqs.head.numParts == slots) + } + } + } + + test("Non-task resources are never fractional") { + val ratioSlots = Seq( + // if the amount provided is not a whole number, we throw + (0.25, 0), (0.5, 0), (1.5, 0), + // otherwise we are successful at parsing resources + (1.0, 1), (2.0, 2), (3.0, 3)) + ratioSlots.foreach { + case (ratio, slots) => + val conf = new SparkConf() + conf.set(EXECUTOR_GPU_ID.amountConf, ratio.toString) + if (ratio % 1 != 0) { + assertThrows[SparkException] { + parseResourceRequirements(conf, SPARK_EXECUTOR_PREFIX) + } + } else { + val reqs = parseResourceRequirements(conf, SPARK_EXECUTOR_PREFIX) + assert(reqs.size == 1) + assert(reqs.head.amount == slots) + assert(reqs.head.numParts == 1) + } + } + } } class Class1 {} @@ -463,7 +527,7 @@ class Class2 {} class Class3 {} class CustomRegistrator extends KryoRegistrator { - def registerClasses(kryo: Kryo) { + def registerClasses(kryo: Kryo): Unit = { kryo.register(classOf[Class2]) } } diff --git a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala index 536b4aec75623..6271ce507fddb 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextInfoSuite.scala @@ -63,7 +63,7 @@ class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2).cache() assert(sc.getRDDStorageInfo.length === 0) rdd.collect() - sc.listenerBus.waitUntilEmpty(10000) + sc.listenerBus.waitUntilEmpty() eventually(timeout(10.seconds), 
interval(100.milliseconds)) { assert(sc.getRDDStorageInfo.length === 1) } @@ -82,7 +82,7 @@ class SparkContextInfoSuite extends SparkFunSuite with LocalSparkContext { package object testPackage extends Assertions { private val CALL_SITE_REGEX = "(.+) at (.+):([0-9]+)".r - def runCallSiteTest(sc: SparkContext) { + def runCallSiteTest(sc: SparkContext): Unit = { val rdd = sc.makeRDD(Array(1, 2, 3, 4), 2) val rddCreationSite = rdd.getCreationSite val curCallSite = sc.getCallSite().shortForm // note: 2 lines after definition of "rdd" diff --git a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala index 811b9757232e2..0c72f770a787c 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSchedulerCreationSuite.scala @@ -46,7 +46,7 @@ class SparkContextSchedulerCreationSuite // real schedulers, so we don't want to create a full SparkContext with the desired scheduler. 
sc = new SparkContext("local", "test", conf) val createTaskSchedulerMethod = - PrivateMethod[Tuple2[SchedulerBackend, TaskScheduler]]('createTaskScheduler) + PrivateMethod[Tuple2[SchedulerBackend, TaskScheduler]](Symbol("createTaskScheduler")) val (_, sched) = SparkContext invokePrivate createTaskSchedulerMethod(sc, master, deployMode) try { diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 786f55c96a3e8..b6dfa69015c28 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -36,6 +36,7 @@ import org.scalatest.concurrent.Eventually import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ import org.apache.spark.internal.config.UI._ import org.apache.spark.resource.ResourceAllocation import org.apache.spark.resource.ResourceUtils._ @@ -233,6 +234,42 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } + test("SPARK-30126: addFile when file path contains spaces with recursive works") { + withTempDir { dir => + try { + val sep = File.separator + val tmpDir = Utils.createTempDir(dir.getAbsolutePath + sep + "test space") + val tmpConfFile1 = File.createTempFile("test file", ".conf", tmpDir) + + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.addFile(tmpConfFile1.getAbsolutePath, true) + + assert(sc.listFiles().size == 1) + assert(sc.listFiles().head.contains(new Path(tmpConfFile1.getName).toUri.toString)) + } finally { + sc.stop() + } + } + } + + test("SPARK-30126: addFile when file path contains spaces without recursive works") { + withTempDir { dir => + try { + val sep = File.separator + val tmpDir = Utils.createTempDir(dir.getAbsolutePath + sep + "test space") + val tmpConfFile2 = File.createTempFile("test file", ".conf", tmpDir) + + sc = new 
SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.addFile(tmpConfFile2.getAbsolutePath) + + assert(sc.listFiles().size == 1) + assert(sc.listFiles().head.contains(new Path(tmpConfFile2.getName).toUri.toString)) + } finally { + sc.stop() + } + } + } + test("addFile recursive can't add directories by default") { withTempDir { dir => try { @@ -294,6 +331,24 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } + test("SPARK-30126: add jar when path contains spaces") { + withTempDir { dir => + try { + val sep = File.separator + val tmpDir = Utils.createTempDir(dir.getAbsolutePath + sep + "test space") + val tmpJar = File.createTempFile("test", ".jar", tmpDir) + + sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) + sc.addJar(tmpJar.getAbsolutePath) + + assert(sc.listJars().size == 1) + assert(sc.listJars().head.contains(tmpJar.getName)) + } finally { + sc.stop() + } + } + } + test("add jar with invalid path") { withTempDir { tmpDir => val tmpJar = File.createTempFile("test", ".jar", tmpDir) @@ -450,7 +505,9 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) sc.setLocalProperty("testProperty", "testValue") var result = "unset"; - val thread = new Thread() { override def run() = {result = sc.getLocalProperty("testProperty")}} + val thread = new Thread() { + override def run(): Unit = {result = sc.getLocalProperty("testProperty")} + } thread.start() thread.join() sc.stop() @@ -461,10 +518,10 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) var result = "unset"; val thread1 = new Thread() { - override def run() = {sc.setLocalProperty("testProperty", "testValue")}} + override def run(): Unit = {sc.setLocalProperty("testProperty", "testValue")}} // testProperty should be 
unset and thus return null val thread2 = new Thread() { - override def run() = {result = sc.getLocalProperty("testProperty")}} + override def run(): Unit = {result = sc.getLocalProperty("testProperty")}} thread1.start() thread1.join() thread2.start() @@ -705,7 +762,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu if (context.stageAttemptNumber == 0) { if (context.partitionId == 0) { // Make the first task in the first stage attempt fail. - throw new FetchFailedException(SparkEnv.get.blockManager.blockManagerId, 0, 0, 0, + throw new FetchFailedException(SparkEnv.get.blockManager.blockManagerId, 0, 0L, 0, 0, new java.io.IOException("fake")) } else { // Make the second task in the first stage attempt sleep to generate a zombie task @@ -716,7 +773,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } x }.collect() - sc.listenerBus.waitUntilEmpty(10000) + sc.listenerBus.waitUntilEmpty() // As executors will send the metrics of running tasks via heartbeat, we can use this to check // whether there is any running task. 
eventually(timeout(10.seconds)) { @@ -728,7 +785,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } test(s"Avoid setting ${CPUS_PER_TASK.key} unreasonably (SPARK-27192)") { - val FAIL_REASON = s"has to be >= the task config: ${CPUS_PER_TASK.key}" + val FAIL_REASON = " has to be >= the number of cpus per task" Seq( ("local", 2, None), ("local[2]", 3, None), @@ -761,7 +818,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) // Ensure all executors has started - TestUtils.waitUntilExecutorsUp(sc, 1, 10000) + TestUtils.waitUntilExecutorsUp(sc, 1, 60000) assert(sc.resources.size === 1) assert(sc.resources.get(GPU).get.addresses === Array("5", "6")) assert(sc.resources.get(GPU).get.name === "gpu") @@ -790,7 +847,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) // Ensure all executors has started - TestUtils.waitUntilExecutorsUp(sc, 1, 10000) + TestUtils.waitUntilExecutorsUp(sc, 1, 60000) // driver gpu resources file should take precedence over the script assert(sc.resources.size === 1) assert(sc.resources.get(GPU).get.addresses === Array("0", "1", "8")) @@ -808,9 +865,8 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) }.getMessage() - assert(error.contains("The executor resource config: spark.executor.resource.gpu.amount " + - "needs to be specified since a task requirement config: spark.task.resource.gpu.amount " + - "was specified")) + assert(error.contains("No executor resource configs were not specified for the following " + + "task configs: gpu")) } test("Test parsing resources executor config < task requirements") { @@ -824,15 +880,15 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) }.getMessage() - assert(error.contains("The executor resource config: 
spark.executor.resource.gpu.amount = 1 " + - "has to be >= the requested amount in task resource config: " + - "spark.task.resource.gpu.amount = 2")) + assert(error.contains("The executor resource: gpu, amount: 1 needs to be >= the task " + + "resource request amount of 2.0")) } test("Parse resources executor config not the same multiple numbers of the task requirements") { val conf = new SparkConf() .setMaster("local-cluster[1, 1, 1024]") .setAppName("test-cluster") + conf.set(RESOURCES_WARNING_TESTING, true) conf.set(TASK_GPU_ID.amountConf, "2") conf.set(EXECUTOR_GPU_ID.amountConf, "4") @@ -840,9 +896,10 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu sc = new SparkContext(conf) }.getMessage() - assert(error.contains("The configuration of resource: gpu (exec = 4, task = 2) will result " + - "in wasted resources due to resource CPU limiting the number of runnable tasks per " + - "executor to: 1. Please adjust your configuration.")) + assert(error.contains( + "The configuration of resource: gpu (exec = 4, task = 2.0/1, runnable tasks = 2) will " + + "result in wasted resources due to resource cpus limiting the number of runnable " + + "tasks per executor to: 1. 
Please adjust your configuration.")) } test("test resource scheduling under local-cluster mode") { @@ -854,7 +911,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu """{"name": "gpu","addresses":["0", "1", "2", "3", "4", "5", "6", "7", "8"]}""") val conf = new SparkConf() - .setMaster("local-cluster[3, 3, 1024]") + .setMaster("local-cluster[3, 1, 1024]") .setAppName("test-cluster") .set(WORKER_GPU_ID.amountConf, "3") .set(WORKER_GPU_ID.discoveryScriptConf, discoveryScript) diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala index 9dd113262653b..cf4400e080e37 100644 --- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala @@ -20,15 +20,17 @@ package org.apache.spark // scalastyle:off import java.io.File -import scala.annotation.tailrec +import org.apache.log4j.spi.LoggingEvent -import org.apache.log4j.{Appender, Level, Logger} +import scala.annotation.tailrec +import org.apache.log4j.{Appender, AppenderSkeleton, Level, Logger} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome} - import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.util.{AccumulatorContext, Utils} +import scala.collection.mutable.ArrayBuffer + /** * Base abstract class for all unit tests in Spark for handling common functionality. * @@ -186,4 +188,19 @@ abstract class SparkFunSuite } } } + + class LogAppender(msg: String = "", maxEvents: Int = 1000) extends AppenderSkeleton { + val loggingEvents = new ArrayBuffer[LoggingEvent]() + + override def append(loggingEvent: LoggingEvent): Unit = { + if (loggingEvents.size >= maxEvents) { + val loggingInfo = if (msg == "") "." else s" while logging $msg." 
+ throw new IllegalStateException( + s"Number of events reached the limit of $maxEvents$loggingInfo") + } + loggingEvents.append(loggingEvent) + } + override def close(): Unit = {} + override def requiresLayout(): Boolean = false + } } diff --git a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala index 5cf9c087e1dcb..bb04d0d263253 100644 --- a/core/src/test/scala/org/apache/spark/ThreadingSuite.scala +++ b/core/src/test/scala/org/apache/spark/ThreadingSuite.scala @@ -29,7 +29,7 @@ object ThreadingSuiteState { val runningThreads = new AtomicInteger val failed = new AtomicBoolean - def clear() { + def clear(): Unit = { runningThreads.set(0) failed.set(false) } @@ -44,7 +44,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { @volatile var answer1: Int = 0 @volatile var answer2: Int = 0 new Thread { - override def run() { + override def run(): Unit = { answer1 = nums.reduce(_ + _) answer2 = nums.first() // This will run "locally" in the current thread sem.release() @@ -62,7 +62,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { @volatile var ok = true for (i <- 0 until 10) { new Thread { - override def run() { + override def run(): Unit = { val answer1 = nums.reduce(_ + _) if (answer1 != 55) { printf("In thread %d: answer1 was %d\n", i, answer1) @@ -90,7 +90,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { @volatile var ok = true for (i <- 0 until 10) { new Thread { - override def run() { + override def run(): Unit = { val answer1 = nums.reduce(_ + _) if (answer1 != 55) { printf("In thread %d: answer1 was %d\n", i, answer1) @@ -121,7 +121,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { var throwable: Option[Throwable] = None for (i <- 0 until 2) { new Thread { - override def run() { + override def run(): Unit = { try { val ans = nums.map(number => { val 
running = ThreadingSuiteState.runningThreads @@ -161,7 +161,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { - override def run() { + override def run(): Unit = { try { sc.setLocalProperty("test", i.toString) assert(sc.getLocalProperty("test") === i.toString) @@ -189,7 +189,7 @@ class ThreadingSuite extends SparkFunSuite with LocalSparkContext with Logging { var throwable: Option[Throwable] = None val threads = (1 to 5).map { i => new Thread() { - override def run() { + override def run(): Unit = { try { assert(sc.getLocalProperty("test") === "parent") sc.setLocalProperty("test", i.toString) diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 73f9d0e2bc0e1..9629f5ab1a3dd 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -26,6 +26,7 @@ import scala.util.Try import org.apache.commons.io.output.TeeOutputStream import org.apache.commons.lang3.SystemUtils +import org.scalatest.Assertions._ import org.apache.spark.util.Utils @@ -141,12 +142,14 @@ private[spark] class Benchmark( val minIters = if (overrideNumIters != 0) overrideNumIters else minNumIters val minDuration = if (overrideNumIters != 0) 0 else minTime.toNanos val runTimes = ArrayBuffer[Long]() + var totalTime = 0L var i = 0 - while (i < minIters || runTimes.sum < minDuration) { + while (i < minIters || totalTime < minDuration) { val timer = new Benchmark.Timer(i) f(timer) val runTime = timer.totalTime() runTimes += runTime + totalTime += runTime if (outputPerIteration) { // scalastyle:off diff --git a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala index a6666db4e95c3..55e34b32fe0d4 100644 --- 
a/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/BenchmarkBase.scala @@ -21,6 +21,7 @@ import java.io.{File, FileOutputStream, OutputStream} /** * A base class for generate benchmark results to a file. + * For JDK9+, JDK major version number is added to the file names to distinguish the results. */ abstract class BenchmarkBase { var output: Option[OutputStream] = None @@ -43,7 +44,9 @@ abstract class BenchmarkBase { def main(args: Array[String]): Unit = { val regenerateBenchmarkFiles: Boolean = System.getenv("SPARK_GENERATE_BENCHMARK_FILES") == "1" if (regenerateBenchmarkFiles) { - val resultFileName = s"${this.getClass.getSimpleName.replace("$", "")}-results.txt" + val version = System.getProperty("java.version").split("\\D+")(0).toInt + val jdkString = if (version > 8) s"-jdk$version" else "" + val resultFileName = s"${this.getClass.getSimpleName.replace("$", "")}$jdkString-results.txt" val file = new File(s"benchmarks/$resultFileName") if (!file.exists()) { file.createNewFile() diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 66b2f487dc1cb..a6776ee077894 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -194,11 +194,12 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio * In between each step, this test verifies that the broadcast blocks are present only on the * expected nodes. 
*/ - private def testUnpersistTorrentBroadcast(distributed: Boolean, removeFromDriver: Boolean) { + private def testUnpersistTorrentBroadcast(distributed: Boolean, + removeFromDriver: Boolean): Unit = { val numSlaves = if (distributed) 2 else 0 // Verify that blocks are persisted only on the driver - def afterCreation(broadcastId: Long, bmm: BlockManagerMaster) { + def afterCreation(broadcastId: Long, bmm: BlockManagerMaster): Unit = { var blockId = BroadcastBlockId(broadcastId) var statuses = bmm.getBlockStatus(blockId, askSlaves = true) assert(statuses.size === 1) @@ -209,7 +210,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio } // Verify that blocks are persisted in both the executors and the driver - def afterUsingBroadcast(broadcastId: Long, bmm: BlockManagerMaster) { + def afterUsingBroadcast(broadcastId: Long, bmm: BlockManagerMaster): Unit = { var blockId = BroadcastBlockId(broadcastId) val statuses = bmm.getBlockStatus(blockId, askSlaves = true) assert(statuses.size === numSlaves + 1) @@ -220,7 +221,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio // Verify that blocks are unpersisted on all executors, and on all nodes if removeFromDriver // is true. 
- def afterUnpersist(broadcastId: Long, bmm: BlockManagerMaster) { + def afterUnpersist(broadcastId: Long, bmm: BlockManagerMaster): Unit = { var blockId = BroadcastBlockId(broadcastId) var expectedNumBlocks = if (removeFromDriver) 0 else 1 var statuses = bmm.getBlockStatus(blockId, askSlaves = true) @@ -251,7 +252,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio afterCreation: (Long, BlockManagerMaster) => Unit, afterUsingBroadcast: (Long, BlockManagerMaster) => Unit, afterUnpersist: (Long, BlockManagerMaster) => Unit, - removeFromDriver: Boolean) { + removeFromDriver: Boolean): Unit = { sc = if (distributed) { val _sc = @@ -307,7 +308,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio package object testPackage extends Assertions { - def runCallSiteTest(sc: SparkContext) { + def runCallSiteTest(sc: SparkContext): Unit = { val broadcast = sc.broadcast(Array(1, 2, 3, 4)) broadcast.destroy(blocking = true) val thrown = intercept[SparkException] { broadcast.value } diff --git a/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceDbSuite.scala b/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceDbSuite.scala index 9cfb8a647ad89..6914714dce6eb 100644 --- a/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceDbSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/ExternalShuffleServiceDbSuite.scala @@ -46,7 +46,7 @@ class ExternalShuffleServiceDbSuite extends SparkFunSuite { var blockHandler: ExternalBlockHandler = _ var blockResolver: ExternalShuffleBlockResolver = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() sparkConf = new SparkConf() sparkConf.set("spark.shuffle.service.enabled", "true") @@ -63,7 +63,7 @@ class ExternalShuffleServiceDbSuite extends SparkFunSuite { registerExecutor() } - override def afterAll() { + override def afterAll(): Unit = { try { dataContext.cleanup() } finally { diff --git 
a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala index ad402c0e905ae..eeccf56cbf02e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/JsonProtocolSuite.scala @@ -89,7 +89,7 @@ class JsonProtocolSuite extends SparkFunSuite with JsonTestUtils { assertValidDataInJson(output, JsonMethods.parse(JsonConstants.workerStateJsonStr)) } - def assertValidJson(json: JValue) { + def assertValidJson(json: JValue): Unit = { try { JsonMethods.parse(JsonMethods.compact(json)) } catch { diff --git a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala index cbdf1755b0c5b..84fc16979925b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/LogUrlsStandaloneSuite.scala @@ -29,9 +29,6 @@ import org.apache.spark.util.SparkConfWithEnv class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext { - /** Length of time to wait while draining listener events. 
*/ - private val WAIT_TIMEOUT_MILLIS = 10000 - test("verify that correct log urls get propagated from workers") { sc = new SparkContext("local-cluster[2,1,1024]", "test") @@ -41,7 +38,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext { // Trigger a job so that executors get added sc.parallelize(1 to 100, 4).map(_.toString).count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.addedExecutorInfos.values.foreach { info => assert(info.logUrlMap.nonEmpty) // Browse to each URL to check that it's valid @@ -61,7 +58,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext { // Trigger a job so that executors get added sc.parallelize(1 to 100, 4).map(_.toString).count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo] assert(listeners.size === 1) val listener = listeners(0) @@ -77,7 +74,7 @@ class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext { private[spark] class SaveExecutorInfo extends SparkListener { val addedExecutorInfos = mutable.Map[String, ExecutorInfo]() - override def onExecutorAdded(executor: SparkListenerExecutorAdded) { + override def onExecutorAdded(executor: SparkListenerExecutorAdded): Unit = { addedExecutorInfos(executor.executorId) = executor.executorInfo } } diff --git a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala index ef947eb074647..d04d9b6dcb2be 100644 --- a/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/RPackageUtilsSuite.scala @@ -58,7 +58,7 @@ class RPackageUtilsSuite /** Simple PrintStream that reads data into a buffer */ private class BufferPrintStream extends PrintStream(noOpOutputStream) { // scalastyle:off println - override def println(line: String) 
{ + override def println(line: String): Unit = { // scalastyle:on println lineBuffer += line } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 385f549aa1ad9..9d4736825618e 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -39,11 +39,11 @@ import org.apache.spark.TestUtils.JavaSourceFromString import org.apache.spark.api.r.RUtils import org.apache.spark.deploy.SparkSubmit._ import org.apache.spark.deploy.SparkSubmitUtils.MavenCoordinate +import org.apache.spark.deploy.history.EventLogFileReader import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.internal.config.UI._ import org.apache.spark.launcher.SparkLauncher -import org.apache.spark.scheduler.EventLoggingListener import org.apache.spark.util.{CommandLineUtils, ResetSystemProperties, Utils} trait TestPrematureExit { @@ -57,7 +57,7 @@ trait TestPrematureExit { private class BufferPrintStream extends PrintStream(noOpOutputStream) { var lineBuffer = ArrayBuffer[String]() // scalastyle:off println - override def println(line: String) { + override def println(line: String): Unit = { lineBuffer += line } // scalastyle:on println @@ -121,7 +121,7 @@ class SparkSubmitSuite private val submit = new SparkSubmit() - override def beforeEach() { + override def beforeEach(): Unit = { super.beforeEach() } @@ -453,6 +453,83 @@ class SparkSubmitSuite conf.get("spark.kubernetes.driver.container.image") should be ("bar") } + /** + * Helper function for testing main class resolution on remote JAR files. 
+ * + * @param tempDir path to temporary directory + * @param deployMode either "client" or "cluster" + * @return a pair of the JAR file and the 4-tuple returned by + * [[org.apache.spark.deploy.SparkSubmit#prepareSubmitEnvironment]] + */ + private def testResolveMainClassOnRemoteJar( + tempDir: File, + deployMode: String + ): (File, (Seq[String], Seq[String], SparkConf, String)) = { + val excFile = TestUtils.createCompiledClass("SomeMainClass", tempDir, "", null, Seq.empty) + val jarFile = new File(tempDir, "s3-mainClass-test-%s.jar".format(System.currentTimeMillis())) + val jarUrl = TestUtils.createJar( + Seq(excFile), + jarFile, + directoryPrefix = Some(tempDir.toString), + mainClass = Some("SomeMainClass")) + + val hadoopConf = new Configuration() + updateConfWithFakeS3Fs(hadoopConf) + + val clArgs = Seq( + "--name", "testApp", + "--master", "yarn", + "--deploy-mode", deployMode, + "--conf", "spark.hadoop.fs.s3a.impl=org.apache.spark.deploy.TestFileSystem", + "--conf", "spark.hadoop.fs.s3a.impl.disable.cache=true", + s"s3a://${jarUrl.getPath}", + "arg1", "arg2") + + val appArgs = new SparkSubmitArguments(clArgs) + (jarFile, submit.prepareSubmitEnvironment(appArgs, conf = Some(hadoopConf))) + } + + test("automatically sets mainClass if primary resource is S3 JAR in client mode") { + withTempDir { tempDir => + val (jarFile, (childArgs, classpaths, _, mainClass_)) = testResolveMainClassOnRemoteJar( + tempDir, "client" + ) + + mainClass_ should be ("SomeMainClass") + classpaths should have length 1 + classpaths.head should endWith (jarFile.getName) + childArgs.mkString(" ") should be ("arg1 arg2") + } + } + + test("automatically sets mainClass if primary resource is S3 JAR in cluster mode") { + withTempDir { tempDir => + val (jarFile, (childArgs, classpaths, _, mainClass_)) = testResolveMainClassOnRemoteJar( + tempDir, "cluster" + ) + + mainClass_ should be (YARN_CLUSTER_SUBMIT_CLASS) + classpaths should have length 1 + classpaths.head should endWith 
(jarFile.getName) + childArgs.mkString(" ") should include ("--class SomeMainClass") + childArgs.mkString(" ") should endWith ("--arg arg1 --arg arg2") + } + } + + test("error informatively when mainClass isn't set and S3 JAR doesn't exist") { + val hadoopConf = new Configuration() + updateConfWithFakeS3Fs(hadoopConf) + + val clArgs = Seq( + "--name", "testApp", + "--master", "yarn", + "--conf", "spark.hadoop.fs.s3a.impl=org.apache.spark.deploy.TestFileSystem", + "--conf", "spark.hadoop.fs.s3a.impl.disable.cache=true", + s"s3a:///does-not-exist.jar") + + testPrematureExit(clArgs.toArray, "File /does-not-exist.jar does not exist") + } + test("handles confs with flag equivalents") { val clArgs = Seq( "--deploy-mode", "cluster", @@ -535,7 +612,7 @@ class SparkSubmitSuite unusedJar.toString) runSparkSubmit(args) val listStatus = fileSystem.listStatus(testDirPath) - val logData = EventLoggingListener.openEventLog(listStatus.last.getPath, fileSystem) + val logData = EventLogFileReader.openEventLog(listStatus.last.getPath, fileSystem) Source.fromInputStream(logData).getLines().foreach { line => assert(!line.contains("secret_password")) } @@ -600,7 +677,7 @@ class SparkSubmitSuite } // TODO(SPARK-9603): Building a package is flaky on Jenkins Maven builds. 
- // See https://gist.github.com/shivaram/3a2fecce60768a603dac for a error log + // See https://gist.github.com/shivaram/3a2fecce60768a603dac for an error log ignore("correctly builds R packages included in a jar with --packages") { assume(RUtils.isRInstalled, "R isn't installed on this machine.") assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") @@ -1365,7 +1442,7 @@ object SparkSubmitSuite extends SparkFunSuite with TimeLimits { } object JarCreationTest extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() val sc = new SparkContext(conf) @@ -1389,7 +1466,7 @@ object JarCreationTest extends Logging { } object SimpleApplicationTest { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() val sc = new SparkContext(conf) @@ -1415,7 +1492,7 @@ object SimpleApplicationTest { } object UserClasspathFirstTest { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val ccl = Thread.currentThread().getContextClassLoader() val resource = ccl.getResourceAsStream("test.resource") val bytes = ByteStreams.toByteArray(resource) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala index 8e1a519e187ce..31e6c730eadc0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala @@ -44,13 +44,13 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with BeforeAndAfterAll { private class BufferPrintStream extends PrintStream(noOpOutputStream) { var lineBuffer = ArrayBuffer[String]() // scalastyle:off println - override def println(line: String) { + override def println(line: String): Unit = { lineBuffer += line } // scalastyle:on println } - override def 
beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() // We don't want to write logs during testing SparkSubmitUtils.printStream = new BufferPrintStream diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 9bf7714ed77dd..f8b99302c4ad5 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -26,15 +26,16 @@ import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester} import org.scalatest.concurrent.Eventually._ import org.apache.spark._ -import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} +import org.apache.spark.deploy.DeployMessages._ import org.apache.spark.deploy.master.ApplicationInfo import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.config +import org.apache.spark.resource.ResourceProfile import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv} import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.scheduler.cluster._ -import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor, RegisterExecutorFailed} +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{LaunchedExecutor, RegisterExecutor} /** * End-to-end tests for dynamic allocation in standalone mode. 
@@ -437,8 +438,8 @@ class StandaloneDynamicAllocationSuite assert(executors.size === 2) // simulate running a task on the executor - val getMap = - PrivateMethod[mutable.HashMap[String, mutable.HashSet[Long]]]('executorIdToRunningTaskIds) + val getMap = PrivateMethod[mutable.HashMap[String, mutable.HashSet[Long]]]( + Symbol("executorIdToRunningTaskIds")) val taskScheduler = sc.taskScheduler.asInstanceOf[TaskSchedulerImpl] val executorIdToRunningTaskIds = taskScheduler invokePrivate getMap() executorIdToRunningTaskIds(executors.head) = mutable.HashSet(1L) @@ -482,12 +483,16 @@ class StandaloneDynamicAllocationSuite assert(apps.head.getExecutorLimit === Int.MaxValue) } val beforeList = getApplications().head.executors.keys.toSet - assert(killExecutorsOnHost(sc, "localhost").equals(true)) - syncExecutors(sc) - val afterList = getApplications().head.executors.keys.toSet + + sc.schedulerBackend match { + case b: CoarseGrainedSchedulerBackend => + b.killExecutorsOnHost("localhost") + case _ => fail("expected coarse grained scheduler") + } eventually(timeout(10.seconds), interval(100.millis)) { + val afterList = getApplications().head.executors.keys.toSet assert(beforeList.intersect(afterList).size == 0) } } @@ -501,11 +506,12 @@ class StandaloneDynamicAllocationSuite val mockAddress = mock(classOf[RpcAddress]) when(endpointRef.address).thenReturn(mockAddress) val message = RegisterExecutor("one", endpointRef, "blacklisted-host", 10, Map.empty, - Map.empty, Map.empty) + Map.empty, Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val taskScheduler = mock(classOf[TaskSchedulerImpl]) when(taskScheduler.nodeBlacklist()).thenReturn(Set("blacklisted-host")) when(taskScheduler.resourceOffers(any())).thenReturn(Nil) + when(taskScheduler.resourcesReqsPerTask).thenReturn(Seq.empty) when(taskScheduler.sc).thenReturn(sc) val rpcEnv = RpcEnv.create("test-rpcenv", "localhost", 0, conf, securityManager) @@ -513,10 +519,11 @@ class StandaloneDynamicAllocationSuite val scheduler = 
new CoarseGrainedSchedulerBackend(taskScheduler, rpcEnv) try { scheduler.start() - scheduler.driverEndpoint.ask[Boolean](message) - eventually(timeout(10.seconds), interval(100.millis)) { - verify(endpointRef).send(RegisterExecutorFailed(any())) + val e = intercept[SparkException] { + scheduler.driverEndpoint.askSync[Boolean](message) } + assert(e.getCause().isInstanceOf[IllegalStateException]) + assert(scheduler.getExecutorIds().isEmpty) } finally { scheduler.stop() } @@ -535,6 +542,11 @@ class StandaloneDynamicAllocationSuite .setMaster(masterRpcEnv.address.toSparkURL) .setAppName("test") .set(config.EXECUTOR_MEMORY.key, "256m") + // Because we're faking executor launches in the Worker, set the config so that the driver + // will not timeout anything related to executors. + .set(config.Network.NETWORK_TIMEOUT.key, "2h") + .set(config.EXECUTOR_HEARTBEAT_INTERVAL.key, "1h") + .set(config.STORAGE_BLOCKMANAGER_SLAVE_TIMEOUT.key, "1h") } /** Make a master to which our application will send executor requests. */ @@ -548,8 +560,7 @@ class StandaloneDynamicAllocationSuite private def makeWorkers(cores: Int, memory: Int): Seq[Worker] = { (0 until numWorkers).map { i => val rpcEnv = workerRpcEnvs(i) - val worker = new Worker(rpcEnv, 0, cores, memory, Array(masterRpcEnv.address), - Worker.ENDPOINT_NAME, null, conf, securityManager) + val worker = new TestWorker(rpcEnv, cores, memory) rpcEnv.setupEndpoint(Worker.ENDPOINT_NAME, worker) worker } @@ -587,16 +598,6 @@ class StandaloneDynamicAllocationSuite } } - /** Kill the executors on a given host. */ - private def killExecutorsOnHost(sc: SparkContext, host: String): Boolean = { - syncExecutors(sc) - sc.schedulerBackend match { - case b: CoarseGrainedSchedulerBackend => - b.killExecutorsOnHost(host) - case _ => fail("expected coarse grained scheduler") - } - } - /** * Return a list of executor IDs belonging to this application. * @@ -619,9 +620,8 @@ class StandaloneDynamicAllocationSuite * we submit a request to kill them. 
This must be called before each kill request. */ private def syncExecutors(sc: SparkContext): Unit = { - val driverExecutors = sc.env.blockManager.master.getStorageStatus - .map(_.blockManagerId.executorId) - .filter { _ != SparkContext.DRIVER_IDENTIFIER} + val backend = sc.schedulerBackend.asInstanceOf[CoarseGrainedSchedulerBackend] + val driverExecutors = backend.getExecutorIds() val masterExecutors = getExecutorIds(sc) val missingExecutors = masterExecutors.toSet.diff(driverExecutors.toSet).toSeq.sorted missingExecutors.foreach { id => @@ -630,10 +630,30 @@ class StandaloneDynamicAllocationSuite val mockAddress = mock(classOf[RpcAddress]) when(endpointRef.address).thenReturn(mockAddress) val message = RegisterExecutor(id, endpointRef, "localhost", 10, Map.empty, Map.empty, - Map.empty) - val backend = sc.schedulerBackend.asInstanceOf[CoarseGrainedSchedulerBackend] + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) backend.driverEndpoint.askSync[Boolean](message) + backend.driverEndpoint.send(LaunchedExecutor(id)) + } + } + + /** + * Worker implementation that does not actually launch any executors, but reports them as + * running so the Master keeps track of them. This requires that `syncExecutors` be used + * to make sure the Master instance and the SparkContext under test agree about what + * executors are running. 
+ */ + private class TestWorker(rpcEnv: RpcEnv, cores: Int, memory: Int) + extends Worker( + rpcEnv, 0, cores, memory, Array(masterRpcEnv.address), Worker.ENDPOINT_NAME, + null, conf, securityManager) { + + override def receive: PartialFunction[Any, Unit] = testReceive.orElse(super.receive) + + private def testReceive: PartialFunction[Any, Unit] = synchronized { + case LaunchExecutor(_, appId, execId, _, _, _, _) => + self.send(ExecutorStateChanged(appId, execId, ExecutorState.RUNNING, None, None)) } + } } diff --git a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala index a1d3077b8fc87..a3e39d7f53728 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.master.{ApplicationInfo, Master} import org.apache.spark.deploy.worker.Worker -import org.apache.spark.internal.Logging +import org.apache.spark.internal.{config, Logging} import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils @@ -44,13 +44,13 @@ class AppClientSuite with Eventually with ScalaFutures { private val numWorkers = 2 - private val conf = new SparkConf() - private val securityManager = new SecurityManager(conf) + private var conf: SparkConf = null private var masterRpcEnv: RpcEnv = null private var workerRpcEnvs: Seq[RpcEnv] = null private var master: Master = null private var workers: Seq[Worker] = null + private var securityManager: SecurityManager = null /** * Start the local cluster. 
@@ -58,6 +58,8 @@ class AppClientSuite */ override def beforeAll(): Unit = { super.beforeAll() + conf = new SparkConf().set(config.Worker.WORKER_DECOMMISSION_ENABLED.key, "true") + securityManager = new SecurityManager(conf) masterRpcEnv = RpcEnv.create(Master.SYSTEM_NAME, "localhost", 0, conf, securityManager) workerRpcEnvs = (0 until numWorkers).map { i => RpcEnv.create(Worker.SYSTEM_NAME + i, "localhost", 0, conf, securityManager) @@ -111,8 +113,23 @@ class AppClientSuite assert(apps.head.getExecutorLimit === numExecutorsRequested, s"executor request failed") } + + // Save the executor id before decommissioning so we can kill it + val application = getApplications().head + val executors = application.executors + val executorId: String = executors.head._2.fullId + + // Send a decommission self to all the workers + // Note: normally the worker would send this on their own. + workers.foreach(worker => worker.decommissionSelf()) + + // Decommissioning is async. + eventually(timeout(1.seconds), interval(10.millis)) { + // We only record decommissioning for the executor we've requested + assert(ci.listener.execDecommissionedList.size === 1) + } + // Send request to kill executor, verify request was made - val executorId: String = getApplications().head.executors.head._2.fullId whenReady( ci.client.killExecutors(Seq(executorId)), timeout(10.seconds), @@ -120,6 +137,15 @@ class AppClientSuite assert(acknowledged) } + // Verify that asking for executors on the decommissioned workers fails + whenReady( + ci.client.requestTotalExecutors(numExecutorsRequested), + timeout(10.seconds), + interval(10.millis)) { acknowledged => + assert(acknowledged) + } + assert(getApplications().head.executors.size === 0) + // Issue stop command for Client to disconnect from Master ci.client.stop() @@ -189,6 +215,7 @@ class AppClientSuite val deadReasonList = new ConcurrentLinkedQueue[String]() val execAddedList = new ConcurrentLinkedQueue[String]() val execRemovedList = new 
ConcurrentLinkedQueue[String]() + val execDecommissionedList = new ConcurrentLinkedQueue[String]() def connected(id: String): Unit = { connectedIdList.add(id) @@ -218,6 +245,10 @@ class AppClientSuite execRemovedList.add(id) } + def executorDecommissioned(id: String, message: String): Unit = { + execDecommissionedList.add(id) + } + def workerRemoved(workerId: String, host: String, message: String): Unit = {} } diff --git a/core/src/test/scala/org/apache/spark/deploy/client/TestExecutor.scala b/core/src/test/scala/org/apache/spark/deploy/client/TestExecutor.scala index a98b1fa8f83a1..1dce49d1f9d5a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/TestExecutor.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/TestExecutor.scala @@ -18,7 +18,7 @@ package org.apache.spark.deploy.client private[spark] object TestExecutor { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { // scalastyle:off println println("Hello world!") // scalastyle:on println diff --git a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala index 1148446c9faa1..48bd088d07ff9 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/ApplicationCacheSuite.scala @@ -28,7 +28,7 @@ import org.mockito.ArgumentMatchers.any import org.mockito.Mockito._ import org.mockito.invocation.InvocationOnMock import org.scalatest.Matchers -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging diff --git a/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala new file mode 100644 index 0000000000000..c905797bf1287 --- /dev/null +++ 
b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterBuilderSuite.scala @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import org.apache.spark.{SparkFunSuite, Success, TaskResultLost, TaskState} +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.scheduler._ +import org.apache.spark.status.ListenerEventsTestHelper + +class BasicEventFilterBuilderSuite extends SparkFunSuite { + import ListenerEventsTestHelper._ + + override protected def beforeEach(): Unit = { + ListenerEventsTestHelper.reset() + } + + test("track live jobs") { + var time = 0L + + val listener = new BasicEventFilterBuilder + listener.onOtherEvent(SparkListenerLogStart("TestSparkVersion")) + + // Start the application. + time += 1 + listener.onApplicationStart(SparkListenerApplicationStart( + "name", + Some("id"), + time, + "user", + Some("attempt"), + None)) + + // Start a couple of executors. 
+ time += 1 + val execIds = Array("1", "2") + execIds.foreach { id => + listener.onExecutorAdded(createExecutorAddedEvent(id, time)) + } + + // Start a job with 2 stages / 4 tasks each + time += 1 + + val rddsForStage0 = createRdds(2) + val rddsForStage1 = createRdds(2) + + val stage0 = createStage(rddsForStage0, Nil) + val stage1 = createStage(rddsForStage1, Seq(stage0.stageId)) + val stages = Seq(stage0, stage1) + + val jobProps = createJobProps() + listener.onJobStart(SparkListenerJobStart(1, time, stages, jobProps)) + + // Submit stage 0 + time += 1 + stages.head.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stages.head, jobProps)) + + // Start tasks from stage 0 + time += 1 + + val s0Tasks = ListenerEventsTestHelper.createTasks(4, execIds, time) + s0Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, + stages.head.attemptNumber(), task)) + } + + // Fail one of the tasks, re-start it. + time += 1 + s0Tasks.head.markFinished(TaskState.FAILED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, + "taskType", TaskResultLost, s0Tasks.head, new ExecutorMetrics, null)) + + time += 1 + val reattempt = createTaskWithNewAttempt(s0Tasks.head, time) + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, + reattempt)) + + // Succeed all tasks in stage 0. + val pending = s0Tasks.drop(1) ++ Seq(reattempt) + + time += 1 + pending.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, + "taskType", Success, task, new ExecutorMetrics, TaskMetrics.empty)) + } + + // End stage 0. 
+ time += 1 + stages.head.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stages.head)) + + assert(listener.liveJobs === Set(1)) + assert(listener.liveStages === Set(0)) + // stage 1 not yet submitted - RDDs for stage 1 is not available + assert(listener.liveRDDs === rddsForStage0.map(_.id).toSet) + assert(listener.liveTasks === (s0Tasks ++ Seq(reattempt)).map(_.taskId).toSet) + + // Submit stage 1. + time += 1 + stages.last.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stages.last, jobProps)) + + // Start and fail all tasks of stage 1. + time += 1 + val s1Tasks = createTasks(4, execIds, time) + s1Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, + stages.last.attemptNumber, + task)) + } + + time += 1 + s1Tasks.foreach { task => + task.markFinished(TaskState.FAILED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptNumber, + "taskType", TaskResultLost, task, new ExecutorMetrics, null)) + } + + // Fail stage 1. + time += 1 + stages.last.completionTime = Some(time) + stages.last.failureReason = Some("uh oh") + listener.onStageCompleted(SparkListenerStageCompleted(stages.last)) + + // - Re-submit stage 1, all tasks, and succeed them and the stage. 
+ val oldS1 = stages.last + val newS1 = new StageInfo(oldS1.stageId, oldS1.attemptNumber + 1, oldS1.name, oldS1.numTasks, + oldS1.rddInfos, oldS1.parentIds, oldS1.details, oldS1.taskMetrics, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + + time += 1 + newS1.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(newS1, jobProps)) + + val newS1Tasks = createTasks(4, execIds, time) + + newS1Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(newS1.stageId, newS1.attemptNumber, task)) + } + + time += 1 + newS1Tasks.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(newS1.stageId, newS1.attemptNumber, "taskType", + Success, task, new ExecutorMetrics, null)) + } + + time += 1 + newS1.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(newS1)) + + assert(listener.liveJobs === Set(1)) + assert(listener.liveStages === Set(0, 1)) + // stage 0 and 1 are finished but it stores the information regarding stage + assert(listener.liveRDDs === (rddsForStage0.map(_.id) ++ rddsForStage1.map(_.id)).toSet) + assert(listener.liveTasks === + (s0Tasks ++ Seq(reattempt) ++ s1Tasks ++ newS1Tasks).map(_.taskId).toSet) + + // Start next job. + time += 1 + + val rddsForStage2 = createRdds(2) + val rddsForStage3 = createRdds(2) + + val stage3 = createStage(rddsForStage2, Nil) + val stage4 = createStage(rddsForStage3, Seq(stage3.stageId)) + val stagesForJob2 = Seq(stage3, stage4) + + listener.onJobStart(SparkListenerJobStart(2, time, stagesForJob2, jobProps)) + + // End job 1. 
+ time += 1 + listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) + + // everything related to job 1 should be cleaned up, but not for job 2 + assert(listener.liveJobs === Set(2)) + assert(listener.liveStages.isEmpty) + // no RDD information available as these stages are not submitted yet + assert(listener.liveRDDs.isEmpty) + // stageToTasks has no information for job 2, as no task has been started + assert(listener.liveTasks.isEmpty) + } + + test("track live executors") { + var time = 0L + + val listener = new BasicEventFilterBuilder + listener.onOtherEvent(SparkListenerLogStart("TestSparkVersion")) + + // Start the application. + time += 1 + listener.onApplicationStart(SparkListenerApplicationStart( + "name", + Some("id"), + time, + "user", + Some("attempt"), + None)) + + // Start a couple of executors. + time += 1 + val execIds = (1 to 3).map(_.toString) + execIds.foreach { id => + listener.onExecutorAdded(createExecutorAddedEvent(id, time)) + } + + // End one of executors. + time += 1 + listener.onExecutorRemoved(createExecutorRemovedEvent(execIds.head, time)) + + assert(listener.liveExecutors === execIds.drop(1).toSet) + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterSuite.scala new file mode 100644 index 0000000000000..2da40dccba53e --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/BasicEventFilterSuite.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import org.apache.spark.{storage, SparkContext, SparkFunSuite, Success, TaskState} +import org.apache.spark.deploy.history.EventFilter.FilterStatistics +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.status.ListenerEventsTestHelper._ +import org.apache.spark.storage.{BlockManagerId, RDDBlockId, StorageLevel} + +class BasicEventFilterSuite extends SparkFunSuite { + import BasicEventFilterSuite._ + + test("filter out events for finished jobs") { + // assume finished job 1 with stage 1, tasks (1, 2), rdds (1, 2) + // live job 2 with stage 2 with tasks (3, 4) & rdds (3, 4), + // and stage 3 with tasks (5, 6) & rdds (5, 6) + val liveJobs = Set(2) + val liveStages = Set(2, 3) + val liveTasks = Set(3L, 4L, 5L, 6L) + val liveRDDs = Set(3, 4, 5, 6) + val liveExecutors: Set[String] = Set("1", "2") + val filterStats = FilterStatistics( + // counts finished job 1 + liveJobs.size + 1, + liveJobs.size, + // counts finished stage 1 for job 1 + liveStages.size + 1, + liveStages.size, + // counts finished tasks (1, 2) for job 1 + liveTasks.size + 2, + liveTasks.size) + + val filter = new BasicEventFilter(filterStats, liveJobs, liveStages, liveTasks, liveRDDs, + liveExecutors) + val acceptFn = filter.acceptFn().lift + + // Verifying with finished job 1 + val rddsForStage1 = createRddsWithId(1 to 2) + val stage1 = createStage(1, rddsForStage1, Nil) + val tasksForStage1 = createTasks(Seq(1L, 2L), liveExecutors.toArray, 0) + tasksForStage1.foreach { task => 
task.markFinished(TaskState.FINISHED, 5) } + + val jobStartEventForJob1 = SparkListenerJobStart(1, 0, Seq(stage1)) + val jobEndEventForJob1 = SparkListenerJobEnd(1, 0, JobSucceeded) + val stageSubmittedEventsForJob1 = SparkListenerStageSubmitted(stage1) + val stageCompletedEventsForJob1 = SparkListenerStageCompleted(stage1) + val unpersistRDDEventsForJob1 = (1 to 2).map(SparkListenerUnpersistRDD) + + // job events for finished job should be rejected + assert(Some(false) === acceptFn(jobStartEventForJob1)) + assert(Some(false) === acceptFn(jobEndEventForJob1)) + + // stage events for finished job should be rejected + // NOTE: it doesn't filter out stage events which are also related to the executor + assertFilterStageEvents( + acceptFn, + stageSubmittedEventsForJob1, + stageCompletedEventsForJob1, + unpersistRDDEventsForJob1, + SparkListenerSpeculativeTaskSubmitted(stage1.stageId, stageAttemptId = 1), + Some(false)) + + // task events for finished job should be rejected + assertFilterTaskEvents(acceptFn, tasksForStage1, stage1, Some(false)) + + // Verifying with live job 2 + val rddsForStage2 = createRddsWithId(3 to 4) + val stage2 = createStage(2, rddsForStage2, Nil) + val tasksForStage2 = createTasks(Seq(3L, 4L), liveExecutors.toArray, 0) + tasksForStage1.foreach { task => task.markFinished(TaskState.FINISHED, 5) } + + val jobStartEventForJob2 = SparkListenerJobStart(2, 0, Seq(stage2)) + val stageSubmittedEventsForJob2 = SparkListenerStageSubmitted(stage2) + val stageCompletedEventsForJob2 = SparkListenerStageCompleted(stage2) + val unpersistRDDEventsForJob2 = rddsForStage2.map { rdd => SparkListenerUnpersistRDD(rdd.id) } + + // job events for live job should be accepted + assert(Some(true) === acceptFn(jobStartEventForJob2)) + + // stage events for live job should be accepted + assertFilterStageEvents( + acceptFn, + stageSubmittedEventsForJob2, + stageCompletedEventsForJob2, + unpersistRDDEventsForJob2, + SparkListenerSpeculativeTaskSubmitted(stage2.stageId, 
stageAttemptId = 1), + Some(true)) + + // task events for live job should be accepted + assertFilterTaskEvents(acceptFn, tasksForStage2, stage2, Some(true)) + } + + test("accept all events for block manager addition/removal on driver") { + val filter = new BasicEventFilter(EMPTY_STATS, Set.empty, Set.empty, Set.empty, Set.empty, + Set.empty) + val acceptFn = filter.acceptFn().lift + + val bmId = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "host1", 1) + assert(Some(true) === acceptFn(SparkListenerBlockManagerAdded(0, bmId, 1))) + assert(Some(true) === acceptFn(SparkListenerBlockManagerRemoved(1, bmId))) + assert(Some(true) === acceptFn(SparkListenerBlockUpdated( + storage.BlockUpdatedInfo(bmId, RDDBlockId(1, 1), StorageLevel.DISK_ONLY, 0, 10)))) + } + + test("filter out events for dead executors") { + // assume executor 1 was dead, and live executor 2 is available + val liveExecutors: Set[String] = Set("2") + + val filter = new BasicEventFilter(EMPTY_STATS, Set.empty, Set.empty, Set.empty, Set.empty, + liveExecutors) + val acceptFn = filter.acceptFn().lift + + // events for dead executor should be rejected + assert(Some(false) === acceptFn(createExecutorAddedEvent(1))) + // though the name of event is stage executor metrics, AppStatusListener only deals with + // live executors + assert(Some(false) === acceptFn( + SparkListenerStageExecutorMetrics(1.toString, 0, 0, new ExecutorMetrics))) + assert(Some(false) === acceptFn(SparkListenerExecutorBlacklisted(0, 1.toString, 1))) + assert(Some(false) === acceptFn(SparkListenerExecutorUnblacklisted(0, 1.toString))) + assert(Some(false) === acceptFn(createExecutorRemovedEvent(1))) + val bmId = BlockManagerId(1.toString, "host1", 1) + assert(Some(false) === acceptFn(SparkListenerBlockManagerAdded(0, bmId, 1))) + assert(Some(false) === acceptFn(SparkListenerBlockManagerRemoved(1, bmId))) + assert(Some(false) === acceptFn(SparkListenerBlockUpdated( + storage.BlockUpdatedInfo(bmId, RDDBlockId(1, 1), StorageLevel.DISK_ONLY, 0, 
10)))) + + // events for live executor should be accepted + assert(Some(true) === acceptFn(createExecutorAddedEvent(2))) + assert(Some(true) === acceptFn( + SparkListenerStageExecutorMetrics(2.toString, 0, 0, new ExecutorMetrics))) + assert(Some(true) === acceptFn(SparkListenerExecutorBlacklisted(0, 2.toString, 1))) + assert(Some(true) === acceptFn(SparkListenerExecutorUnblacklisted(0, 2.toString))) + assert(Some(true) === acceptFn(createExecutorRemovedEvent(2))) + val bmId2 = BlockManagerId(2.toString, "host1", 1) + assert(Some(true) === acceptFn(SparkListenerBlockManagerAdded(0, bmId2, 1))) + assert(Some(true) === acceptFn(SparkListenerBlockManagerRemoved(1, bmId2))) + assert(Some(true) === acceptFn(SparkListenerBlockUpdated( + storage.BlockUpdatedInfo(bmId2, RDDBlockId(1, 1), StorageLevel.DISK_ONLY, 0, 10)))) + } + + test("other events should be left to other filters") { + val filter = new BasicEventFilter(EMPTY_STATS, Set.empty, Set.empty, Set.empty, Set.empty, + Set.empty) + val acceptFn = filter.acceptFn().lift + + assert(None === acceptFn(SparkListenerEnvironmentUpdate(Map.empty))) + assert(None === acceptFn(SparkListenerApplicationStart("1", Some("1"), 0, "user", None))) + assert(None === acceptFn(SparkListenerApplicationEnd(1))) + assert(None === acceptFn(SparkListenerNodeBlacklisted(0, "host1", 1))) + assert(None === acceptFn(SparkListenerNodeUnblacklisted(0, "host1"))) + assert(None === acceptFn(SparkListenerLogStart("testVersion"))) + } + + private def assertFilterStageEvents( + acceptFn: SparkListenerEvent => Option[Boolean], + stageSubmitted: SparkListenerStageSubmitted, + stageCompleted: SparkListenerStageCompleted, + unpersistRDDs: Seq[SparkListenerUnpersistRDD], + taskSpeculativeSubmitted: SparkListenerSpeculativeTaskSubmitted, + expectedVal: Option[Boolean]): Unit = { + assert(acceptFn(stageSubmitted) === expectedVal) + assert(acceptFn(stageCompleted) === expectedVal) + unpersistRDDs.foreach { event => + assert(acceptFn(event) === expectedVal) + } 
+ assert(acceptFn(taskSpeculativeSubmitted) === expectedVal) + } + + private def assertFilterTaskEvents( + acceptFn: SparkListenerEvent => Option[Boolean], + taskInfos: Seq[TaskInfo], + stageInfo: StageInfo, + expectedVal: Option[Boolean]): Unit = { + taskInfos.foreach { task => + val taskStartEvent = SparkListenerTaskStart(stageInfo.stageId, 0, task) + assert(acceptFn(taskStartEvent) === expectedVal) + + val taskGettingResultEvent = SparkListenerTaskGettingResult(task) + assert(acceptFn(taskGettingResultEvent) === expectedVal) + + val taskEndEvent = SparkListenerTaskEnd(stageInfo.stageId, 0, "taskType", + Success, task, new ExecutorMetrics, null) + assert(acceptFn(taskEndEvent) === expectedVal) + } + } +} + +object BasicEventFilterSuite { + val EMPTY_STATS = FilterStatistics(0, 0, 0, 0, 0, 0) +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala new file mode 100644 index 0000000000000..2a914023ec821 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileCompactorSuite.scala @@ -0,0 +1,335 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.deploy.history + +import scala.collection.mutable +import scala.io.{Codec, Source} + +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.json4s.jackson.JsonMethods.parse + +import org.apache.spark.{SparkConf, SparkFunSuite, Success} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogTestHelper.writeEventsToRollingWriter +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.status.ListenerEventsTestHelper._ +import org.apache.spark.storage.BlockManagerId +import org.apache.spark.util.{JsonProtocol, Utils} + +class EventLogFileCompactorSuite extends SparkFunSuite { + import EventLogFileCompactorSuite._ + + private val sparkConf = new SparkConf() + private val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) + + test("No event log files") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + + assertNoCompaction(fs, Seq.empty, compactor.compact(Seq.empty), + CompactionResultCode.NOT_ENOUGH_FILES) + } + } + + test("No compact file, less origin files available than max files to retain") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + (1 to 2).map(_ => testEvent): _*) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertNoCompaction(fs, fileStatuses, compactor.compact(fileStatuses), + CompactionResultCode.NOT_ENOUGH_FILES) + } + } + + test("No compact file, more origin files available than max files to retain") { + withTempDir { dir => + val fs = new 
Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + (1 to 5).map(_ => testEvent): _*) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertCompaction(fs, fileStatuses, compactor.compact(fileStatuses), + expectedNumOfFilesCompacted = 2) + } + } + + test("compact file exists, less origin files available than max files to retain") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + (1 to 2).map(_ => testEvent): _*) + + val fileToCompact = fileStatuses.head.getPath + val compactedPath = new Path(fileToCompact.getParent, + fileToCompact.getName + EventLogFileWriter.COMPACTED) + assert(fs.rename(fileToCompact, compactedPath)) + + val newFileStatuses = Seq(fs.getFileStatus(compactedPath)) ++ fileStatuses.drop(1) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertNoCompaction(fs, newFileStatuses, compactor.compact(newFileStatuses), + CompactionResultCode.NOT_ENOUGH_FILES) + } + } + + test("compact file exists, number of origin files are same as max files to retain") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + (1 to 4).map(_ => testEvent): _*) + + val fileToCompact = fileStatuses.head.getPath + val compactedPath = new Path(fileToCompact.getParent, + fileToCompact.getName + EventLogFileWriter.COMPACTED) + assert(fs.rename(fileToCompact, compactedPath)) + + val newFileStatuses = Seq(fs.getFileStatus(compactedPath)) ++ fileStatuses.drop(1) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + 
TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertNoCompaction(fs, newFileStatuses, compactor.compact(newFileStatuses), + CompactionResultCode.NOT_ENOUGH_FILES) + } + } + + test("compact file exists, more origin files available than max files to retain") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + (1 to 10).map(_ => testEvent): _*) + + val fileToCompact = fileStatuses.head.getPath + val compactedPath = new Path(fileToCompact.getParent, + fileToCompact.getName + EventLogFileWriter.COMPACTED) + assert(fs.rename(fileToCompact, compactedPath)) + + val newFileStatuses = Seq(fs.getFileStatus(compactedPath)) ++ fileStatuses.drop(1) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertCompaction(fs, newFileStatuses, compactor.compact(newFileStatuses), + expectedNumOfFilesCompacted = 7) + } + } + + test("events for finished job are dropped in new compact file") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + // 1, 2 will be compacted into one file, 3~5 are dummies to ensure max files to retain + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + Seq( + SparkListenerExecutorAdded(0, "exec1", new ExecutorInfo("host1", 1, Map.empty)), + SparkListenerJobStart(1, 0, Seq.empty)), + Seq( + SparkListenerJobEnd(1, 1, JobSucceeded), + SparkListenerExecutorAdded(2, "exec2", new ExecutorInfo("host2", 1, Map.empty))), + testEvent, + testEvent, + testEvent) + + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + assertCompaction(fs, fileStatuses, compactor.compact(fileStatuses), + expectedNumOfFilesCompacted = 2) + + val expectCompactFileBasePath = 
fileStatuses.take(2).last.getPath + val compactFilePath = getCompactFilePath(expectCompactFileBasePath) + Utils.tryWithResource(EventLogFileReader.openEventLog(compactFilePath, fs)) { is => + val lines = Source.fromInputStream(is)(Codec.UTF8).getLines().toList + assert(lines.length === 2, "Compacted file should have only two events being accepted") + lines.foreach { line => + val event = JsonProtocol.sparkEventFromJson(parse(line)) + assert(!event.isInstanceOf[SparkListenerJobStart] && + !event.isInstanceOf[SparkListenerJobEnd]) + } + } + } + } + + test("Don't compact file if score is lower than threshold") { + withTempDir { dir => + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + // job 1 having 4 tasks + val rddsForStage1 = createRddsWithId(1 to 2) + val stage1 = createStage(1, rddsForStage1, Nil) + val tasks = createTasks(4, Array("exec1"), 0L).map(createTaskStartEvent(_, 1, 0)) + + // job 2 having 4 tasks + val rddsForStage2 = createRddsWithId(3 to 4) + val stage2 = createStage(2, rddsForStage2, Nil) + val tasks2 = createTasks(4, Array("exec1"), 0L).map(createTaskStartEvent(_, 2, 0)) + + // here job 1 is finished and job 2 is still live, hence half of total tasks are considered + // as live + val fileStatuses = writeEventsToRollingWriter(fs, "app", dir, sparkConf, hadoopConf, + Seq(SparkListenerJobStart(1, 0, Seq(stage1)), SparkListenerStageSubmitted(stage1)), + tasks, + Seq(SparkListenerJobStart(2, 0, Seq(stage2)), SparkListenerStageSubmitted(stage2)), + tasks2, + Seq(SparkListenerJobEnd(1, 0, JobSucceeded)), + testEvent, + testEvent, + testEvent) + + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, 0.7d) + assertNoCompaction(fs, fileStatuses, compactor.compact(fileStatuses), + CompactionResultCode.LOW_SCORE_FOR_COMPACTION) + } + } + + test("rewrite files with test filters") { + class TestEventFilter1 extends EventFilter { + override def acceptFn(): PartialFunction[SparkListenerEvent, 
Boolean] = { + case _: SparkListenerApplicationEnd => true + case _: SparkListenerBlockManagerAdded => true + case _: SparkListenerApplicationStart => false + } + + override def statistics(): Option[EventFilter.FilterStatistics] = None + } + + class TestEventFilter2 extends EventFilter { + override def acceptFn(): PartialFunction[SparkListenerEvent, Boolean] = { + case _: SparkListenerApplicationEnd => true + case _: SparkListenerEnvironmentUpdate => true + case _: SparkListenerNodeBlacklisted => true + case _: SparkListenerBlockManagerAdded => false + case _: SparkListenerApplicationStart => false + case _: SparkListenerNodeUnblacklisted => false + } + + override def statistics(): Option[EventFilter.FilterStatistics] = None + } + + def writeEventToWriter(writer: EventLogFileWriter, event: SparkListenerEvent): String = { + val line = EventLogTestHelper.convertEvent(event) + writer.writeEvent(line, flushLogger = true) + line + } + + withTempDir { tempDir => + val sparkConf = new SparkConf + val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) + val fs = new Path(tempDir.getAbsolutePath).getFileSystem(hadoopConf) + + val writer = new SingleEventLogFileWriter("app", None, tempDir.toURI, sparkConf, hadoopConf) + writer.start() + + val expectedLines = new mutable.ArrayBuffer[String] + + // filterApplicationEnd: Some(true) & Some(true) => filter in + expectedLines += writeEventToWriter(writer, SparkListenerApplicationEnd(0)) + + // filterBlockManagerAdded: Some(true) & Some(false) => filter in + expectedLines += writeEventToWriter(writer, SparkListenerBlockManagerAdded( + 0, BlockManagerId("1", "host1", 1), 10)) + + // filterApplicationStart: Some(false) & Some(false) => filter out + writeEventToWriter(writer, SparkListenerApplicationStart("app", None, 0, "user", None)) + + // filterNodeBlacklisted: None & Some(true) => filter in + expectedLines += writeEventToWriter(writer, SparkListenerNodeBlacklisted(0, "host1", 1)) + + // filterNodeUnblacklisted: None & 
Some(false) => filter out + writeEventToWriter(writer, SparkListenerNodeUnblacklisted(0, "host1")) + + // other events: None & None => filter in + expectedLines += writeEventToWriter(writer, SparkListenerUnpersistRDD(0)) + + writer.stop() + + val filters = Seq(new TestEventFilter1, new TestEventFilter2) + + val logPath = new Path(writer.logPath) + val compactor = new EventLogFileCompactor(sparkConf, hadoopConf, fs, + TEST_ROLLING_MAX_FILES_TO_RETAIN, TEST_COMPACTION_SCORE_THRESHOLD) + val newPath = compactor.rewrite(filters, Seq(fs.getFileStatus(logPath))) + assert(new Path(newPath).getName === logPath.getName + EventLogFileWriter.COMPACTED) + + Utils.tryWithResource(EventLogFileReader.openEventLog(new Path(newPath), fs)) { is => + val lines = Source.fromInputStream(is)(Codec.UTF8).getLines() + var linesLength = 0 + lines.foreach { line => + linesLength += 1 + assert(expectedLines.contains(line)) + } + assert(linesLength === expectedLines.length) + } + } + } + + private def assertCompaction( + fs: FileSystem, + originalFiles: Seq[FileStatus], + compactRet: CompactionResult, + expectedNumOfFilesCompacted: Int): Unit = { + assert(CompactionResultCode.SUCCESS === compactRet.code) + + val expectRetainedFiles = originalFiles.drop(expectedNumOfFilesCompacted) + expectRetainedFiles.foreach { status => assert(fs.exists(status.getPath)) } + + val expectRemovedFiles = originalFiles.take(expectedNumOfFilesCompacted) + expectRemovedFiles.foreach { status => assert(!fs.exists(status.getPath)) } + + val expectCompactFileBasePath = originalFiles.take(expectedNumOfFilesCompacted).last.getPath + val expectCompactFileIndex = RollingEventLogFilesWriter.getEventLogFileIndex( + expectCompactFileBasePath.getName) + assert(Some(expectCompactFileIndex) === compactRet.compactIndex) + + val expectCompactFilePath = getCompactFilePath(expectCompactFileBasePath) + assert(fs.exists(expectCompactFilePath)) + } + + private def getCompactFilePath(expectCompactFileBasePath: Path): Path = { + new 
Path(expectCompactFileBasePath.getParent, + expectCompactFileBasePath.getName + EventLogFileWriter.COMPACTED) + } + + private def assertNoCompaction( + fs: FileSystem, + originalFiles: Seq[FileStatus], + compactRet: CompactionResult, + expectedCompactRet: CompactionResultCode.Value): Unit = { + assert(expectedCompactRet === compactRet.code) + assert(None === compactRet.compactIndex) + originalFiles.foreach { status => assert(fs.exists(status.getPath)) } + } + + private def testEvent: Seq[SparkListenerEvent] = + Seq(SparkListenerApplicationStart("app", Some("app"), 0, "user", None)) +} + +object EventLogFileCompactorSuite { + val TEST_ROLLING_MAX_FILES_TO_RETAIN = 3 + + // To simplify the tests, we set the score threshold as 0.0d. + // Individual test can use the other value to verify the functionality. + val TEST_COMPACTION_SCORE_THRESHOLD = 0.0d +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileReadersSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileReadersSuite.scala new file mode 100644 index 0000000000000..8eab2da1a37b7 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileReadersSuite.scala @@ -0,0 +1,347 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream, File} +import java.net.URI +import java.nio.charset.StandardCharsets +import java.util.zip.{ZipInputStream, ZipOutputStream} + +import com.google.common.io.{ByteStreams, Files} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogTestHelper._ +import org.apache.spark.deploy.history.RollingEventLogFilesWriter._ +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config._ +import org.apache.spark.io.CompressionCodec +import org.apache.spark.util.Utils + + +abstract class EventLogFileReadersSuite extends SparkFunSuite with LocalSparkContext + with BeforeAndAfter with Logging { + + protected val fileSystem = Utils.getHadoopFileSystem("/", SparkHadoopUtil.get.conf) + protected var testDir: File = _ + protected var testDirPath: Path = _ + + before { + testDir = Utils.createTempDir(namePrefix = s"event log") + testDirPath = new Path(testDir.getAbsolutePath()) + } + + after { + Utils.deleteRecursively(testDir) + } + + test("Retrieve EventLogFileReader correctly") { + def assertInstanceOfEventLogReader( + expectedClazz: Option[Class[_ <: EventLogFileReader]], + actual: Option[EventLogFileReader]): Unit = { + if (expectedClazz.isEmpty) { + assert(actual.isEmpty, s"Expected no EventLogFileReader instance but was " + + s"${actual.map(_.getClass).getOrElse("")}") + } else { + assert(actual.isDefined, s"Expected an EventLogFileReader instance but was empty") + assert(expectedClazz.get.isAssignableFrom(actual.get.getClass), + s"Expected ${expectedClazz.get} but was ${actual.get.getClass}") + } + } + + def 
testCreateEventLogReaderWithPath( + path: Path, + isFile: Boolean, + expectedClazz: Option[Class[_ <: EventLogFileReader]]): Unit = { + if (isFile) { + Utils.tryWithResource(fileSystem.create(path)) { is => + is.writeInt(10) + } + } else { + fileSystem.mkdirs(path) + } + + val reader = EventLogFileReader(fileSystem, path) + assertInstanceOfEventLogReader(expectedClazz, reader) + val reader2 = EventLogFileReader(fileSystem, + fileSystem.getFileStatus(path)) + assertInstanceOfEventLogReader(expectedClazz, reader2) + } + + // path with no last index - single event log + val reader1 = EventLogFileReader(fileSystem, new Path(testDirPath, "aaa"), + None) + assertInstanceOfEventLogReader(Some(classOf[SingleFileEventLogFileReader]), Some(reader1)) + + // path with last index - rolling event log + val reader2 = EventLogFileReader(fileSystem, + new Path(testDirPath, s"${EVENT_LOG_DIR_NAME_PREFIX}aaa"), Some(3)) + assertInstanceOfEventLogReader(Some(classOf[RollingEventLogFilesFileReader]), Some(reader2)) + + // path - file (both path and FileStatus) + val eventLogFile = new Path(testDirPath, "bbb") + testCreateEventLogReaderWithPath(eventLogFile, isFile = true, + Some(classOf[SingleFileEventLogFileReader])) + + // path - file starting with "." 
+ val invalidEventLogFile = new Path(testDirPath, ".bbb") + testCreateEventLogReaderWithPath(invalidEventLogFile, isFile = true, None) + + // path - directory with "eventlog_v2_" prefix + val eventLogDir = new Path(testDirPath, s"${EVENT_LOG_DIR_NAME_PREFIX}ccc") + testCreateEventLogReaderWithPath(eventLogDir, isFile = false, + Some(classOf[RollingEventLogFilesFileReader])) + + // path - directory with no "eventlog_v2_" prefix + val invalidEventLogDir = new Path(testDirPath, "ccc") + testCreateEventLogReaderWithPath(invalidEventLogDir, isFile = false, None) + } + + val allCodecs = Seq(None) ++ + CompressionCodec.ALL_COMPRESSION_CODECS.map { c => Some(CompressionCodec.getShortName(c)) } + + allCodecs.foreach { codecShortName => + test(s"get information, list event log files, zip log files - with codec $codecShortName") { + val appId = getUniqueApplicationId + val attemptId = None + + val conf = getLoggingConf(testDirPath, codecShortName) + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + + val writer = createWriter(appId, attemptId, testDirPath.toUri, conf, hadoopConf) + writer.start() + + // The test for writing events into EventLogFileWriter is covered to its own test suite. 
+ val dummyData = Seq("dummy1", "dummy2", "dummy3") + dummyData.foreach(writer.writeEvent(_, flushLogger = true)) + + val logPathIncompleted = getCurrentLogPath(writer.logPath, isCompleted = false) + val readerOpt = EventLogFileReader(fileSystem, new Path(logPathIncompleted)) + assertAppropriateReader(readerOpt) + val reader = readerOpt.get + + verifyReader(reader, new Path(logPathIncompleted), codecShortName, isCompleted = false) + + writer.stop() + + val logPathCompleted = getCurrentLogPath(writer.logPath, isCompleted = true) + val readerOpt2 = EventLogFileReader(fileSystem, new Path(logPathCompleted)) + assertAppropriateReader(readerOpt2) + val reader2 = readerOpt2.get + + verifyReader(reader2, new Path(logPathCompleted), codecShortName, isCompleted = true) + } + } + + protected def createWriter( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter + + protected def getCurrentLogPath(logPath: String, isCompleted: Boolean): String + + protected def assertAppropriateReader(actualReader: Option[EventLogFileReader]): Unit + + protected def verifyReader( + reader: EventLogFileReader, + logPath: Path, + compressionCodecShortName: Option[String], + isCompleted: Boolean): Unit +} + +class SingleFileEventLogFileReaderSuite extends EventLogFileReadersSuite { + override protected def createWriter( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter = { + new SingleEventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } + + override protected def assertAppropriateReader(actualReader: Option[EventLogFileReader]): Unit = { + assert(actualReader.isDefined, s"Expected an EventLogReader instance but was empty") + assert(actualReader.get.isInstanceOf[SingleFileEventLogFileReader], + s"Expected SingleFileEventLogReader but was ${actualReader.get.getClass}") + } + + override protected 
def getCurrentLogPath(logPath: String, isCompleted: Boolean): String = { + if (!isCompleted) logPath + EventLogFileWriter.IN_PROGRESS else logPath + } + + override protected def verifyReader( + reader: EventLogFileReader, + logPath: Path, + compressionCodecShortName: Option[String], + isCompleted: Boolean): Unit = { + val status = fileSystem.getFileStatus(logPath) + + assert(status.isFile) + assert(reader.rootPath === fileSystem.makeQualified(logPath)) + assert(reader.lastIndex.isEmpty) + assert(reader.fileSizeForLastIndex === status.getLen) + assert(reader.completed === isCompleted) + assert(reader.modificationTime === status.getModificationTime) + assert(reader.listEventLogFiles.length === 1) + assert(reader.listEventLogFiles.map(_.getPath.toUri.getPath) === + Seq(logPath.toUri.getPath)) + assert(reader.compressionCodec === compressionCodecShortName) + assert(reader.totalSize === status.getLen) + + val underlyingStream = new ByteArrayOutputStream() + Utils.tryWithResource(new ZipOutputStream(underlyingStream)) { os => + reader.zipEventLogFiles(os) + } + + Utils.tryWithResource(new ZipInputStream( + new ByteArrayInputStream(underlyingStream.toByteArray))) { is => + + var entry = is.getNextEntry + assert(entry != null) + val actual = new String(ByteStreams.toByteArray(is), StandardCharsets.UTF_8) + val expected = Files.toString(new File(logPath.toString), StandardCharsets.UTF_8) + assert(actual === expected) + assert(is.getNextEntry === null) + } + } +} + +class RollingEventLogFilesReaderSuite extends EventLogFileReadersSuite { + allCodecs.foreach { codecShortName => + test(s"rolling event log files - codec $codecShortName") { + val appId = getUniqueApplicationId + val attemptId = None + + val conf = getLoggingConf(testDirPath, codecShortName) + conf.set(EVENT_LOG_ENABLE_ROLLING, true) + conf.set(EVENT_LOG_ROLLING_MAX_FILE_SIZE.key, "10m") + + val writer = createWriter(appId, attemptId, testDirPath.toUri, conf, + SparkHadoopUtil.get.newConfiguration(conf)) + + 
writer.start() + + // write log more than 20m (intended to roll over to 3 files) + val dummyStr = "dummy" * 1024 + writeTestEvents(writer, dummyStr, 1024 * 1024 * 20) + + val logPathIncompleted = getCurrentLogPath(writer.logPath, isCompleted = false) + val readerOpt = EventLogFileReader(fileSystem, + new Path(logPathIncompleted)) + verifyReader(readerOpt.get, new Path(logPathIncompleted), codecShortName, isCompleted = false) + assert(readerOpt.get.listEventLogFiles.length === 3) + + writer.stop() + + val logPathCompleted = getCurrentLogPath(writer.logPath, isCompleted = true) + val readerOpt2 = EventLogFileReader(fileSystem, new Path(logPathCompleted)) + verifyReader(readerOpt2.get, new Path(logPathCompleted), codecShortName, isCompleted = true) + assert(readerOpt2.get.listEventLogFiles.length === 3) + } + } + + override protected def createWriter( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter = { + new RollingEventLogFilesWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } + + override protected def assertAppropriateReader(actualReader: Option[EventLogFileReader]): Unit = { + assert(actualReader.isDefined, s"Expected an EventLogReader instance but was empty") + assert(actualReader.get.isInstanceOf[RollingEventLogFilesFileReader], + s"Expected RollingEventLogFilesReader but was ${actualReader.get.getClass}") + } + + override protected def getCurrentLogPath(logPath: String, isCompleted: Boolean): String = logPath + + override protected def verifyReader( + reader: EventLogFileReader, + logPath: Path, + compressionCodecShortName: Option[String], + isCompleted: Boolean): Unit = { + import RollingEventLogFilesWriter._ + + val status = fileSystem.getFileStatus(logPath) + assert(status.isDirectory) + + val statusInDir = fileSystem.listStatus(logPath) + val eventFiles = statusInDir.filter(isEventLogFile).sortBy { s => + getEventLogFileIndex(s.getPath.getName) + } + 
assert(eventFiles.nonEmpty) + val lastEventFile = eventFiles.last + val allLen = eventFiles.map(_.getLen).sum + + assert(reader.rootPath === fileSystem.makeQualified(logPath)) + assert(reader.lastIndex === Some(getEventLogFileIndex(lastEventFile.getPath.getName))) + assert(reader.fileSizeForLastIndex === lastEventFile.getLen) + assert(reader.completed === isCompleted) + assert(reader.modificationTime === lastEventFile.getModificationTime) + assert(reader.listEventLogFiles.length === eventFiles.length) + assert(reader.listEventLogFiles.map(_.getPath) === eventFiles.map(_.getPath)) + assert(reader.compressionCodec === compressionCodecShortName) + assert(reader.totalSize === allLen) + + val underlyingStream = new ByteArrayOutputStream() + Utils.tryWithResource(new ZipOutputStream(underlyingStream)) { os => + reader.zipEventLogFiles(os) + } + + Utils.tryWithResource(new ZipInputStream( + new ByteArrayInputStream(underlyingStream.toByteArray))) { is => + + val entry = is.getNextEntry + assert(entry != null) + + // directory + assert(entry.getName === logPath.getName + "/") + + val allFileNames = fileSystem.listStatus(logPath).map(_.getPath.getName).toSet + + var count = 0 + var noMoreEntry = false + while (!noMoreEntry) { + val entry = is.getNextEntry + if (entry == null) { + noMoreEntry = true + } else { + count += 1 + + assert(entry.getName.startsWith(logPath.getName + "/")) + val fileName = entry.getName.stripPrefix(logPath.getName + "/") + assert(allFileNames.contains(fileName)) + + val actual = new String(ByteStreams.toByteArray(is), StandardCharsets.UTF_8) + val expected = Files.toString(new File(logPath.toString, fileName), + StandardCharsets.UTF_8) + assert(actual === expected) + } + } + + assert(count === allFileNames.size) + } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala new file mode 100644 index 
0000000000000..060b878fb8ef2 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogFileWritersSuite.scala @@ -0,0 +1,378 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import java.io.{File, FileOutputStream, IOException} +import java.net.URI + +import scala.collection.mutable +import scala.io.Source + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogTestHelper._ +import org.apache.spark.internal.config._ +import org.apache.spark.io.CompressionCodec +import org.apache.spark.util.Utils + + +abstract class EventLogFileWritersSuite extends SparkFunSuite with LocalSparkContext + with BeforeAndAfter { + + protected val fileSystem = Utils.getHadoopFileSystem("/", + SparkHadoopUtil.get.newConfiguration(new SparkConf())) + protected var testDir: File = _ + protected var testDirPath: Path = _ + + before { + testDir = Utils.createTempDir(namePrefix = s"event log") + testDirPath = new 
Path(testDir.getAbsolutePath()) + } + + after { + Utils.deleteRecursively(testDir) + } + + test("create EventLogFileWriter with enable/disable rolling") { + def buildWriterAndVerify(conf: SparkConf, expectedClazz: Class[_]): Unit = { + val writer = EventLogFileWriter( + getUniqueApplicationId, None, testDirPath.toUri, conf, + SparkHadoopUtil.get.newConfiguration(conf)) + val writerClazz = writer.getClass + assert(expectedClazz === writerClazz) + } + + val conf = new SparkConf + conf.set(EVENT_LOG_ENABLED, true) + conf.set(EVENT_LOG_DIR, testDir.toString) + + // default config + buildWriterAndVerify(conf, classOf[SingleEventLogFileWriter]) + + conf.set(EVENT_LOG_ENABLE_ROLLING, true) + buildWriterAndVerify(conf, classOf[RollingEventLogFilesWriter]) + + conf.set(EVENT_LOG_ENABLE_ROLLING, false) + buildWriterAndVerify(conf, classOf[SingleEventLogFileWriter]) + } + + val allCodecs = Seq(None) ++ + CompressionCodec.ALL_COMPRESSION_CODECS.map(c => Some(CompressionCodec.getShortName(c))) + + allCodecs.foreach { codecShortName => + test(s"initialize, write, stop - with codec $codecShortName") { + val appId = getUniqueApplicationId + val attemptId = None + + val conf = getLoggingConf(testDirPath, codecShortName) + val writer = createWriter(appId, attemptId, testDirPath.toUri, conf, + SparkHadoopUtil.get.newConfiguration(conf)) + + writer.start() + + // snappy stream throws exception on empty stream, so we should provide some data to test. 
+ val dummyData = Seq("dummy1", "dummy2", "dummy3") + dummyData.foreach(writer.writeEvent(_, flushLogger = true)) + + writer.stop() + + verifyWriteEventLogFile(appId, attemptId, testDirPath.toUri, codecShortName, dummyData) + } + } + + test("spark.eventLog.compression.codec overrides spark.io.compression.codec") { + val conf = new SparkConf + conf.set(EVENT_LOG_COMPRESS, true) + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + + val appId = "test" + val appAttemptId = None + + // The default value is `spark.io.compression.codec`. + val writer = createWriter(appId, appAttemptId, testDirPath.toUri, conf, hadoopConf) + assert(writer.compressionCodecName.contains("lz4")) + + // `spark.eventLog.compression.codec` overrides `spark.io.compression.codec`. + conf.set(EVENT_LOG_COMPRESSION_CODEC, "zstd") + val writer2 = createWriter(appId, appAttemptId, testDirPath.toUri, conf, hadoopConf) + assert(writer2.compressionCodecName.contains("zstd")) + } + + protected def readLinesFromEventLogFile(log: Path, fs: FileSystem): List[String] = { + val logDataStream = EventLogFileReader.openEventLog(log, fs) + try { + Source.fromInputStream(logDataStream).getLines().toList + } finally { + logDataStream.close() + } + } + + protected def createWriter( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter + + /** + * This should be called with "closed" event log file; No guarantee on reading event log file + * which is being written, especially the file is compressed. SHS also does the best it can. 
+ */ + protected def verifyWriteEventLogFile( + appId: String, + appAttemptId : Option[String], + logBaseDir: URI, + compressionCodecShortName: Option[String], + expectedLines: Seq[String] = Seq.empty): Unit +} + +class SingleEventLogFileWriterSuite extends EventLogFileWritersSuite { + + test("Log overwriting") { + val appId = "test" + val appAttemptId = None + val logUri = SingleEventLogFileWriter.getLogPath(testDir.toURI, appId, appAttemptId) + + val conf = getLoggingConf(testDirPath) + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + val writer = createWriter(appId, appAttemptId, testDir.toURI, conf, hadoopConf) + + val logPath = new Path(logUri).toUri.getPath + writer.start() + + val dummyData = Seq("dummy1", "dummy2", "dummy3") + dummyData.foreach(writer.writeEvent(_, flushLogger = true)) + + // Create file before writing the event log + new FileOutputStream(new File(logPath)).close() + // Expected IOException, since we haven't enabled log overwrite. + intercept[IOException] { writer.stop() } + + // Try again, but enable overwriting. 
+ conf.set(EVENT_LOG_OVERWRITE, true) + val writer2 = createWriter(appId, appAttemptId, testDir.toURI, conf, hadoopConf) + writer2.start() + dummyData.foreach(writer2.writeEvent(_, flushLogger = true)) + writer2.stop() + } + + test("Event log name") { + val baseDirUri = Utils.resolveURI("/base-dir") + // without compression + assert(s"${baseDirUri.toString}/app1" === SingleEventLogFileWriter.getLogPath( + baseDirUri, "app1", None, None)) + // with compression + assert(s"${baseDirUri.toString}/app1.lzf" === + SingleEventLogFileWriter.getLogPath(baseDirUri, "app1", None, Some("lzf"))) + // illegal characters in app ID + assert(s"${baseDirUri.toString}/a-fine-mind_dollar_bills__1" === + SingleEventLogFileWriter.getLogPath(baseDirUri, + "a fine:mind$dollar{bills}.1", None, None)) + // illegal characters in app ID with compression + assert(s"${baseDirUri.toString}/a-fine-mind_dollar_bills__1.lz4" === + SingleEventLogFileWriter.getLogPath(baseDirUri, + "a fine:mind$dollar{bills}.1", None, Some("lz4"))) + } + + override protected def createWriter( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter = { + new SingleEventLogFileWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } + + override protected def verifyWriteEventLogFile( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + compressionCodecShortName: Option[String], + expectedLines: Seq[String]): Unit = { + // read single event log file + val logPath = SingleEventLogFileWriter.getLogPath(logBaseDir, appId, appAttemptId, + compressionCodecShortName) + + val finalLogPath = new Path(logPath) + assert(fileSystem.exists(finalLogPath) && fileSystem.isFile(finalLogPath)) + assert(expectedLines === readLinesFromEventLogFile(finalLogPath, fileSystem)) + } +} + +class RollingEventLogFilesWriterSuite extends EventLogFileWritersSuite { + import RollingEventLogFilesWriter._ + + test("Event log names") { + val 
baseDirUri = Utils.resolveURI("/base-dir") + val appId = "app1" + val appAttemptId = None + + // happy case with app ID + val logDir = RollingEventLogFilesWriter.getAppEventLogDirPath(baseDirUri, appId, None) + assert(s"${baseDirUri.toString}/${EVENT_LOG_DIR_NAME_PREFIX}${appId}" === logDir.toString) + + // appstatus: inprogress or completed + assert(s"$logDir/${APPSTATUS_FILE_NAME_PREFIX}${appId}${EventLogFileWriter.IN_PROGRESS}" === + RollingEventLogFilesWriter.getAppStatusFilePath(logDir, appId, appAttemptId, + inProgress = true).toString) + assert(s"$logDir/${APPSTATUS_FILE_NAME_PREFIX}${appId}" === + RollingEventLogFilesWriter.getAppStatusFilePath(logDir, appId, appAttemptId, + inProgress = false).toString) + + // without compression + assert(s"$logDir/${EVENT_LOG_FILE_NAME_PREFIX}1_${appId}" === + RollingEventLogFilesWriter.getEventLogFilePath(logDir, appId, appAttemptId, 1, None).toString) + + // with compression + assert(s"$logDir/${EVENT_LOG_FILE_NAME_PREFIX}1_${appId}.lzf" === + RollingEventLogFilesWriter.getEventLogFilePath(logDir, appId, appAttemptId, + 1, Some("lzf")).toString) + + // illegal characters in app ID + assert(s"${baseDirUri.toString}/${EVENT_LOG_DIR_NAME_PREFIX}a-fine-mind_dollar_bills__1" === + RollingEventLogFilesWriter.getAppEventLogDirPath(baseDirUri, + "a fine:mind$dollar{bills}.1", None).toString) + } + + test("Log overwriting") { + val appId = "test" + val appAttemptId = None + val logDirPath = RollingEventLogFilesWriter.getAppEventLogDirPath(testDir.toURI, appId, + appAttemptId) + + val conf = getLoggingConf(testDirPath) + val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) + val writer = createWriter(appId, appAttemptId, testDir.toURI, conf, hadoopConf) + + val logPath = logDirPath.toUri.getPath + + // Create file before writing the event log directory + // it doesn't matter whether the existing one is file or directory + new FileOutputStream(new File(logPath)).close() + + // Expected IOException, since we haven't enabled 
log overwrite. + // Note that the place IOException is thrown is different from single event log file. + intercept[IOException] { writer.start() } + + // Try again, but enable overwriting. + conf.set(EVENT_LOG_OVERWRITE, true) + + val writer2 = createWriter(appId, appAttemptId, testDir.toURI, conf, hadoopConf) + writer2.start() + val dummyData = Seq("dummy1", "dummy2", "dummy3") + dummyData.foreach(writer2.writeEvent(_, flushLogger = true)) + writer2.stop() + } + + allCodecs.foreach { codecShortName => + test(s"rolling event log files - codec $codecShortName") { + def assertEventLogFilesIndex( + eventLogFiles: Seq[FileStatus], + expectedLastIndex: Int, + expectedMaxSizeBytes: Long): Unit = { + assert(eventLogFiles.forall(f => f.getLen <= expectedMaxSizeBytes)) + assert((1 to expectedLastIndex) === + eventLogFiles.map(f => getEventLogFileIndex(f.getPath.getName))) + } + + val appId = getUniqueApplicationId + val attemptId = None + + val conf = getLoggingConf(testDirPath, codecShortName) + conf.set(EVENT_LOG_ENABLE_ROLLING, true) + conf.set(EVENT_LOG_ROLLING_MAX_FILE_SIZE.key, "10m") + + val writer = createWriter(appId, attemptId, testDirPath.toUri, conf, + SparkHadoopUtil.get.newConfiguration(conf)) + + writer.start() + + // write log more than 20m (intended to roll over to 3 files) + val dummyStr = "dummy" * 1024 + val expectedLines = writeTestEvents(writer, dummyStr, 1024 * 1024 * 21) + + val logDirPath = getAppEventLogDirPath(testDirPath.toUri, appId, attemptId) + + val eventLogFiles = listEventLogFiles(logDirPath) + assertEventLogFilesIndex(eventLogFiles, 3, 1024 * 1024 * 10) + + writer.stop() + + val eventLogFiles2 = listEventLogFiles(logDirPath) + assertEventLogFilesIndex(eventLogFiles2, 3, 1024 * 1024 * 10) + + verifyWriteEventLogFile(appId, attemptId, testDirPath.toUri, + codecShortName, expectedLines) + } + } + + test(s"rolling event log files - the max size of event log file size less than lower limit") { + val appId = getUniqueApplicationId + val 
attemptId = None + + val conf = getLoggingConf(testDirPath, None) + conf.set(EVENT_LOG_ENABLE_ROLLING, true) + conf.set(EVENT_LOG_ROLLING_MAX_FILE_SIZE.key, "9m") + + val e = intercept[IllegalArgumentException] { + createWriter(appId, attemptId, testDirPath.toUri, conf, + SparkHadoopUtil.get.newConfiguration(conf)) + } + assert(e.getMessage.contains("should be configured to be at least")) + } + + override protected def createWriter( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + sparkConf: SparkConf, + hadoopConf: Configuration): EventLogFileWriter = { + new RollingEventLogFilesWriter(appId, appAttemptId, logBaseDir, sparkConf, hadoopConf) + } + + override protected def verifyWriteEventLogFile( + appId: String, + appAttemptId: Option[String], + logBaseDir: URI, + compressionCodecShortName: Option[String], + expectedLines: Seq[String]): Unit = { + val logDirPath = getAppEventLogDirPath(logBaseDir, appId, appAttemptId) + + assert(fileSystem.exists(logDirPath) && fileSystem.isDirectory(logDirPath)) + + val appStatusFile = getAppStatusFilePath(logDirPath, appId, appAttemptId, inProgress = false) + assert(fileSystem.exists(appStatusFile) && fileSystem.isFile(appStatusFile)) + + val eventLogFiles = listEventLogFiles(logDirPath) + val allLines = mutable.ArrayBuffer[String]() + eventLogFiles.foreach { file => + allLines.appendAll(readLinesFromEventLogFile(file.getPath, fileSystem)) + } + + assert(expectedLines === allLines) + } + + private def listEventLogFiles(logDirPath: Path): Seq[FileStatus] = { + fileSystem.listStatus(logDirPath).filter(isEventLogFile) + .sortBy { fs => getEventLogFileIndex(fs.getPath.getName) } + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala new file mode 100644 index 0000000000000..298fd65f293cb --- /dev/null +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala @@ -0,0 
+1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.history + +import java.io.File +import java.nio.charset.StandardCharsets + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} +import org.json4s.jackson.JsonMethods.{compact, render} + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config._ +import org.apache.spark.scheduler._ +import org.apache.spark.util.JsonProtocol + +object EventLogTestHelper { + def getUniqueApplicationId: String = "test-" + System.currentTimeMillis + + /** + * Get a SparkConf with event logging enabled. It doesn't enable rolling event logs, so caller + * should set it manually. 
+ */ + def getLoggingConf(logDir: Path, compressionCodec: Option[String] = None): SparkConf = { + val conf = new SparkConf + conf.set(EVENT_LOG_ENABLED, true) + conf.set(EVENT_LOG_BLOCK_UPDATES, true) + conf.set(EVENT_LOG_TESTING, true) + conf.set(EVENT_LOG_DIR, logDir.toString) + compressionCodec.foreach { codec => + conf.set(EVENT_LOG_COMPRESS, true) + conf.set(EVENT_LOG_COMPRESSION_CODEC, codec) + } + conf.set(EVENT_LOG_STAGE_EXECUTOR_METRICS, true) + conf + } + + def writeTestEvents( + writer: EventLogFileWriter, + eventStr: String, + desiredSize: Long): Seq[String] = { + val stringLen = eventStr.getBytes(StandardCharsets.UTF_8).length + val repeatCount = Math.floor(desiredSize / stringLen).toInt + (0 until repeatCount).map { _ => + writer.writeEvent(eventStr, flushLogger = true) + eventStr + } + } + + def writeEventLogFile( + sparkConf: SparkConf, + hadoopConf: Configuration, + dir: File, + idx: Int, + events: Seq[SparkListenerEvent]): String = { + // to simplify the code, we don't concern about file name being matched with the naming rule + // of event log file + val writer = new SingleEventLogFileWriter(s"app$idx", None, dir.toURI, sparkConf, hadoopConf) + writer.start() + events.foreach { event => writer.writeEvent(convertEvent(event), flushLogger = true) } + writer.stop() + writer.logPath + } + + def writeEventsToRollingWriter( + fs: FileSystem, + appId: String, + dir: File, + sparkConf: SparkConf, + hadoopConf: Configuration, + eventsFiles: Seq[SparkListenerEvent]*): Seq[FileStatus] = { + val writer = new RollingEventLogFilesWriter(appId, None, dir.toURI, sparkConf, hadoopConf) + writer.start() + + eventsFiles.dropRight(1).foreach { events => + writeEventsToRollingWriter(writer, events, rollFile = true) + } + eventsFiles.lastOption.foreach { events => + writeEventsToRollingWriter(writer, events, rollFile = false) + } + + writer.stop() + EventLogFileReader(fs, new Path(writer.logPath)).get.listEventLogFiles + } + + def writeEventsToRollingWriter( + writer: 
RollingEventLogFilesWriter, + events: Seq[SparkListenerEvent], + rollFile: Boolean): Unit = { + events.foreach { event => writer.writeEvent(convertEvent(event), flushLogger = true) } + if (rollFile) writer.rollEventLogFile() + } + + def convertEvent(event: SparkListenerEvent): String = { + compact(render(JsonProtocol.sparkEventToJson(event))) + } +} diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 30261dde678f1..c2f34fc3a95ed 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -37,7 +37,9 @@ import org.mockito.Mockito.{doThrow, mock, spy, verify, when} import org.scalatest.Matchers import org.scalatest.concurrent.Eventually._ -import org.apache.spark.{SecurityManager, SPARK_VERSION, SparkConf, SparkFunSuite} +import org.apache.spark.{JobExecutionStatus, SecurityManager, SPARK_VERSION, SparkConf, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogTestHelper._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.DRIVER_LOG_DFS_DIR import org.apache.spark.internal.config.History._ @@ -47,12 +49,13 @@ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.security.GroupMappingServiceProvider import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.KVUtils.KVStoreScalaSerializer import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo} import org.apache.spark.util.{Clock, JsonProtocol, ManualClock, Utils} +import org.apache.spark.util.kvstore.InMemoryStore import org.apache.spark.util.logging.DriverLogger class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { - private var testDir: File = null 
override def beforeEach(): Unit = { @@ -74,8 +77,8 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { appAttemptId: Option[String], inProgress: Boolean, codec: Option[String] = None): File = { - val ip = if (inProgress) EventLoggingListener.IN_PROGRESS else "" - val logUri = EventLoggingListener.getLogPath(testDir.toURI, appId, appAttemptId, codec) + val ip = if (inProgress) EventLogFileWriter.IN_PROGRESS else "" + val logUri = SingleEventLogFileWriter.getLogPath(testDir.toURI, appId, appAttemptId, codec) val logPath = new Path(logUri).toUri.getPath + ip new File(logPath) } @@ -86,7 +89,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { } } - private def testAppLogParsing(inMemory: Boolean) { + private def testAppLogParsing(inMemory: Boolean): Unit = { val clock = new ManualClock(12345678) val conf = createTestConf(inMemory = inMemory) val provider = new FsHistoryProvider(conf, clock) @@ -159,13 +162,14 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { assume(!Utils.isWindows) class TestFsHistoryProvider extends FsHistoryProvider(createTestConf()) { - var mergeApplicationListingCall = 0 - override protected def mergeApplicationListing( - fileStatus: FileStatus, + var doMergeApplicationListingCall = 0 + override private[history] def doMergeApplicationListing( + reader: EventLogFileReader, lastSeen: Long, - enableSkipToEnd: Boolean): Unit = { - super.mergeApplicationListing(fileStatus, lastSeen, enableSkipToEnd) - mergeApplicationListingCall += 1 + enableSkipToEnd: Boolean, + lastCompactionIndex: Option[Long]): Unit = { + super.doMergeApplicationListing(reader, lastSeen, enableSkipToEnd, lastCompactionIndex) + doMergeApplicationListingCall += 1 } } val provider = new TestFsHistoryProvider @@ -186,7 +190,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { list.size should be (1) } - provider.mergeApplicationListingCall should be (1) + 
provider.doMergeApplicationListingCall should be (1) } test("history file is renamed from inprogress to completed") { @@ -199,13 +203,13 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { ) updateAndCheck(provider) { list => list.size should be (1) - provider.getAttempt("app1", None).logPath should endWith(EventLoggingListener.IN_PROGRESS) + provider.getAttempt("app1", None).logPath should endWith(EventLogFileWriter.IN_PROGRESS) } logFile1.renameTo(newLogFile("app1", None, inProgress = false)) updateAndCheck(provider) { list => list.size should be (1) - provider.getAttempt("app1", None).logPath should not endWith(EventLoggingListener.IN_PROGRESS) + provider.getAttempt("app1", None).logPath should not endWith(EventLogFileWriter.IN_PROGRESS) } } @@ -1161,29 +1165,45 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { when(mockedFs.open(path)).thenReturn(in) when(in.getWrappedStream).thenReturn(dfsIn) when(dfsIn.getFileLength).thenReturn(200) + // FileStatus.getLen is more than logInfo fileSize var fileStatus = new FileStatus(200, false, 0, 0, 0, path) + when(mockedFs.getFileStatus(path)).thenReturn(fileStatus) var logInfo = new LogInfo(path.toString, 0, LogType.EventLogs, Some("appId"), - Some("attemptId"), 100) - assert(mockedProvider.shouldReloadLog(logInfo, fileStatus)) + Some("attemptId"), 100, None, None, false) + var reader = EventLogFileReader(mockedFs, path) + assert(reader.isDefined) + assert(mockedProvider.shouldReloadLog(logInfo, reader.get)) fileStatus = new FileStatus() fileStatus.setPath(path) + when(mockedFs.getFileStatus(path)).thenReturn(fileStatus) // DFSInputStream.getFileLength is more than logInfo fileSize logInfo = new LogInfo(path.toString, 0, LogType.EventLogs, Some("appId"), - Some("attemptId"), 100) - assert(mockedProvider.shouldReloadLog(logInfo, fileStatus)) + Some("attemptId"), 100, None, None, false) + reader = EventLogFileReader(mockedFs, path) + assert(reader.isDefined) + 
assert(mockedProvider.shouldReloadLog(logInfo, reader.get)) + // DFSInputStream.getFileLength is equal to logInfo fileSize logInfo = new LogInfo(path.toString, 0, LogType.EventLogs, Some("appId"), - Some("attemptId"), 200) - assert(!mockedProvider.shouldReloadLog(logInfo, fileStatus)) + Some("attemptId"), 200, None, None, false) + reader = EventLogFileReader(mockedFs, path) + assert(reader.isDefined) + assert(!mockedProvider.shouldReloadLog(logInfo, reader.get)) + // in.getWrappedStream returns other than DFSInputStream val bin = mock(classOf[BufferedInputStream]) when(in.getWrappedStream).thenReturn(bin) - assert(!mockedProvider.shouldReloadLog(logInfo, fileStatus)) + reader = EventLogFileReader(mockedFs, path) + assert(reader.isDefined) + assert(!mockedProvider.shouldReloadLog(logInfo, reader.get)) + // fs.open throws exception when(mockedFs.open(path)).thenThrow(new IOException("Throwing intentionally")) - assert(!mockedProvider.shouldReloadLog(logInfo, fileStatus)) + reader = EventLogFileReader(mockedFs, path) + assert(reader.isDefined) + assert(!mockedProvider.shouldReloadLog(logInfo, reader.get)) } test("log cleaner with the maximum number of log files") { @@ -1236,6 +1256,220 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { } } + test("backwards compatibility with LogInfo from Spark 2.4") { + case class LogInfoV24( + logPath: String, + lastProcessed: Long, + appId: Option[String], + attemptId: Option[String], + fileSize: Long) + + val oldObj = LogInfoV24("dummy", System.currentTimeMillis(), Some("hello"), + Some("attempt1"), 100) + + val serializer = new KVStoreScalaSerializer() + val serializedOldObj = serializer.serialize(oldObj) + val deserializedOldObj = serializer.deserialize(serializedOldObj, classOf[LogInfo]) + assert(deserializedOldObj.logPath === oldObj.logPath) + assert(deserializedOldObj.lastProcessed === oldObj.lastProcessed) + assert(deserializedOldObj.appId === oldObj.appId) + 
assert(deserializedOldObj.attemptId === oldObj.attemptId) + assert(deserializedOldObj.fileSize === oldObj.fileSize) + + // SPARK-25118: added logType: LogType.Value - expected 'null' on old format + assert(deserializedOldObj.logType === null) + + // SPARK-28869: added lastIndex: Option[Long], isComplete: Boolean - expected 'None' and + // 'false' on old format. The default value for isComplete is wrong value for completed app, + // but the value will be corrected once checkForLogs is called. + assert(deserializedOldObj.lastIndex === None) + assert(deserializedOldObj.isComplete === false) + } + + test("SPARK-29755 LogInfo should be serialized/deserialized by jackson properly") { + def assertSerDe(serializer: KVStoreScalaSerializer, info: LogInfo): Unit = { + val infoAfterSerDe = serializer.deserialize(serializer.serialize(info), classOf[LogInfo]) + assert(infoAfterSerDe === info) + assertOptionAfterSerde(infoAfterSerDe.lastIndex, info.lastIndex) + } + + val serializer = new KVStoreScalaSerializer() + val logInfoWithIndexAsNone = LogInfo("dummy", 0, LogType.EventLogs, Some("appId"), + Some("attemptId"), 100, None, None, false) + assertSerDe(serializer, logInfoWithIndexAsNone) + + val logInfoWithIndex = LogInfo("dummy", 0, LogType.EventLogs, Some("appId"), + Some("attemptId"), 100, Some(3), None, false) + assertSerDe(serializer, logInfoWithIndex) + } + + test("SPARK-29755 AttemptInfoWrapper should be serialized/deserialized by jackson properly") { + def assertSerDe(serializer: KVStoreScalaSerializer, attempt: AttemptInfoWrapper): Unit = { + val attemptAfterSerDe = serializer.deserialize(serializer.serialize(attempt), + classOf[AttemptInfoWrapper]) + assert(attemptAfterSerDe.info === attempt.info) + // skip comparing some fields, as they've not triggered SPARK-29755 + assertOptionAfterSerde(attemptAfterSerDe.lastIndex, attempt.lastIndex) + } + + val serializer = new KVStoreScalaSerializer() + val appInfo = new ApplicationAttemptInfo(None, new Date(1), new Date(1), new 
Date(1), + 10, "spark", false, "dummy") + val attemptInfoWithIndexAsNone = new AttemptInfoWrapper(appInfo, "dummyPath", 10, None, + None, None, None, None) + assertSerDe(serializer, attemptInfoWithIndexAsNone) + + val attemptInfoWithIndex = new AttemptInfoWrapper(appInfo, "dummyPath", 10, Some(1), + None, None, None, None) + assertSerDe(serializer, attemptInfoWithIndex) + } + + test("SPARK-29043: clean up specified event log") { + val clock = new ManualClock() + val conf = createTestConf().set(MAX_LOG_AGE_S, 0L).set(CLEANER_ENABLED, true) + val provider = new FsHistoryProvider(conf, clock) + + // create an invalid application log file + val inValidLogFile = newLogFile("inValidLogFile", None, inProgress = true) + inValidLogFile.createNewFile() + writeFile(inValidLogFile, None, + SparkListenerApplicationStart(inValidLogFile.getName, None, 1L, "test", None)) + inValidLogFile.setLastModified(clock.getTimeMillis()) + + // create a valid application log file + val validLogFile = newLogFile("validLogFile", None, inProgress = true) + validLogFile.createNewFile() + writeFile(validLogFile, None, + SparkListenerApplicationStart(validLogFile.getName, Some("local_123"), 1L, "test", None)) + validLogFile.setLastModified(clock.getTimeMillis()) + + provider.checkForLogs() + // The invalid application log file would be cleaned by checkAndCleanLog(). + assert(new File(testDir.toURI).listFiles().size === 1) + + clock.advance(1) + // cleanLogs() would clean the valid application log file. + provider.cleanLogs() + assert(new File(testDir.toURI).listFiles().size === 0) + } + + private def assertOptionAfterSerde(opt: Option[Long], expected: Option[Long]): Unit = { + if (expected.isEmpty) { + assert(opt.isEmpty) + } else { + // The issue happens only when the value in Option is being unboxed. 
Here we ensure unboxing + // to Long succeeds: even though IDE suggests `.toLong` is redundant, direct comparison + // doesn't trigger unboxing and passes even without SPARK-29755, so don't remove + // `.toLong` below. Please refer SPARK-29755 for more details. + assert(opt.get.toLong === expected.get.toLong) + } + } + + test("compact event log files") { + def verifyEventLogFiles( + fs: FileSystem, + rootPath: String, + expectedIndexForCompact: Option[Long], + expectedIndicesForNonCompact: Seq[Long]): Unit = { + val reader = EventLogFileReader(fs, new Path(rootPath)).get + var logFiles = reader.listEventLogFiles + + expectedIndexForCompact.foreach { idx => + val headFile = logFiles.head + assert(EventLogFileWriter.isCompacted(headFile.getPath)) + assert(idx == RollingEventLogFilesWriter.getEventLogFileIndex(headFile.getPath.getName)) + logFiles = logFiles.drop(1) + } + + assert(logFiles.size === expectedIndicesForNonCompact.size) + + logFiles.foreach { logFile => + assert(RollingEventLogFilesWriter.isEventLogFile(logFile)) + assert(!EventLogFileWriter.isCompacted(logFile.getPath)) + } + + val indices = logFiles.map { logFile => + RollingEventLogFilesWriter.getEventLogFileIndex(logFile.getPath.getName) + } + assert(expectedIndicesForNonCompact === indices) + } + + withTempDir { dir => + val conf = createTestConf() + conf.set(HISTORY_LOG_DIR, dir.getAbsolutePath) + conf.set(EVENT_LOG_ROLLING_MAX_FILES_TO_RETAIN, 1) + conf.set(EVENT_LOG_COMPACTION_SCORE_THRESHOLD, 0.0d) + val hadoopConf = SparkHadoopUtil.newConfiguration(conf) + val fs = new Path(dir.getAbsolutePath).getFileSystem(hadoopConf) + + val provider = new FsHistoryProvider(conf) + + val writer = new RollingEventLogFilesWriter("app", None, dir.toURI, conf, hadoopConf) + writer.start() + + // writing event log file 1 - don't compact for now + writeEventsToRollingWriter(writer, Seq( + SparkListenerApplicationStart("app", Some("app"), 0, "user", None), + SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) 
+ + updateAndCheck(provider) { _ => + verifyEventLogFiles(fs, writer.logPath, None, Seq(1)) + val info = provider.listing.read(classOf[LogInfo], writer.logPath) + assert(info.lastEvaluatedForCompaction === Some(1)) + } + + // writing event log file 2 - compact the event log file 1 into 1.compact + writeEventsToRollingWriter(writer, Seq.empty, rollFile = true) + writeEventsToRollingWriter(writer, Seq(SparkListenerUnpersistRDD(1), + SparkListenerJobEnd(1, 1, JobSucceeded)), rollFile = false) + + updateAndCheck(provider) { _ => + verifyEventLogFiles(fs, writer.logPath, Some(1), Seq(2)) + val info = provider.listing.read(classOf[LogInfo], writer.logPath) + assert(info.lastEvaluatedForCompaction === Some(2)) + } + + // writing event log file 3 - compact two files - 1.compact & 2 into one, 2.compact + writeEventsToRollingWriter(writer, Seq.empty, rollFile = true) + writeEventsToRollingWriter(writer, Seq( + SparkListenerExecutorAdded(3, "exec1", new ExecutorInfo("host1", 1, Map.empty)), + SparkListenerJobStart(2, 4, Seq.empty), + SparkListenerJobEnd(2, 5, JobSucceeded)), rollFile = false) + + writer.stop() + + updateAndCheck(provider) { _ => + verifyEventLogFiles(fs, writer.logPath, Some(2), Seq(3)) + + val info = provider.listing.read(classOf[LogInfo], writer.logPath) + assert(info.lastEvaluatedForCompaction === Some(3)) + + val store = new InMemoryStore + val appStore = new AppStatusStore(store) + + val reader = EventLogFileReader(fs, new Path(writer.logPath)).get + provider.rebuildAppStore(store, reader, 0L) + + // replayed store doesn't have any job, as events for job are removed while compacting + intercept[NoSuchElementException] { + appStore.job(1) + } + + // but other events should be available even they were in original files to compact + val appInfo = appStore.applicationInfo() + assert(appInfo.id === "app") + assert(appInfo.name === "app") + + // All events in retained file(s) should be available, including events which would have + // been filtered out if 
compaction is applied. e.g. finished jobs, removed executors, etc. + val exec1 = appStore.executorSummary("exec1") + assert(exec1.hostPort === "host1") + val job2 = appStore.job(2) + assert(job2.status === JobExecutionStatus.SUCCEEDED) + } + } + } + /** * Asks the provider to check for logs and calls a function to perform checks on the updated * app list. Example: @@ -1254,9 +1488,13 @@ class FsHistoryProviderSuite extends SparkFunSuite with Matchers with Logging { private def writeFile(file: File, codec: Option[CompressionCodec], events: SparkListenerEvent*) = { val fstream = new FileOutputStream(file) - val cstream = codec.map(_.compressedOutputStream(fstream)).getOrElse(fstream) + val cstream = codec.map(_.compressedContinuousOutputStream(fstream)).getOrElse(fstream) val bstream = new BufferedOutputStream(cstream) - EventLoggingListener.initEventLog(bstream, false, null) + + val metadata = SparkListenerLogStart(org.apache.spark.SPARK_VERSION) + val eventJson = JsonProtocol.logStartToJson(metadata) + val metadataJson = compact(eventJson) + "\n" + bstream.write(metadataJson.getBytes(StandardCharsets.UTF_8)) val writer = new OutputStreamWriter(bstream, StandardCharsets.UTF_8) Utils.tryWithSafeFinally { diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index dbc1938ed469a..206db0feb5716 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -26,7 +26,6 @@ import javax.servlet.http.{HttpServletRequest, HttpServletRequestWrapper, HttpSe import scala.collection.JavaConverters._ import scala.concurrent.duration._ -import com.gargoylesoftware.htmlunit.BrowserVersion import com.google.common.io.{ByteStreams, Files} import org.apache.commons.io.{FileUtils, IOUtils} import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} @@ -40,8 +39,8 
@@ import org.openqa.selenium.WebDriver import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually -import org.scalatest.mockito.MockitoSugar -import org.scalatest.selenium.WebBrowser +import org.scalatestplus.mockito.MockitoSugar +import org.scalatestplus.selenium.WebBrowser import org.apache.spark._ import org.apache.spark.internal.config._ @@ -85,7 +84,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers .set(IS_TESTING, true) .set(LOCAL_STORE_DIR, storeDir.getAbsolutePath()) .set(EVENT_LOG_STAGE_EXECUTOR_METRICS, true) - .set(EVENT_LOG_PROCESS_TREE_METRICS, true) + .set(EXECUTOR_PROCESS_TREE_METRICS_ENABLED, true) conf.setAll(extraConf) provider = new FsHistoryProvider(conf) provider.checkForLogs() @@ -94,6 +93,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers server = new HistoryServer(conf, provider, securityManager, 18080) server.initialize() server.bind() + provider.start() port = server.boundPort } @@ -185,7 +185,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers test(name) { val (code, jsonOpt, errOpt) = getContentAndCode(path) code should be (HttpServletResponse.SC_OK) - jsonOpt should be ('defined) + jsonOpt should be (Symbol("defined")) errOpt should be (None) val exp = IOUtils.toString(new FileInputStream( @@ -364,8 +364,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers contextHandler.addServlet(holder, "/") server.attachHandler(contextHandler) - implicit val webDriver: WebDriver = - new HtmlUnitDriver(BrowserVersion.INTERNET_EXPLORER_11, true) + implicit val webDriver: WebDriver = new HtmlUnitDriver(true) try { val url = s"http://localhost:$port" @@ -451,6 +450,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers server = new HistoryServer(myConf, provider, securityManager, 0) server.initialize() 
server.bind() + provider.start() val port = server.boundPort val metrics = server.cacheMetrics diff --git a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala index 9ce046a2e2f50..0cf573c2490b3 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/MasterSuite.scala @@ -18,12 +18,12 @@ package org.apache.spark.deploy.master import java.util.Date -import java.util.concurrent.ConcurrentLinkedQueue +import java.util.concurrent.{ConcurrentLinkedQueue, CountDownLatch, TimeUnit} import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ import scala.collection.mutable -import scala.collection.mutable.{HashMap, HashSet} +import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} import scala.concurrent.duration._ import scala.io.Source import scala.reflect.ClassTag @@ -97,13 +97,40 @@ class MockWorker(master: RpcEndpointRef, conf: SparkConf = new SparkConf) extend } } -class MockExecutorLaunchFailWorker(master: RpcEndpointRef, conf: SparkConf = new SparkConf) - extends MockWorker(master, conf) { +// This class is designed to handle the lifecycle of only one application. +class MockExecutorLaunchFailWorker(master: Master, conf: SparkConf = new SparkConf) + extends MockWorker(master.self, conf) with Eventually { + + val appRegistered = new CountDownLatch(1) + val launchExecutorReceived = new CountDownLatch(1) + val appIdsToLaunchExecutor = new mutable.HashSet[String] var failedCnt = 0 + override def receive: PartialFunction[Any, Unit] = { + case LaunchDriver(driverId, _, _) => + master.self.send(RegisterApplication(appDesc, newDriver(driverId))) + + // Below code doesn't make driver stuck, as newDriver opens another rpc endpoint for + // handling driver related messages. 
To simplify logic, we will block handling + // LaunchExecutor message until we validate registering app succeeds. + eventually(timeout(5.seconds)) { + // an app would be registered with Master once Driver set up + assert(apps.nonEmpty) + assert(master.idToApp.keySet.intersect(apps.keySet) == apps.keySet) + } + + appRegistered.countDown() case LaunchExecutor(_, appId, execId, _, _, _, _) => + assert(appRegistered.await(10, TimeUnit.SECONDS)) + + if (failedCnt == 0) { + launchExecutorReceived.countDown() + } + assert(master.idToApp.contains(appId)) + appIdsToLaunchExecutor += appId failedCnt += 1 - master.send(ExecutorStateChanged(appId, execId, ExecutorState.FAILED, None, None)) + master.self.send(ExecutorStateChanged(appId, execId, ExecutorState.FAILED, None, None)) + case otherMsg => super.receive(otherMsg) } } @@ -542,9 +569,10 @@ class MasterSuite extends SparkFunSuite // | Utility methods and fields for testing | // ========================================== - private val _scheduleExecutorsOnWorkers = PrivateMethod[Array[Int]]('scheduleExecutorsOnWorkers) - private val _drivers = PrivateMethod[HashSet[DriverInfo]]('drivers) - private val _state = PrivateMethod[RecoveryState.Value]('state) + private val _scheduleExecutorsOnWorkers = + PrivateMethod[Array[Int]](Symbol("scheduleExecutorsOnWorkers")) + private val _drivers = PrivateMethod[HashSet[DriverInfo]](Symbol("drivers")) + private val _state = PrivateMethod[RecoveryState.Value](Symbol("state")) private val workerInfo = makeWorkerInfo(4096, 10) private val workerInfos = Array(workerInfo, workerInfo, workerInfo) @@ -661,7 +689,7 @@ class MasterSuite extends SparkFunSuite val master = makeAliveMaster() var worker: MockExecutorLaunchFailWorker = null try { - worker = new MockExecutorLaunchFailWorker(master.self) + worker = new MockExecutorLaunchFailWorker(master) worker.rpcEnv.setupEndpoint("worker", worker) val workerRegMsg = RegisterWorker( worker.id, @@ -676,19 +704,16 @@ class MasterSuite extends 
SparkFunSuite val driver = DeployTestUtils.createDriverDesc() // mimic DriverClient to send RequestSubmitDriver to master master.self.askSync[SubmitDriverResponse](RequestSubmitDriver(driver)) - var appId: String = null - eventually(timeout(10.seconds)) { - // an app would be registered with Master once Driver set up - assert(worker.apps.nonEmpty) - appId = worker.apps.head._1 - assert(master.idToApp.contains(appId)) - } + + // LaunchExecutor message should have been received in worker side + assert(worker.launchExecutorReceived.await(10, TimeUnit.SECONDS)) eventually(timeout(10.seconds)) { + val appIds = worker.appIdsToLaunchExecutor // Master would continually launch executors until reach MAX_EXECUTOR_RETRIES assert(worker.failedCnt == master.conf.get(MAX_EXECUTOR_RETRIES)) // Master would remove the app if no executor could be launched for it - assert(!master.idToApp.contains(appId)) + assert(master.idToApp.keySet.intersect(appIds).isEmpty) } } finally { if (worker != null) { diff --git a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala index f4558aa3eb893..e2d7facdd77e0 100644 --- a/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/master/ui/MasterWebUISuite.scala @@ -47,12 +47,12 @@ class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { when(master.self).thenReturn(masterEndpointRef) val masterWebUI = new MasterWebUI(master, 0) - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() masterWebUI.bind() } - override def afterAll() { + override def afterAll(): Unit = { try { masterWebUI.stop() } finally { diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 89b8bb4ff7d03..d5312845a3b50 100644 --- 
a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -42,7 +42,7 @@ class StandaloneRestSubmitSuite extends SparkFunSuite with BeforeAndAfterEach { private var rpcEnv: Option[RpcEnv] = None private var server: Option[RestSubmissionServer] = None - override def afterEach() { + override def afterEach(): Unit = { try { rpcEnv.foreach(_.shutdown()) server.foreach(_.stop()) diff --git a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala index 70174f7ff939a..275bca3459855 100644 --- a/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/security/HadoopDelegationTokenManagerSuite.scala @@ -17,11 +17,17 @@ package org.apache.spark.deploy.security +import java.security.PrivilegedExceptionAction + import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.security.Credentials +import org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION +import org.apache.hadoop.minikdc.MiniKdc +import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.security.HadoopDelegationTokenProvider +import org.apache.spark.util.Utils private class ExceptionThrowingDelegationTokenProvider extends HadoopDelegationTokenProvider { ExceptionThrowingDelegationTokenProvider.constructed = true @@ -69,4 +75,48 @@ class HadoopDelegationTokenManagerSuite extends SparkFunSuite { assert(!manager.isProviderLoaded("hadoopfs")) assert(manager.isProviderLoaded("hbase")) } + + test("SPARK-29082: do not fail if current user does not have credentials") { + // SparkHadoopUtil overrides the UGI configuration 
during initialization. That normally + // happens early in the Spark application, but here it may affect the test depending on + // how it's run, so force its initialization. + SparkHadoopUtil.get + + var kdc: MiniKdc = null + try { + // UserGroupInformation.setConfiguration needs default kerberos realm which can be set in + // krb5.conf. MiniKdc sets "java.security.krb5.conf" in start and removes it when stop called. + val kdcDir = Utils.createTempDir() + val kdcConf = MiniKdc.createConf() + kdc = new MiniKdc(kdcConf, kdcDir) + kdc.start() + + val krbConf = new Configuration() + krbConf.set(HADOOP_SECURITY_AUTHENTICATION, "kerberos") + + UserGroupInformation.setConfiguration(krbConf) + val manager = new HadoopDelegationTokenManager(new SparkConf(false), krbConf, null) + val testImpl = new PrivilegedExceptionAction[Unit] { + override def run(): Unit = { + assert(UserGroupInformation.isSecurityEnabled()) + val creds = new Credentials() + manager.obtainDelegationTokens(creds) + assert(creds.numberOfTokens() === 0) + assert(creds.numberOfSecretKeys() === 0) + } + } + + val realUser = UserGroupInformation.createUserForTesting("realUser", Array.empty) + realUser.doAs(testImpl) + + val proxyUser = UserGroupInformation.createProxyUserForTesting("proxyUser", realUser, + Array.empty) + proxyUser.doAs(testImpl) + } finally { + if (kdc != null) { + kdc.stop() + } + UserGroupInformation.reset() + } + } } diff --git a/core/src/test/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProviderSuite.scala index 1f19884bc24d3..44f38e7043dcd 100644 --- a/core/src/test/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/security/HadoopFSDelegationTokenProviderSuite.scala @@ -22,14 +22,15 @@ import org.apache.hadoop.fs.Path import org.scalatest.Matchers import org.apache.spark.{SparkConf, 
SparkFunSuite} -import org.apache.spark.internal.config.STAGING_DIR +import org.apache.spark.internal.config.{STAGING_DIR, SUBMIT_DEPLOY_MODE} class HadoopFSDelegationTokenProviderSuite extends SparkFunSuite with Matchers { test("hadoopFSsToAccess should return defaultFS even if not configured") { val sparkConf = new SparkConf() val defaultFS = "hdfs://localhost:8020" val statingDir = "hdfs://localhost:8021" - sparkConf.set("spark.master", "yarn-client") + sparkConf.setMaster("yarn") + sparkConf.set(SUBMIT_DEPLOY_MODE, "client") sparkConf.set(STAGING_DIR, statingDir) val hadoopConf = new Configuration() hadoopConf.set("fs.defaultFS", defaultFS) diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala index 607c0a4fac46b..2d3cc5d3abd65 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/CommandUtilsSuite.scala @@ -38,7 +38,7 @@ class CommandUtilsSuite extends SparkFunSuite with Matchers with PrivateMethodTe } test("auth secret shouldn't appear in java opts") { - val buildLocalCommand = PrivateMethod[Command]('buildLocalCommand) + val buildLocalCommand = PrivateMethod[Command](Symbol("buildLocalCommand")) val conf = new SparkConf val secret = "This is the secret sauce" // set auth secret diff --git a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala index 4c3e96777940d..c8b4e3372386b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/worker/ui/LogPageSuite.scala @@ -53,7 +53,7 @@ class LogPageSuite extends SparkFunSuite with PrivateMethodTester { write(tmpRand, "1 6 4 5 2 7 8") // Get the logs. 
All log types other than "stderr" or "stdout" will be rejected - val getLog = PrivateMethod[(String, Long, Long, Long)]('getLog) + val getLog = PrivateMethod[(String, Long, Long, Long)](Symbol("getLog")) val (stdout, _, _, _) = logPage invokePrivate getLog(workDir.getAbsolutePath, "stdout", None, 100) val (stderr, _, _, _) = diff --git a/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala b/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala index 64d99a59b9192..3134a738b33fa 100644 --- a/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/CoarseGrainedExecutorBackendSuite.scala @@ -30,11 +30,11 @@ import org.json4s.JsonAST.{JArray, JObject} import org.json4s.JsonDSL._ import org.mockito.Mockito.when import org.scalatest.concurrent.Eventually.{eventually, timeout} -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.TestUtils._ -import org.apache.spark.resource.{ResourceAllocation, ResourceInformation} +import org.apache.spark.resource._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ import org.apache.spark.rpc.RpcEnv @@ -50,13 +50,13 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite test("parsing no resources") { val conf = new SparkConf - conf.set(TASK_GPU_ID.amountConf, "2") + val resourceProfile = ResourceProfile.getOrCreateDefaultProfile(conf) val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, 
resourceProfile) withTempDir { tmpDir => val testResourceArgs: JObject = ("" -> "") val ja = JArray(List(testResourceArgs)) @@ -73,12 +73,11 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite test("parsing one resource") { val conf = new SparkConf conf.set(EXECUTOR_GPU_ID.amountConf, "2") - conf.set(TASK_GPU_ID.amountConf, "2") val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, ResourceProfile.getOrCreateDefaultProfile(conf)) withTempDir { tmpDir => val ra = ResourceAllocation(EXECUTOR_GPU_ID, Seq("0", "1")) val ja = Extraction.decompose(Seq(ra)) @@ -88,22 +87,31 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite assert(parsedResources.size === 1) assert(parsedResources.get(GPU).nonEmpty) assert(parsedResources.get(GPU).get.name === GPU) - assert(parsedResources.get(GPU).get.addresses.deep === Array("0", "1").deep) + assert(parsedResources.get(GPU).get.addresses.sameElements(Array("0", "1"))) } } + test("parsing multiple resources resource profile") { + val rpBuilder = new ResourceProfileBuilder + val ereqs = new ExecutorResourceRequests().resource(GPU, 2) + ereqs.resource(FPGA, 3) + val rp = rpBuilder.require(ereqs).build + testParsingMultipleResources(new SparkConf, rp) + } + test("parsing multiple resources") { val conf = new SparkConf conf.set(EXECUTOR_GPU_ID.amountConf, "2") - conf.set(TASK_GPU_ID.amountConf, "2") conf.set(EXECUTOR_FPGA_ID.amountConf, "3") - conf.set(TASK_FPGA_ID.amountConf, "3") + testParsingMultipleResources(conf, ResourceProfile.getOrCreateDefaultProfile(conf)) + } + def testParsingMultipleResources(conf: SparkConf, resourceProfile: ResourceProfile) { val 
serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend( env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, resourceProfile) withTempDir { tmpDir => val gpuArgs = ResourceAllocation(EXECUTOR_GPU_ID, Seq("0", "1")) @@ -116,27 +124,26 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite assert(parsedResources.size === 2) assert(parsedResources.get(GPU).nonEmpty) assert(parsedResources.get(GPU).get.name === GPU) - assert(parsedResources.get(GPU).get.addresses.deep === Array("0", "1").deep) + assert(parsedResources.get(GPU).get.addresses.sameElements(Array("0", "1"))) assert(parsedResources.get(FPGA).nonEmpty) assert(parsedResources.get(FPGA).get.name === FPGA) - assert(parsedResources.get(FPGA).get.addresses.deep === Array("f1", "f2", "f3").deep) + assert(parsedResources.get(FPGA).get.addresses.sameElements(Array("f1", "f2", "f3"))) } } test("error checking parsing resources and executor and task configs") { val conf = new SparkConf conf.set(EXECUTOR_GPU_ID.amountConf, "2") - conf.set(TASK_GPU_ID.amountConf, "2") val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, ResourceProfile.getOrCreateDefaultProfile(conf)) // not enough gpu's on the executor withTempDir { tmpDir => val gpuArgs = ResourceAllocation(EXECUTOR_GPU_ID, Seq("0")) - val ja = Extraction.decompose(Seq(gpuArgs)) + val ja = Extraction.decompose(Seq(gpuArgs)) 
val f1 = createTempJsonFile(tmpDir, "resources", ja) var error = intercept[IllegalArgumentException] { @@ -157,20 +164,34 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite val parsedResources = backend.parseOrFindResources(Some(f1)) }.getMessage() - assert(error.contains("User is expecting to use resource: gpu, but didn't specify a " + - "discovery script!")) + assert(error.contains("User is expecting to use resource: gpu, but didn't " + + "specify a discovery script!")) } } + test("executor resource found less than required resource profile") { + val rpBuilder = new ResourceProfileBuilder + val ereqs = new ExecutorResourceRequests().resource(GPU, 4) + val treqs = new TaskResourceRequests().resource(GPU, 1) + val rp = rpBuilder.require(ereqs).require(treqs).build + testExecutorResourceFoundLessThanRequired(new SparkConf, rp) + } + test("executor resource found less than required") { - val conf = new SparkConf + val conf = new SparkConf() conf.set(EXECUTOR_GPU_ID.amountConf, "4") conf.set(TASK_GPU_ID.amountConf, "1") + testExecutorResourceFoundLessThanRequired(conf, ResourceProfile.getOrCreateDefaultProfile(conf)) + } + + private def testExecutorResourceFoundLessThanRequired( + conf: SparkConf, + resourceProfile: ResourceProfile) = { val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, resourceProfile) // executor resources < required withTempDir { tmpDir => @@ -190,7 +211,6 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite test("use resource discovery") { val conf = new SparkConf conf.set(EXECUTOR_FPGA_ID.amountConf, "3") - conf.set(TASK_FPGA_ID.amountConf, "3") assume(!(Utils.isWindows)) 
withTempDir { dir => val scriptPath = createTempScriptWithExpectedOutput(dir, "fpgaDiscoverScript", @@ -201,49 +221,68 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite val env = createMockEnv(conf, serializer) // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) + val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, ResourceProfile.getOrCreateDefaultProfile(conf)) val parsedResources = backend.parseOrFindResources(None) assert(parsedResources.size === 1) assert(parsedResources.get(FPGA).nonEmpty) assert(parsedResources.get(FPGA).get.name === FPGA) - assert(parsedResources.get(FPGA).get.addresses.deep === Array("f1", "f2", "f3").deep) + assert(parsedResources.get(FPGA).get.addresses.sameElements(Array("f1", "f2", "f3"))) + } + } + + test("use resource discovery and allocated file option with resource profile") { + assume(!(Utils.isWindows)) + withTempDir { dir => + val scriptPath = createTempScriptWithExpectedOutput(dir, "fpgaDiscoverScript", + """{"name": "fpga","addresses":["f1", "f2", "f3"]}""") + val rpBuilder = new ResourceProfileBuilder + val ereqs = new ExecutorResourceRequests().resource(FPGA, 3, scriptPath) + ereqs.resource(GPU, 2) + val rp = rpBuilder.require(ereqs).build + allocatedFileAndConfigsResourceDiscoveryTestFpga(dir, new SparkConf, rp) } } test("use resource discovery and allocated file option") { - val conf = new SparkConf - conf.set(EXECUTOR_FPGA_ID.amountConf, "3") - conf.set(TASK_FPGA_ID.amountConf, "3") assume(!(Utils.isWindows)) withTempDir { dir => val scriptPath = createTempScriptWithExpectedOutput(dir, "fpgaDiscoverScript", """{"name": "fpga","addresses":["f1", "f2", "f3"]}""") + val conf = new SparkConf + conf.set(EXECUTOR_FPGA_ID.amountConf, "3") conf.set(EXECUTOR_FPGA_ID.discoveryScriptConf, scriptPath) - - 
val serializer = new JavaSerializer(conf) - val env = createMockEnv(conf, serializer) - - // we don't really use this, just need it to get at the parser function - val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", - 4, Seq.empty[URL], env, None) - val gpuArgs = ResourceAllocation(EXECUTOR_GPU_ID, Seq("0", "1")) - val ja = Extraction.decompose(Seq(gpuArgs)) - val f1 = createTempJsonFile(dir, "resources", ja) - val parsedResources = backend.parseOrFindResources(Some(f1)) - - assert(parsedResources.size === 2) - assert(parsedResources.get(GPU).nonEmpty) - assert(parsedResources.get(GPU).get.name === GPU) - assert(parsedResources.get(GPU).get.addresses.deep === Array("0", "1").deep) - assert(parsedResources.get(FPGA).nonEmpty) - assert(parsedResources.get(FPGA).get.name === FPGA) - assert(parsedResources.get(FPGA).get.addresses.deep === Array("f1", "f2", "f3").deep) + conf.set(EXECUTOR_GPU_ID.amountConf, "2") + val rp = ResourceProfile.getOrCreateDefaultProfile(conf) + allocatedFileAndConfigsResourceDiscoveryTestFpga(dir, conf, rp) } } + private def allocatedFileAndConfigsResourceDiscoveryTestFpga( + dir: File, + conf: SparkConf, + resourceProfile: ResourceProfile) = { + val serializer = new JavaSerializer(conf) + val env = createMockEnv(conf, serializer) + + // we don't really use this, just need it to get at the parser function + val backend = new CoarseGrainedExecutorBackend(env.rpcEnv, "driverurl", "1", "host1", "host1", + 4, Seq.empty[URL], env, None, resourceProfile) + val gpuArgs = ResourceAllocation(EXECUTOR_GPU_ID, Seq("0", "1")) + val ja = Extraction.decompose(Seq(gpuArgs)) + val f1 = createTempJsonFile(dir, "resources", ja) + val parsedResources = backend.parseOrFindResources(Some(f1)) + + assert(parsedResources.size === 2) + assert(parsedResources.get(GPU).nonEmpty) + assert(parsedResources.get(GPU).get.name === GPU) + assert(parsedResources.get(GPU).get.addresses.sameElements(Array("0", "1"))) + 
assert(parsedResources.get(FPGA).nonEmpty) + assert(parsedResources.get(FPGA).get.name === FPGA) + assert(parsedResources.get(FPGA).get.addresses.sameElements(Array("f1", "f2", "f3"))) + } test("track allocated resources by taskId") { val conf = new SparkConf @@ -254,15 +293,16 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite try { val rpcEnv = RpcEnv.create("1", "localhost", 0, conf, securityMgr) val env = createMockEnv(conf, serializer, Some(rpcEnv)) - backend = new CoarseGrainedExecutorBackend(env.rpcEnv, rpcEnv.address.hostPort, "1", - "host1", 4, Seq.empty[URL], env, None) + backend = new CoarseGrainedExecutorBackend(env.rpcEnv, rpcEnv.address.hostPort, "1", + "host1", "host1", 4, Seq.empty[URL], env, None, + resourceProfile = ResourceProfile.getOrCreateDefaultProfile(conf)) assert(backend.taskResources.isEmpty) val taskId = 1000000 // We don't really verify the data, just pass it around. val data = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4)) - val taskDescription = new TaskDescription(taskId, 2, "1", "TASK 1000000", 19, 1, - mutable.Map.empty, mutable.Map.empty, new Properties, + val taskDescription = new TaskDescription(taskId, 2, "1", "TASK 1000000", + 19, 1, mutable.Map.empty, mutable.Map.empty, new Properties, Map(GPU -> new ResourceInformation(GPU, Array("0", "1"))), data) val serializedTaskDescription = TaskDescription.encode(taskDescription) backend.executor = mock[Executor] @@ -272,13 +312,15 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite backend.self.send(LaunchTask(new SerializableBuffer(serializedTaskDescription))) eventually(timeout(10.seconds)) { assert(backend.taskResources.size == 1) - assert(backend.taskResources(taskId)(GPU).addresses sameElements Array("0", "1")) + val resources = backend.taskResources(taskId) + assert(resources(GPU).addresses sameElements Array("0", "1")) } // Update the status of a running task shall not affect `taskResources` map. 
backend.statusUpdate(taskId, TaskState.RUNNING, data) assert(backend.taskResources.size == 1) - assert(backend.taskResources(taskId)(GPU).addresses sameElements Array("0", "1")) + val resources = backend.taskResources(taskId) + assert(resources(GPU).addresses sameElements Array("0", "1")) // Update the status of a finished task shall remove the entry from `taskResources` map. backend.statusUpdate(taskId, TaskState.FINISHED, data) @@ -290,6 +332,31 @@ class CoarseGrainedExecutorBackendSuite extends SparkFunSuite } } + test("SPARK-24203 when bindAddress is not set, it defaults to hostname") { + val args1 = Array( + "--driver-url", "driverurl", + "--executor-id", "1", + "--hostname", "host1", + "--cores", "1", + "--app-id", "app1") + + val arg = CoarseGrainedExecutorBackend.parseArguments(args1, "") + assert(arg.bindAddress == "host1") + } + + test("SPARK-24203 when bindAddress is different, it does not default to hostname") { + val args1 = Array( + "--driver-url", "driverurl", + "--executor-id", "1", + "--hostname", "host1", + "--bind-address", "bindaddress1", + "--cores", "1", + "--app-id", "app1") + + val arg = CoarseGrainedExecutorBackend.parseArguments(args1, "") + assert(arg.bindAddress == "bindaddress1") + } + private def createMockEnv(conf: SparkConf, serializer: JavaSerializer, rpcEnv: Option[RpcEnv] = None): SparkEnv = { val mockEnv = mock[SparkEnv] diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index ac7e4b51ebc2b..31049d104e63d 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -33,9 +33,10 @@ import org.mockito.ArgumentMatchers.{any, eq => meq} import org.mockito.Mockito.{inOrder, verify, when} import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer +import org.scalatest.Assertions._ import org.scalatest.PrivateMethodTester import 
org.scalatest.concurrent.Eventually -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.TaskState.TaskState @@ -56,7 +57,7 @@ import org.apache.spark.util.{LongAccumulator, UninterruptibleThread} class ExecutorSuite extends SparkFunSuite with LocalSparkContext with MockitoSugar with Eventually with PrivateMethodTester { - override def afterEach() { + override def afterEach(): Unit = { // Unset any latches after each test; each test that needs them initializes new ones. ExecutorSuiteHelper.latches = null super.afterEach() @@ -116,7 +117,8 @@ class ExecutorSuite extends SparkFunSuite var executor: Executor = null try { - executor = new Executor("id", "localhost", env, userClassPath = Nil, isLocal = true) + executor = new Executor("id", "localhost", env, userClassPath = Nil, isLocal = true, + resources = immutable.Map.empty[String, ResourceInformation]) // the task will be launched in a dedicated worker thread executor.launchTask(mockExecutorBackend, taskDescription) @@ -253,7 +255,8 @@ class ExecutorSuite extends SparkFunSuite val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) val executor = - new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true) + new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, + resources = immutable.Map.empty[String, ResourceInformation]) val executorClass = classOf[Executor] // Save all heartbeats sent into an ArrayBuffer for verification @@ -275,7 +278,7 @@ class ExecutorSuite extends SparkFunSuite private def heartbeatZeroAccumulatorUpdateTest(dropZeroMetrics: Boolean): Unit = { val c = EXECUTOR_HEARTBEAT_DROP_ZERO_ACCUMULATOR_UPDATES.key -> dropZeroMetrics.toString withHeartbeatExecutor(c) { (executor, heartbeats) => - val reportHeartbeat = PrivateMethod[Unit]('reportHeartBeat) + val reportHeartbeat = PrivateMethod[Unit](Symbol("reportHeartBeat")) // When no 
tasks are running, there should be no accumulators sent in heartbeat executor.invokePrivate(reportHeartbeat()) @@ -352,7 +355,8 @@ class ExecutorSuite extends SparkFunSuite val mockBackend = mock[ExecutorBackend] var executor: Executor = null try { - executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true) + executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, + resources = immutable.Map.empty[String, ResourceInformation]) executor.launchTask(mockBackend, taskDescription) // Ensure that the executor's metricsPoller is polled so that values are recorded for @@ -465,7 +469,8 @@ class ExecutorSuite extends SparkFunSuite val timedOut = new AtomicBoolean(false) try { executor = new Executor("id", "localhost", SparkEnv.get, userClassPath = Nil, isLocal = true, - uncaughtExceptionHandler = mockUncaughtExceptionHandler) + uncaughtExceptionHandler = mockUncaughtExceptionHandler, + resources = immutable.Map.empty[String, ResourceInformation]) // the task will be launched in a dedicated worker thread executor.launchTask(mockBackend, taskDescription) if (killTask) { @@ -528,7 +533,8 @@ class FetchFailureThrowingRDD(sc: SparkContext) extends RDD[Int](sc, Nil) { throw new FetchFailedException( bmAddress = BlockManagerId("1", "hostA", 1234), shuffleId = 0, - mapId = 0, + mapId = 0L, + mapIndex = 0, reduceId = 0, message = "fake fetch failure" ) diff --git a/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala b/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala index 9ed1497db5e1d..9836697e1647c 100644 --- a/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ProcfsMetricsGetterSuite.scala @@ -22,9 +22,9 @@ import org.apache.spark.SparkFunSuite class ProcfsMetricsGetterSuite extends SparkFunSuite { - val p = new ProcfsMetricsGetter(getTestResourcePath("ProcfsMetrics")) 
test("testGetProcessInfo") { + val p = new ProcfsMetricsGetter(getTestResourcePath("ProcfsMetrics")) var r = ProcfsMetrics(0, 0, 0, 0, 0, 0) r = p.addProcfsMetricsFromOneProcess(r, 26109) assert(r.jvmVmemTotal == 4769947648L) diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala index 576ca1613f75e..9a21ea6dafcac 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileInputFormatSuite.scala @@ -25,7 +25,6 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils /** * Tests the correctness of @@ -35,13 +34,13 @@ import org.apache.spark.util.Utils class WholeTextFileInputFormatSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { private var sc: SparkContext = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() val conf = new SparkConf() sc = new SparkContext("local", "test", conf) } - override def afterAll() { + override def afterAll(): Unit = { try { sc.stop() } finally { diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index 47552916adb22..fab7aea6c47aa 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -40,7 +40,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAl private var sc: SparkContext = _ private var factory: CompressionCodecFactory = _ - override def beforeAll() { + override def beforeAll(): Unit = { // Hadoop's FileSystem caching does not use the Configuration as part of its cache key, 
which // can cause Filesystem.get(Configuration) to return a cached instance created with a different // configuration than the one passed to get() (see HADOOP-8490 for more details). This caused @@ -59,7 +59,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite with BeforeAndAfterAl factory = new CompressionCodecFactory(sc.hadoopConfiguration) } - override def afterAll() { + override def afterAll(): Unit = { try { sc.stop() } finally { diff --git a/core/src/test/scala/org/apache/spark/internal/LoggingSuite.scala b/core/src/test/scala/org/apache/spark/internal/LoggingSuite.scala index 250ac3dafcabc..6b7cc304a1baa 100644 --- a/core/src/test/scala/org/apache/spark/internal/LoggingSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/LoggingSuite.scala @@ -33,18 +33,14 @@ class LoggingSuite extends SparkFunSuite { val originalThreshold = Logging.sparkShellThresholdLevel Logging.sparkShellThresholdLevel = Level.WARN try { - val logger = Logger.getLogger("a.b.c.D") - val logEvent = new LoggingEvent(logger.getName(), logger, Level.INFO, "Test", null) - assert(ssf.decide(logEvent) === Filter.DENY) - - // log level is less than threshold level but different from root level - val logEvent1 = new LoggingEvent(logger.getName(), logger, Level.DEBUG, "Test", null) - assert(ssf.decide(logEvent1) != Filter.DENY) + val logger1 = Logger.getLogger("a.b.c.D") + val logEvent1 = new LoggingEvent(logger1.getName(), logger1, Level.INFO, "Test", null) + assert(ssf.decide(logEvent1) == Filter.DENY) // custom log level configured val parentLogger = Logger.getLogger("a.b.c") parentLogger.setLevel(Level.INFO) - assert(ssf.decide(logEvent) != Filter.DENY) + assert(ssf.decide(logEvent1) != Filter.DENY) // log level is greater than or equal to threshold level val logger2 = Logger.getLogger("a.b.E") diff --git a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala new 
file mode 100644 index 0000000000000..cf2d9293ef822 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.internal.plugin + +import java.io.File +import java.nio.charset.StandardCharsets +import java.util.{Map => JMap} + +import scala.collection.JavaConverters._ +import scala.concurrent.duration._ + +import com.codahale.metrics.Gauge +import com.google.common.io.Files +import org.mockito.ArgumentMatchers.{any, eq => meq} +import org.mockito.Mockito.{mock, spy, verify, when} +import org.scalatest.BeforeAndAfterEach +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} + +import org.apache.spark._ +import org.apache.spark.TestUtils._ +import org.apache.spark.api.plugin._ +import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher +import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.ResourceUtils.GPU +import org.apache.spark.resource.TestResourceIDs.{DRIVER_GPU_ID, EXECUTOR_GPU_ID, WORKER_GPU_ID} +import org.apache.spark.util.Utils + +class PluginContainerSuite extends SparkFunSuite with 
BeforeAndAfterEach with LocalSparkContext { + + override def afterEach(): Unit = { + TestSparkPlugin.reset() + NonLocalModeSparkPlugin.reset() + super.afterEach() + } + + test("plugin initialization and communication") { + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local[1]") + .set(PLUGINS, Seq(classOf[TestSparkPlugin].getName())) + + TestSparkPlugin.extraConf = Map("foo" -> "bar", "bar" -> "baz").asJava + + sc = new SparkContext(conf) + + assert(TestSparkPlugin.driverPlugin != null) + verify(TestSparkPlugin.driverPlugin).init(meq(sc), any()) + + assert(TestSparkPlugin.executorPlugin != null) + verify(TestSparkPlugin.executorPlugin).init(any(), meq(TestSparkPlugin.extraConf)) + + assert(TestSparkPlugin.executorContext != null) + assert(TestSparkPlugin.executorContext.resources.isEmpty) + + // One way messages don't block, so need to loop checking whether it arrives. + TestSparkPlugin.executorContext.send("oneway") + eventually(timeout(10.seconds), interval(10.millis)) { + verify(TestSparkPlugin.driverPlugin).receive("oneway") + } + + assert(TestSparkPlugin.executorContext.ask("ask") === "reply") + + val err = intercept[Exception] { + TestSparkPlugin.executorContext.ask("unknown message") + } + assert(err.getMessage().contains("unknown message")) + + // It should be possible for the driver plugin to send a message to itself, even if that doesn't + // make a whole lot of sense. It at least allows the same context class to be used on both + // sides. 
+ assert(TestSparkPlugin.driverContext != null) + assert(TestSparkPlugin.driverContext.ask("ask") === "reply") + + val metricSources = sc.env.metricsSystem + .getSourcesByName(s"plugin.${classOf[TestSparkPlugin].getName()}") + assert(metricSources.size === 2) + + def findMetric(name: String): Int = { + val allFound = metricSources.filter(_.metricRegistry.getGauges().containsKey(name)) + assert(allFound.size === 1) + allFound.head.metricRegistry.getGauges().get(name).asInstanceOf[Gauge[Int]].getValue() + } + + assert(findMetric("driverMetric") === 42) + assert(findMetric("executorMetric") === 84) + + sc.stop() + sc = null + + verify(TestSparkPlugin.driverPlugin).shutdown() + verify(TestSparkPlugin.executorPlugin).shutdown() + } + + test("do nothing if plugins are not configured") { + val conf = new SparkConf() + val env = mock(classOf[SparkEnv]) + when(env.conf).thenReturn(conf) + val container = PluginContainer(env, Map.empty[String, ResourceInformation].asJava) + assert(container === None) + } + + test("merging of config options") { + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local[1]") + .set(PLUGINS, Seq(classOf[TestSparkPlugin].getName())) + .set(DEFAULT_PLUGINS_LIST, classOf[TestSparkPlugin].getName()) + + assert(conf.get(PLUGINS).size === 2) + + sc = new SparkContext(conf) + // Just check plugin is loaded. The plugin code below checks whether a single copy was loaded. 
+ assert(TestSparkPlugin.driverPlugin != null) + } + + test("plugin initialization in non-local mode") { + val path = Utils.createTempDir() + + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[2,1,1024]") + .set(PLUGINS, Seq(classOf[NonLocalModeSparkPlugin].getName())) + .set(NonLocalModeSparkPlugin.TEST_PATH_CONF, path.getAbsolutePath()) + + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + + eventually(timeout(10.seconds), interval(100.millis)) { + val children = path.listFiles() + assert(children != null) + assert(children.length >= 3) + } + } + + test("plugin initialization in non-local mode with resources") { + withTempDir { dir => + val scriptPath = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", + """{"name": "gpu","addresses":["5", "6"]}""") + + val workerScript = createTempScriptWithExpectedOutput(dir, "resourceDiscoveryScript", + """{"name": "gpu","addresses":["3", "4"]}""") + + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[1,1,1024]") + .set(PLUGINS, Seq(classOf[NonLocalModeSparkPlugin].getName())) + .set(NonLocalModeSparkPlugin.TEST_PATH_CONF, dir.getAbsolutePath()) + .set(DRIVER_GPU_ID.amountConf, "2") + .set(DRIVER_GPU_ID.discoveryScriptConf, scriptPath) + .set(WORKER_GPU_ID.amountConf, "2") + .set(WORKER_GPU_ID.discoveryScriptConf, workerScript) + .set(EXECUTOR_GPU_ID.amountConf, "2") + sc = new SparkContext(conf) + + // Ensure all executors has started + TestUtils.waitUntilExecutorsUp(sc, 1, 10000) + + var children = Array.empty[File] + eventually(timeout(10.seconds), interval(100.millis)) { + children = dir.listFiles() + assert(children != null) + // we have 2 discovery scripts and then expect 1 driver and 1 executor file + assert(children.length >= 4) + } + val execFiles = + children.filter(_.getName.startsWith(NonLocalModeSparkPlugin.executorFileStr)) + assert(execFiles.size 
=== 1) + val allLines = Files.readLines(execFiles(0), StandardCharsets.UTF_8) + assert(allLines.size === 1) + val addrs = NonLocalModeSparkPlugin.extractGpuAddrs(allLines.get(0)) + assert(addrs.size === 2) + assert(addrs.sorted === Array("3", "4")) + + assert(NonLocalModeSparkPlugin.driverContext != null) + val driverResources = NonLocalModeSparkPlugin.driverContext.resources() + assert(driverResources.size === 1) + assert(driverResources.get(GPU).addresses === Array("5", "6")) + assert(driverResources.get(GPU).name === GPU) + } + } +} + +class NonLocalModeSparkPlugin extends SparkPlugin { + + override def driverPlugin(): DriverPlugin = { + new DriverPlugin() { + override def init(sc: SparkContext, ctx: PluginContext): JMap[String, String] = { + NonLocalModeSparkPlugin.writeDriverFile(NonLocalModeSparkPlugin.driverFileStr, ctx.conf(), + ctx.executorID()) + NonLocalModeSparkPlugin.driverContext = ctx + Map.empty[String, String].asJava + } + } + } + + override def executorPlugin(): ExecutorPlugin = { + new ExecutorPlugin() { + override def init(ctx: PluginContext, extraConf: JMap[String, String]): Unit = { + NonLocalModeSparkPlugin.writeFile(NonLocalModeSparkPlugin.executorFileStr, ctx.conf(), + ctx.executorID(), ctx.resources().asScala.toMap) + } + } + } +} + +object NonLocalModeSparkPlugin { + val TEST_PATH_CONF = "spark.nonLocalPlugin.path" + var driverContext: PluginContext = _ + val executorFileStr = "EXECUTOR_FILE_" + val driverFileStr = "DRIVER_FILE_" + + private def createFileStringWithGpuAddrs( + id: String, + resources: Map[String, ResourceInformation]): String = { + // try to keep this simple and only write the gpus addresses, if we add more resources need to + // make more complex + val resourcesString = resources.filterKeys(_.equals(GPU)).map { + case (_, ri) => + s"${ri.addresses.mkString(",")}" + }.mkString(",") + s"$id&$resourcesString" + } + + def extractGpuAddrs(str: String): Array[String] = { + val idAndAddrs = str.split("&") + if (idAndAddrs.size 
> 1) { + idAndAddrs(1).split(",") + } else { + Array.empty[String] + } + } + + def writeDriverFile( + filePrefix: String, + conf: SparkConf, + id: String): Unit = { + writeFile(filePrefix, conf, id, Map.empty) + } + + def writeFile( + filePrefix: String, + conf: SparkConf, + id: String, + resources: Map[String, ResourceInformation]): Unit = { + val path = conf.get(TEST_PATH_CONF) + val strToWrite = createFileStringWithGpuAddrs(id, resources) + Files.write(strToWrite, new File(path, s"$filePrefix$id"), StandardCharsets.UTF_8) + } + + def reset(): Unit = { + driverContext = null + } +} + +class TestSparkPlugin extends SparkPlugin { + + override def driverPlugin(): DriverPlugin = { + val p = new TestDriverPlugin() + require(TestSparkPlugin.driverPlugin == null, "Driver plugin already initialized.") + TestSparkPlugin.driverPlugin = spy(p) + TestSparkPlugin.driverPlugin + } + + override def executorPlugin(): ExecutorPlugin = { + val p = new TestExecutorPlugin() + require(TestSparkPlugin.executorPlugin == null, "Executor plugin already initialized.") + TestSparkPlugin.executorPlugin = spy(p) + TestSparkPlugin.executorPlugin + } + +} + +private class TestDriverPlugin extends DriverPlugin { + + override def init(sc: SparkContext, ctx: PluginContext): JMap[String, String] = { + TestSparkPlugin.driverContext = ctx + TestSparkPlugin.extraConf + } + + override def registerMetrics(appId: String, ctx: PluginContext): Unit = { + ctx.metricRegistry().register("driverMetric", new Gauge[Int] { + override def getValue(): Int = 42 + }) + } + + override def receive(msg: AnyRef): AnyRef = msg match { + case "oneway" => null + case "ask" => "reply" + case other => throw new IllegalArgumentException(s"unknown: $other") + } + +} + +private class TestExecutorPlugin extends ExecutorPlugin { + + override def init(ctx: PluginContext, extraConf: JMap[String, String]): Unit = { + ctx.metricRegistry().register("executorMetric", new Gauge[Int] { + override def getValue(): Int = 84 + }) + 
TestSparkPlugin.executorContext = ctx + } + +} + +private object TestSparkPlugin { + var driverPlugin: TestDriverPlugin = _ + var driverContext: PluginContext = _ + + var executorPlugin: TestExecutorPlugin = _ + var executorContext: PluginContext = _ + + var extraConf: JMap[String, String] = _ + + def reset(): Unit = { + driverPlugin = null + driverContext = null + executorPlugin = null + executorContext = null + extraConf = null + } +} diff --git a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferFileRegionSuite.scala b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferFileRegionSuite.scala index a6b0654204f34..551c0f1a73241 100644 --- a/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferFileRegionSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/ChunkedByteBufferFileRegionSuite.scala @@ -23,7 +23,7 @@ import scala.util.Random import org.mockito.Mockito.when import org.scalatest.BeforeAndAfterEach -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkConf, SparkEnv, SparkFunSuite} import org.apache.spark.internal.config diff --git a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala index 7b40e3e58216d..4b27396e6ae05 100644 --- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala +++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.{SparkConf, SparkFunSuite} class CompressionCodecSuite extends SparkFunSuite { val conf = new SparkConf(false) - def testCodec(codec: CompressionCodec) { + def testCodec(codec: CompressionCodec): Unit = { // Write 1000 integers to the output stream, compressed. 
val outputStream = new ByteArrayOutputStream() val out = codec.compressedOutputStream(outputStream) diff --git a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala index c26945fa5fa31..60f67699f81be 100644 --- a/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala +++ b/core/src/test/scala/org/apache/spark/memory/TestMemoryManager.scala @@ -17,60 +17,110 @@ package org.apache.spark.memory +import javax.annotation.concurrent.GuardedBy + +import scala.collection.mutable + import org.apache.spark.SparkConf import org.apache.spark.storage.BlockId class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { + @GuardedBy("this") + private var consequentOOM = 0 + @GuardedBy("this") + private var available = Long.MaxValue + @GuardedBy("this") + private val memoryForTask = mutable.HashMap[Long, Long]().withDefaultValue(0L) + override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, - memoryMode: MemoryMode): Long = { - if (consequentOOM > 0) { - consequentOOM -= 1 - 0 - } else if (available >= numBytes) { - available -= numBytes - numBytes - } else { - val grant = available - available = 0 - grant + memoryMode: MemoryMode): Long = synchronized { + require(numBytes >= 0) + val acquired = { + if (consequentOOM > 0) { + consequentOOM -= 1 + 0 + } else if (available >= numBytes) { + available -= numBytes + numBytes + } else { + val grant = available + available = 0 + grant + } } + memoryForTask(taskAttemptId) = memoryForTask.getOrElse(taskAttemptId, 0L) + acquired + acquired + } + + override private[memory] def releaseExecutionMemory( + numBytes: Long, + taskAttemptId: Long, + memoryMode: MemoryMode): Unit = synchronized { + require(numBytes >= 0) + available += numBytes + val existingMemoryUsage = memoryForTask.getOrElse(taskAttemptId, 0L) + val newMemoryUsage = existingMemoryUsage - numBytes 
+ require( + newMemoryUsage >= 0, + s"Attempting to free $numBytes of memory for task attempt $taskAttemptId, but it only " + + s"allocated $existingMemoryUsage bytes of memory") + memoryForTask(taskAttemptId) = newMemoryUsage + } + + override private[memory] def releaseAllExecutionMemoryForTask(taskAttemptId: Long): Long = { + memoryForTask.remove(taskAttemptId).getOrElse(0L) + } + + override private[memory] def getExecutionMemoryUsageForTask(taskAttemptId: Long): Long = { + memoryForTask.getOrElse(taskAttemptId, 0L) } + override def acquireStorageMemory( blockId: BlockId, numBytes: Long, - memoryMode: MemoryMode): Boolean = true + memoryMode: MemoryMode): Boolean = { + require(numBytes >= 0) + true + } + override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, - memoryMode: MemoryMode): Boolean = true - override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = {} - override private[memory] def releaseExecutionMemory( - numBytes: Long, - taskAttemptId: Long, - memoryMode: MemoryMode): Unit = { - available += numBytes + memoryMode: MemoryMode): Boolean = { + require(numBytes >= 0) + true } + + override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = { + require(numBytes >= 0) + } + override def maxOnHeapStorageMemory: Long = Long.MaxValue override def maxOffHeapStorageMemory: Long = 0L - private var consequentOOM = 0 - private var available = Long.MaxValue - + /** + * Causes the next call to [[acquireExecutionMemory()]] to fail to allocate + * memory (returning `0`), simulating low-on-memory / out-of-memory conditions. + */ def markExecutionAsOutOfMemoryOnce(): Unit = { markconsequentOOM(1) } - def markconsequentOOM(n : Int) : Unit = { + /** + * Causes the next `n` calls to [[acquireExecutionMemory()]] to fail to allocate + * memory (returning `0`), simulating low-on-memory / out-of-memory conditions. 
+ */ + def markconsequentOOM(n: Int): Unit = synchronized { consequentOOM += n } - def limit(avail: Long): Unit = { + def limit(avail: Long): Unit = synchronized { + require(avail >= 0) available = avail } - } diff --git a/core/src/test/scala/org/apache/spark/memory/TestMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/TestMemoryManagerSuite.scala new file mode 100644 index 0000000000000..043f341074b88 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/memory/TestMemoryManagerSuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import org.apache.spark.{SparkConf, SparkFunSuite} + +/** + * Tests of [[TestMemoryManager]] itself. 
+ */ +class TestMemoryManagerSuite extends SparkFunSuite { + test("tracks allocated execution memory by task") { + val testMemoryManager = new TestMemoryManager(new SparkConf()) + + assert(testMemoryManager.getExecutionMemoryUsageForTask(0) == 0) + assert(testMemoryManager.getExecutionMemoryUsageForTask(1) == 0) + + testMemoryManager.acquireExecutionMemory(10, 0, MemoryMode.ON_HEAP) + testMemoryManager.acquireExecutionMemory(5, 1, MemoryMode.ON_HEAP) + testMemoryManager.acquireExecutionMemory(5, 0, MemoryMode.ON_HEAP) + assert(testMemoryManager.getExecutionMemoryUsageForTask(0) == 15) + assert(testMemoryManager.getExecutionMemoryUsageForTask(1) == 5) + + testMemoryManager.releaseExecutionMemory(10, 0, MemoryMode.ON_HEAP) + assert(testMemoryManager.getExecutionMemoryUsageForTask(0) == 5) + + testMemoryManager.releaseAllExecutionMemoryForTask(0) + testMemoryManager.releaseAllExecutionMemoryForTask(1) + assert(testMemoryManager.getExecutionMemoryUsageForTask(0) == 0) + assert(testMemoryManager.getExecutionMemoryUsageForTask(1) == 0) + } + + test("markconsequentOOM") { + val testMemoryManager = new TestMemoryManager(new SparkConf()) + assert(testMemoryManager.acquireExecutionMemory(1, 0, MemoryMode.ON_HEAP) == 1) + testMemoryManager.markconsequentOOM(2) + assert(testMemoryManager.acquireExecutionMemory(1, 0, MemoryMode.ON_HEAP) == 0) + assert(testMemoryManager.acquireExecutionMemory(1, 0, MemoryMode.ON_HEAP) == 0) + assert(testMemoryManager.acquireExecutionMemory(1, 0, MemoryMode.ON_HEAP) == 1) + } +} diff --git a/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala index 0a689f81a5761..0cafe6891c7d1 100644 --- a/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala @@ -305,7 +305,7 @@ class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTes 
intercept[RuntimeException] { mm.acquireExecutionMemory(1000L, 0, memoryMode) } - val assertInvariants = PrivateMethod[Unit]('assertInvariants) + val assertInvariants = PrivateMethod[Unit](Symbol("assertInvariants")) mm.invokePrivate[Unit](assertInvariants()) } diff --git a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala index c7bd0c905d027..330347299ab56 100644 --- a/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/InputOutputMetricsSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.metrics -import java.io.{File, FileWriter, PrintWriter} +import java.io.{File, PrintWriter} import scala.collection.mutable.ArrayBuffer @@ -166,7 +166,7 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext var shuffleRead = 0L var shuffleWritten = 0L sc.addSparkListener(new SparkListener() { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val metrics = taskEnd.taskMetrics inputRead += metrics.inputMetrics.recordsRead outputWritten += metrics.outputMetrics.recordsWritten @@ -182,7 +182,7 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext .reduceByKey(_ + _) .saveAsTextFile(tmpFile.toURI.toString) - sc.listenerBus.waitUntilEmpty(500) + sc.listenerBus.waitUntilEmpty() assert(inputRead == numRecords) assert(outputWritten == numBuckets) @@ -243,17 +243,17 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext val taskMetrics = new ArrayBuffer[Long]() // Avoid receiving earlier taskEnd events - sc.listenerBus.waitUntilEmpty(500) + sc.listenerBus.waitUntilEmpty() sc.addSparkListener(new SparkListener() { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { taskMetrics += collector(taskEnd) 
} }) job - sc.listenerBus.waitUntilEmpty(500) + sc.listenerBus.waitUntilEmpty() taskMetrics.sum } @@ -284,16 +284,16 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext val taskBytesWritten = new ArrayBuffer[Long]() sc.addSparkListener(new SparkListener() { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { taskBytesWritten += taskEnd.taskMetrics.outputMetrics.bytesWritten } }) - val rdd = sc.parallelize(Array("a", "b", "c", "d"), 2) + val rdd = sc.parallelize(Seq("a", "b", "c", "d"), 2) try { rdd.saveAsTextFile(outPath.toString) - sc.listenerBus.waitUntilEmpty(500) + sc.listenerBus.waitUntilEmpty() assert(taskBytesWritten.length == 2) val outFiles = fs.listStatus(outPath).filter(_.getPath.getName != "_SUCCESS") taskBytesWritten.zip(outFiles).foreach { case (bytes, fileStatus) => diff --git a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala index 99c9dde1cf23c..70b6c9a112142 100644 --- a/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala +++ b/core/src/test/scala/org/apache/spark/metrics/MetricsSystemSuite.scala @@ -42,8 +42,8 @@ class MetricsSystemSuite extends SparkFunSuite with BeforeAndAfter with PrivateM test("MetricsSystem with default config") { val metricsSystem = MetricsSystem.createMetricsSystem("default", conf, securityMgr) metricsSystem.start() - val sources = PrivateMethod[ArrayBuffer[Source]]('sources) - val sinks = PrivateMethod[ArrayBuffer[Sink]]('sinks) + val sources = PrivateMethod[ArrayBuffer[Source]](Symbol("sources")) + val sinks = PrivateMethod[ArrayBuffer[Sink]](Symbol("sinks")) assert(metricsSystem.invokePrivate(sources()).length === StaticSources.allSources.length) assert(metricsSystem.invokePrivate(sinks()).length === 0) @@ -53,8 +53,8 @@ class MetricsSystemSuite extends SparkFunSuite with BeforeAndAfter with PrivateM 
test("MetricsSystem with sources add") { val metricsSystem = MetricsSystem.createMetricsSystem("test", conf, securityMgr) metricsSystem.start() - val sources = PrivateMethod[ArrayBuffer[Source]]('sources) - val sinks = PrivateMethod[ArrayBuffer[Sink]]('sinks) + val sources = PrivateMethod[ArrayBuffer[Source]](Symbol("sources")) + val sinks = PrivateMethod[ArrayBuffer[Sink]](Symbol("sinks")) assert(metricsSystem.invokePrivate(sources()).length === StaticSources.allSources.length) assert(metricsSystem.invokePrivate(sinks()).length === 1) diff --git a/core/src/test/scala/org/apache/spark/metrics/source/SourceConfigSuite.scala b/core/src/test/scala/org/apache/spark/metrics/source/SourceConfigSuite.scala new file mode 100644 index 0000000000000..8f5ab7419d4f7 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/metrics/source/SourceConfigSuite.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.metrics.source + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.internal.config.{METRICS_EXECUTORMETRICS_SOURCE_ENABLED, METRICS_STATIC_SOURCES_ENABLED} + +class SourceConfigSuite extends SparkFunSuite with LocalSparkContext { + + test("Test configuration for adding static sources registration") { + val conf = new SparkConf() + conf.set(METRICS_STATIC_SOURCES_ENABLED, true) + val sc = new SparkContext("local", "test", conf) + try { + val metricsSystem = sc.env.metricsSystem + + // Static sources should be registered + assert (metricsSystem.getSourcesByName("CodeGenerator").nonEmpty) + assert (metricsSystem.getSourcesByName("HiveExternalCatalog").nonEmpty) + } finally { + sc.stop() + } + } + + test("Test configuration for skipping static sources registration") { + val conf = new SparkConf() + conf.set(METRICS_STATIC_SOURCES_ENABLED, false) + val sc = new SparkContext("local", "test", conf) + try { + val metricsSystem = sc.env.metricsSystem + + // Static sources should not be registered + assert (metricsSystem.getSourcesByName("CodeGenerator").isEmpty) + assert (metricsSystem.getSourcesByName("HiveExternalCatalog").isEmpty) + } finally { + sc.stop() + } + } + + test("Test configuration for adding ExecutorMetrics source registration") { + val conf = new SparkConf() + conf.set(METRICS_EXECUTORMETRICS_SOURCE_ENABLED, true) + val sc = new SparkContext("local", "test", conf) + try { + val metricsSystem = sc.env.metricsSystem + + // ExecutorMetrics source should be registered + assert (metricsSystem.getSourcesByName("ExecutorMetrics").nonEmpty) + } finally { + sc.stop() + } + } + + test("Test configuration for skipping ExecutorMetrics source registration") { + val conf = new SparkConf() + conf.set(METRICS_EXECUTORMETRICS_SOURCE_ENABLED, false) + val sc = new SparkContext("local", "test", conf) + try { + val metricsSystem = sc.env.metricsSystem + + // ExecutorMetrics source 
should not be registered + assert (metricsSystem.getSourcesByName("ExecutorMetrics").isEmpty) + } finally { + sc.stop() + } + } + +} diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala index 544d52d48b385..c726329ce8a84 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferSecuritySuite.scala @@ -29,7 +29,7 @@ import scala.util.{Failure, Success, Try} import com.google.common.io.CharStreams import org.mockito.Mockito._ import org.scalatest.Matchers -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.internal.config._ @@ -122,7 +122,7 @@ class NettyBlockTransferSecuritySuite extends SparkFunSuite with MockitoSugar wi val blockString = "Hello, world!" 
val blockBuffer = new NioManagedBuffer(ByteBuffer.wrap( blockString.getBytes(StandardCharsets.UTF_8))) - when(blockManager.getBlockData(blockId)).thenReturn(blockBuffer) + when(blockManager.getLocalBlockData(blockId)).thenReturn(blockBuffer) val securityManager0 = new SecurityManager(conf0) val exec0 = new NettyBlockTransferService(conf0, securityManager0, "localhost", "localhost", 0, diff --git a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala index 5d67d3358a9ca..edddf88a28f85 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/NettyBlockTransferServiceSuite.scala @@ -41,7 +41,7 @@ class NettyBlockTransferServiceSuite private var service0: NettyBlockTransferService = _ private var service1: NettyBlockTransferService = _ - override def afterEach() { + override def afterEach(): Unit = { try { if (service0 != null) { service0.close() diff --git a/core/src/test/scala/org/apache/spark/network/netty/SparkTransportConfSuite.scala b/core/src/test/scala/org/apache/spark/network/netty/SparkTransportConfSuite.scala index d7265b6c24fe7..55cd1a4bfe7dd 100644 --- a/core/src/test/scala/org/apache/spark/network/netty/SparkTransportConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/network/netty/SparkTransportConfSuite.scala @@ -17,8 +17,7 @@ package org.apache.spark.network.netty -import org.scalatest.Matchers -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.network.util.NettyUtils diff --git a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala index a7eb0eca72e56..a5bc557eef5ad 100644 --- 
a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala @@ -37,12 +37,12 @@ class AsyncRDDActionsSuite extends SparkFunSuite with BeforeAndAfterAll with Tim // Necessary to make ScalaTest 3.x interrupt a thread on the JVM like ScalaTest 2.2.x implicit val defaultSignaler: Signaler = ThreadSignaler - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() sc = new SparkContext("local[2]", "test") } - override def afterAll() { + override def afterAll(): Unit = { try { LocalSparkContext.stop(sc) sc = null @@ -66,7 +66,7 @@ class AsyncRDDActionsSuite extends SparkFunSuite with BeforeAndAfterAll with Tim } test("foreachAsync") { - zeroPartRdd.foreachAsync(i => Unit).get() + zeroPartRdd.foreachAsync(i => ()).get() val accum = sc.longAccumulator sc.parallelize(1 to 1000, 3).foreachAsync { i => @@ -76,7 +76,7 @@ class AsyncRDDActionsSuite extends SparkFunSuite with BeforeAndAfterAll with Tim } test("foreachPartitionAsync") { - zeroPartRdd.foreachPartitionAsync(iter => Unit).get() + zeroPartRdd.foreachPartitionAsync(iter => ()).get() val accum = sc.longAccumulator sc.parallelize(1 to 1000, 9).foreachPartitionAsync { iter => @@ -86,7 +86,7 @@ class AsyncRDDActionsSuite extends SparkFunSuite with BeforeAndAfterAll with Tim } test("takeAsync") { - def testTake(rdd: RDD[Int], input: Seq[Int], num: Int) { + def testTake(rdd: RDD[Int], input: Seq[Int], num: Int): Unit = { val expected = input.take(num) val saw = rdd.takeAsync(num).get() assert(saw == expected, "incorrect result for rdd with %d partitions (expected %s, saw %s)" diff --git a/core/src/test/scala/org/apache/spark/rdd/CoalescedRDDBenchmark.scala b/core/src/test/scala/org/apache/spark/rdd/CoalescedRDDBenchmark.scala index 42b30707f2624..617ca5a1a8bc4 100644 --- a/core/src/test/scala/org/apache/spark/rdd/CoalescedRDDBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/rdd/CoalescedRDDBenchmark.scala @@ 
-67,7 +67,8 @@ object CoalescedRDDBenchmark extends BenchmarkBase { benchmark.run() } - private def performCoalesce(blocks: immutable.Seq[(Int, Seq[String])], numPartitions: Int) { + private def performCoalesce(blocks: immutable.Seq[(Int, Seq[String])], + numPartitions: Int): Unit = { sc.makeRDD(blocks).coalesce(numPartitions).partitions } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 1564435a0bbae..2de4b109e40e9 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -34,11 +34,10 @@ import org.scalatest.Assertions import org.apache.spark._ import org.apache.spark.Partitioner -import org.apache.spark.util.Utils class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("aggregateByKey") { - val pairs = sc.parallelize(Array((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) + val pairs = sc.parallelize(Seq((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) val sets = pairs.aggregateByKey(new HashSet[Int]())(_ += _, _ ++= _).collect() assert(sets.size === 3) @@ -51,7 +50,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupByKey") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey().collect() assert(groups.size === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 @@ -61,7 +60,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupByKey with duplicates") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val groups = pairs.groupByKey().collect() assert(groups.size === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 @@ -71,7 +70,7 @@ class 
PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupByKey with negative key hash codes") { - val pairs = sc.parallelize(Array((-1, 1), (-1, 2), (-1, 3), (2, 1))) + val pairs = sc.parallelize(Seq((-1, 1), (-1, 2), (-1, 3), (2, 1))) val groups = pairs.groupByKey().collect() assert(groups.size === 2) val valuesForMinus1 = groups.find(_._1 == -1).get._2 @@ -81,7 +80,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupByKey with many output partitions") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey(10).collect() assert(groups.size === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 @@ -170,13 +169,13 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("reduceByKey") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val sums = pairs.reduceByKey(_ + _).collect() assert(sums.toSet === Set((1, 7), (2, 1))) } test("reduceByKey with collectAsMap") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val sums = pairs.reduceByKey(_ + _).collectAsMap() assert(sums.size === 2) assert(sums(1) === 7) @@ -184,7 +183,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("reduceByKey with many output partitions") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val sums = pairs.reduceByKey(_ + _, 10).collect() assert(sums.toSet === Set((1, 7), (2, 1))) } @@ -194,13 +193,13 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { def numPartitions = 2 def getPartition(key: Any) = key.asInstanceOf[Int] } - 
val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 1), (0, 1))).partitionBy(p) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 1), (0, 1))).partitionBy(p) val sums = pairs.reduceByKey(_ + _) assert(sums.collect().toSet === Set((1, 4), (0, 1))) assert(sums.partitioner === Some(p)) // count the dependencies to make sure there is only 1 ShuffledRDD val deps = new HashSet[RDD[_]]() - def visit(r: RDD[_]) { + def visit(r: RDD[_]): Unit = { for (dep <- r.dependencies) { deps += dep.rdd visit(dep.rdd) @@ -246,8 +245,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("join") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2).collect() assert(joined.size === 4) assert(joined.toSet === Set( @@ -259,8 +258,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("join all-to-all") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (1, 3))) - val rdd2 = sc.parallelize(Array((1, 'x'), (1, 'y'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (1, 3))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (1, 'y'))) val joined = rdd1.join(rdd2).collect() assert(joined.size === 6) assert(joined.toSet === Set( @@ -274,8 +273,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("leftOuterJoin") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.leftOuterJoin(rdd2).collect() assert(joined.size === 5) assert(joined.toSet === Set( @@ -292,7 +291,7 @@ class PairRDDFunctionsSuite extends 
SparkFunSuite with SharedSparkContext { import scala.reflect.classTag val intPairCT = classTag[(Int, Int)] - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.emptyRDD[(Int, Int)](intPairCT) val joined = rdd1.cogroup(rdd2).collect() @@ -304,7 +303,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { import scala.reflect.classTag val intCT = classTag[Int] - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.emptyRDD[Int](intCT).groupBy((x) => 5) val joined = rdd1.cogroup(rdd2).collect() assert(joined.size > 0) @@ -315,7 +314,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { "with an order of magnitude difference in number of partitions") { val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 1000) val rdd2 = sc - .parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) .partitionBy(new HashPartitioner(10)) val joined = rdd1.cogroup(rdd2) assert(joined.getNumPartitions == rdd1.getNumPartitions) @@ -325,7 +324,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("cogroup between multiple RDD with number of partitions similar in order of magnitude") { val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) val rdd2 = sc - .parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) .partitionBy(new HashPartitioner(10)) val joined = rdd1.cogroup(rdd2) assert(joined.getNumPartitions == rdd2.getNumPartitions) @@ -336,7 +335,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { try { sc.conf.set("spark.default.parallelism", "4") val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) - val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1)), 10) + val rdd2 = 
sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1)), 10) val joined = rdd1.cogroup(rdd2) assert(joined.getNumPartitions == sc.defaultParallelism) } finally { @@ -349,7 +348,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { try { sc.conf.set("spark.default.parallelism", "4") val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) - val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) .partitionBy(new HashPartitioner(10)) val joined = rdd1.cogroup(rdd2) assert(joined.getNumPartitions == rdd2.getNumPartitions) @@ -364,7 +363,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { try { sc.conf.set("spark.default.parallelism", "4") val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 1000) - val rdd2 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) .partitionBy(new HashPartitioner(10)) val joined = rdd1.cogroup(rdd2) assert(joined.getNumPartitions == rdd2.getNumPartitions) @@ -374,8 +373,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("rightOuterJoin") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.rightOuterJoin(rdd2).collect() assert(joined.size === 5) assert(joined.toSet === Set( @@ -388,8 +387,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("fullOuterJoin") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 
'w'))) val joined = rdd1.fullOuterJoin(rdd2).collect() assert(joined.size === 6) assert(joined.toSet === Set( @@ -403,15 +402,15 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("join with no matches") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w'))) val joined = rdd1.join(rdd2).collect() assert(joined.size === 0) } test("join with many output partitions") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2, 10).collect() assert(joined.size === 4) assert(joined.toSet === Set( @@ -423,8 +422,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupWith") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.groupWith(rdd2).collect() assert(joined.size === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList))).toSet @@ -437,9 +436,9 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupWith3") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) - val rdd3 = sc.parallelize(Array((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), 
(2, 'y'), (2, 'z'), (4, 'w'))) + val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) val joined = rdd1.groupWith(rdd2, rdd3).collect() assert(joined.size === 4) val joinedSet = joined.map(x => (x._1, @@ -453,10 +452,10 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("groupWith4") { - val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) - val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) - val rdd3 = sc.parallelize(Array((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) - val rdd4 = sc.parallelize(Array((2, '@'))) + val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) + val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) + val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) + val rdd4 = sc.parallelize(Seq((2, '@'))) val joined = rdd1.groupWith(rdd2, rdd3, rdd4).collect() assert(joined.size === 4) val joinedSet = joined.map(x => (x._1, @@ -480,7 +479,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("keys and values") { - val rdd = sc.parallelize(Array((1, "a"), (2, "b"))) + val rdd = sc.parallelize(Seq((1, "a"), (2, "b"))) assert(rdd.keys.collect().toList === List(1, 2)) assert(rdd.values.collect().toList === List("a", "b")) } @@ -496,8 +495,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("default partitioner uses largest partitioner") { - val a = sc.makeRDD(Array((1, "a"), (2, "b")), 2) - val b = sc.makeRDD(Array((1, "a"), (2, "b")), 2000) + val a = sc.makeRDD(Seq((1, "a"), (2, "b")), 2) + val b = sc.makeRDD(Seq((1, "a"), (2, "b")), 2000) val c = a.join(b) assert(c.partitions.size === 2000) } @@ -517,9 +516,9 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { def getPartition(key: Any) = key.asInstanceOf[Int] } // partitionBy so we have a narrow dependency - val a = sc.parallelize(Array((1, "a"), (2, "b"), (3, "c"))).partitionBy(p) 
+ val a = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c"))).partitionBy(p) // more partitions/no partitioner so a shuffle dependency - val b = sc.parallelize(Array((2, "b"), (3, "cc"), (4, "d")), 4) + val b = sc.parallelize(Seq((2, "b"), (3, "cc"), (4, "d")), 4) val c = a.subtract(b) assert(c.collect().toSet === Set((1, "a"), (3, "c"))) // Ideally we could keep the original partitioner... @@ -527,8 +526,8 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("subtractByKey") { - val a = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 2) - val b = sc.parallelize(Array((2, 20), (3, 30), (4, 40)), 4) + val a = sc.parallelize(Seq((1, "a"), (1, "a"), (2, "b"), (3, "c")), 2) + val b = sc.parallelize(Seq((2, 20), (3, 30), (4, 40)), 4) val c = a.subtractByKey(b) assert(c.collect().toSet === Set((1, "a"), (1, "a"))) assert(c.partitions.size === a.partitions.size) @@ -541,22 +540,22 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { def getPartition(key: Any) = key.asInstanceOf[Int] } // partitionBy so we have a narrow dependency - val a = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c"))).partitionBy(p) + val a = sc.parallelize(Seq((1, "a"), (1, "a"), (2, "b"), (3, "c"))).partitionBy(p) // more partitions/no partitioner so a shuffle dependency - val b = sc.parallelize(Array((2, "b"), (3, "cc"), (4, "d")), 4) + val b = sc.parallelize(Seq((2, "b"), (3, "cc"), (4, "d")), 4) val c = a.subtractByKey(b) assert(c.collect().toSet === Set((1, "a"), (1, "a"))) assert(c.partitioner.get === p) } test("foldByKey") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val sums = pairs.foldByKey(0)(_ + _).collect() assert(sums.toSet === Set((1, 7), (2, 1))) } test("foldByKey with mutable result type") { - val pairs = sc.parallelize(Array((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) + val pairs = 
sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val bufs = pairs.mapValues(v => ArrayBuffer(v)).cache() // Fold the values using in-place mutation val sums = bufs.foldByKey(new ArrayBuffer[Int])(_ ++= _).collect() @@ -571,7 +570,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("saveNewAPIHadoopFile should call setConf if format is configurable") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(1)))) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(1)))) // No error, non-configurable formats still work pairs.saveAsNewAPIHadoopFile[NewFakeFormat]("ignored") @@ -587,7 +586,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("The JobId on the driver and executors should be the same during the commit") { // Create more than one rdd to mimic stageId not equal to rddId - val pairs = sc.parallelize(Array((1, 2), (2, 3)), 2) + val pairs = sc.parallelize(Seq((1, 2), (2, 3)), 2) .map { p => (Integer.valueOf(p._1 + 1), Integer.valueOf(p._2 + 1)) } .filter { p => p._1 > 0 } pairs.saveAsNewAPIHadoopFile[YetAnotherFakeFormat]("ignored") @@ -595,7 +594,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("saveAsHadoopFile should respect configured output committers") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(1)))) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(1)))) val conf = new JobConf() conf.setOutputCommitter(classOf[FakeOutputCommitter]) @@ -607,7 +606,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("failure callbacks should be called before calling writer.close() in saveNewAPIHadoopFile") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(2))), 1) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(2))), 1) FakeWriterWithCallback.calledBy = "" FakeWriterWithCallback.exception = 
null @@ -622,7 +621,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("failure callbacks should be called before calling writer.close() in saveAsHadoopFile") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(2))), 1) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(2))), 1) val conf = new JobConf() FakeWriterWithCallback.calledBy = "" @@ -640,7 +639,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("saveAsNewAPIHadoopDataset should support invalid output paths when " + "there are no files to be committed to an absolute output location") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(2))), 1) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(2))), 1) def saveRddWithPath(path: String): Unit = { val job = NewJob.getInstance(new Configuration(sc.hadoopConfiguration)) @@ -668,7 +667,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { // for non-null invalid paths. 
test("saveAsHadoopDataset should respect empty output directory when " + "there are no files to be committed to an absolute output location") { - val pairs = sc.parallelize(Array((Integer.valueOf(1), Integer.valueOf(2))), 1) + val pairs = sc.parallelize(Seq((Integer.valueOf(1), Integer.valueOf(2))), 1) val conf = new JobConf() conf.setOutputKeyClass(classOf[Integer]) @@ -683,7 +682,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("lookup") { - val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7))) + val pairs = sc.parallelize(Seq((1, 2), (3, 4), (5, 6), (5, 7))) assert(pairs.partitioner === None) assert(pairs.lookup(1) === Seq(2)) @@ -693,7 +692,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("lookup with partitioner") { - val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7))) + val pairs = sc.parallelize(Seq((1, 2), (3, 4), (5, 6), (5, 7))) val p = new Partitioner { def numPartitions: Int = 2 @@ -709,7 +708,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { } test("lookup with bad partitioner") { - val pairs = sc.parallelize(Array((1, 2), (3, 4), (5, 6), (5, 7))) + val pairs = sc.parallelize(Seq((1, 2), (3, 4), (5, 6), (5, 7))) val p = new Partitioner { def numPartitions: Int = 2 diff --git a/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala index 424d9f825c465..10f4bbcf7f48b 100644 --- a/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/ParallelCollectionSplitSuite.scala @@ -22,7 +22,7 @@ import scala.collection.immutable.NumericRange import org.scalacheck.Arbitrary._ import org.scalacheck.Gen import org.scalacheck.Prop._ -import org.scalatest.prop.Checkers +import org.scalatestplus.scalacheck.Checkers import org.apache.spark.SparkFunSuite diff --git 
a/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala index cb0de1c6beb6b..da2ccbfae181f 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PartitionwiseSampledRDDSuite.scala @@ -25,7 +25,7 @@ class MockSampler extends RandomSampler[Long, Long] { private var s: Long = _ - override def setSeed(seed: Long) { + override def setSeed(seed: Long): Unit = { s = seed } diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index 69739a2e58481..2da2854dfbcb9 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -21,16 +21,18 @@ import java.io.File import scala.collection.JavaConverters._ import scala.collection.Map +import scala.concurrent.duration._ import scala.io.Codec import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapred.{FileSplit, JobConf, TextInputFormat} +import org.scalatest.concurrent.Eventually import org.apache.spark._ import org.apache.spark.util.Utils -class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { +class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val envCommand = if (Utils.isWindows) { "cmd.exe /C set" } else { @@ -100,11 +102,16 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { assert(result.collect().length === 0) - // collect stderr writer threads - val stderrWriterThread = Thread.getAllStackTraces.keySet().asScala - .find { _.getName.startsWith(PipedRDD.STDIN_WRITER_THREAD_PREFIX) } - - assert(stderrWriterThread.isEmpty) + // SPARK-29104 PipedRDD will invoke `stdinWriterThread.interrupt()` at task completion, + // and `obj.wait` will get InterruptedException. 
However, there exists a possibility + // which the thread termination gets delayed because the thread starts from `obj.wait()` + // with that exception. To prevent test flakiness, we need to use `eventually`. + eventually(timeout(10.seconds), interval(1.second)) { + // collect stdin writer threads + val stdinWriterThread = Thread.getAllStackTraces.keySet().asScala + .find { _.getName.startsWith(PipedRDD.STDIN_WRITER_THREAD_PREFIX) } + assert(stdinWriterThread.isEmpty) + } } test("advanced pipe") { @@ -131,7 +138,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { assert(c(6) === "3_") assert(c(7) === "4_") - val nums1 = sc.makeRDD(Array("a\t1", "b\t2", "a\t3", "b\t4"), 2) + val nums1 = sc.makeRDD(Seq("a\t1", "b\t2", "a\t3", "b\t4"), 2) val d = nums1.groupBy(str => str.split("\t")(0)). pipe(Seq("cat"), Map[String, String](), @@ -230,7 +237,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext { testExportInputFile("mapreduce_map_input_file") } - def testExportInputFile(varName: String) { + def testExportInputFile(varName: String): Unit = { assume(TestUtils.testCommandAvailable(envCommand)) val nums = new HadoopRDD(sc, new JobConf(), classOf[TextInputFormat], classOf[LongWritable], classOf[Text], 2) { diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDBarrierSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDBarrierSuite.scala index 2f6c4d6a42ea3..f048f95430138 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDBarrierSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDBarrierSuite.scala @@ -29,6 +29,15 @@ class RDDBarrierSuite extends SparkFunSuite with SharedSparkContext { assert(rdd2.isBarrier()) } + test("RDDBarrier mapPartitionsWithIndex") { + val rdd = sc.parallelize(1 to 12, 4) + assert(rdd.isBarrier() === false) + + val rdd2 = rdd.barrier().mapPartitionsWithIndex((index, iter) => Iterator(index)) + assert(rdd2.isBarrier()) + assert(rdd2.collect().toList === List(0, 1, 2, 3)) + } + test("create 
an RDDBarrier in the middle of a chain of RDDs") { val rdd = sc.parallelize(1 to 10, 4).map(x => x * 2) val rdd2 = rdd.barrier().mapPartitions(iter => iter).map(x => (x, x + 1)) diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 60e63bfd68625..18154d861a731 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -236,7 +236,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { } test("aggregate") { - val pairs = sc.makeRDD(Array(("a", 1), ("b", 2), ("a", 2), ("c", 5), ("a", 3))) + val pairs = sc.makeRDD(Seq(("a", 1), ("b", 2), ("a", 2), ("c", 5), ("a", 3))) type StringMap = HashMap[String, Int] val emptyMap = new StringMap { override def default(key: String): Int = 0 @@ -366,7 +366,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(math.abs(partitions1(1).length - 500) < initialPartitions) assert(repartitioned1.collect() === input) - def testSplitPartitions(input: Seq[Int], initialPartitions: Int, finalPartitions: Int) { + def testSplitPartitions(input: Seq[Int], initialPartitions: Int, finalPartitions: Int): Unit = { val data = sc.parallelize(input, initialPartitions) val repartitioned = data.repartition(finalPartitions) assert(repartitioned.partitions.size === finalPartitions) @@ -1099,7 +1099,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { override def index: Int = 0 }) override def getDependencies: Seq[Dependency[_]] = mutableDependencies - def addDependency(dep: Dependency[_]) { + def addDependency(dep: Dependency[_]): Unit = { mutableDependencies += dep } } diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala index 7f20206202cb9..d5f7d30a253fe 100644 --- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala 
+++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.internal.Logging class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers with Logging { test("sortByKey") { - val pairs = sc.parallelize(Array((1, 0), (2, 0), (0, 0), (3, 0)), 2) + val pairs = sc.parallelize(Seq((1, 0), (2, 0), (0, 0), (3, 0)), 2) assert(pairs.sortByKey().collect() === Array((0, 0), (1, 0), (2, 0), (3, 0))) } diff --git a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala index 5d7b973fbd9ac..7079b9ea8eadc 100644 --- a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala @@ -27,9 +27,9 @@ object ZippedPartitionsSuite { class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext { test("print sizes") { - val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2) - val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2) - val data3 = sc.makeRDD(Array(1.0, 2.0), 2) + val data1 = sc.makeRDD(Seq(1, 2, 3, 4), 2) + val data2 = sc.makeRDD(Seq("1", "2", "3", "4", "5", "6"), 2) + val data3 = sc.makeRDD(Seq(1.0, 2.0), 2) val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData) diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala new file mode 100644 index 0000000000000..7a05daa2ad715 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/resource/ResourceDiscoveryPluginSuite.scala @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import java.io.File +import java.nio.charset.StandardCharsets +import java.util.Optional +import java.util.UUID + +import scala.concurrent.duration._ + +import com.google.common.io.Files +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} + +import org.apache.spark._ +import org.apache.spark.TestUtils.createTempScriptWithExpectedOutput +import org.apache.spark.api.resource.ResourceDiscoveryPlugin +import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher +import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} +import org.apache.spark.resource.TestResourceIDs._ +import org.apache.spark.util.Utils + +class ResourceDiscoveryPluginSuite extends SparkFunSuite with LocalSparkContext { + + test("plugin initialization in non-local mode fpga and gpu") { + assume(!(Utils.isWindows)) + withTempDir { dir => + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[2,1,1024]") + .set(RESOURCES_DISCOVERY_PLUGIN, Seq(classOf[TestResourceDiscoveryPluginGPU].getName(), + classOf[TestResourceDiscoveryPluginFPGA].getName())) + .set(TestResourceDiscoveryPlugin.TEST_PATH_CONF, dir.getAbsolutePath()) + .set(WORKER_GPU_ID.amountConf, "2") + .set(TASK_GPU_ID.amountConf, "1") + .set(EXECUTOR_GPU_ID.amountConf, "1") + .set(SPARK_RESOURCES_DIR, 
dir.getName()) + .set(WORKER_FPGA_ID.amountConf, "2") + .set(TASK_FPGA_ID.amountConf, "1") + .set(EXECUTOR_FPGA_ID.amountConf, "1") + + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + + eventually(timeout(10.seconds), interval(100.millis)) { + val children = dir.listFiles() + assert(children != null) + assert(children.length >= 4) + val gpuFiles = children.filter(f => f.getName().contains(GPU)) + val fpgaFiles = children.filter(f => f.getName().contains(FPGA)) + assert(gpuFiles.length == 2) + assert(fpgaFiles.length == 2) + } + } + } + + test("single plugin gpu") { + assume(!(Utils.isWindows)) + withTempDir { dir => + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[2,1,1024]") + .set(RESOURCES_DISCOVERY_PLUGIN, Seq(classOf[TestResourceDiscoveryPluginGPU].getName())) + .set(TestResourceDiscoveryPlugin.TEST_PATH_CONF, dir.getAbsolutePath()) + .set(WORKER_GPU_ID.amountConf, "2") + .set(TASK_GPU_ID.amountConf, "1") + .set(EXECUTOR_GPU_ID.amountConf, "1") + .set(SPARK_RESOURCES_DIR, dir.getName()) + + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + + eventually(timeout(10.seconds), interval(100.millis)) { + val children = dir.listFiles() + assert(children != null) + assert(children.length >= 2) + val gpuFiles = children.filter(f => f.getName().contains(GPU)) + assert(gpuFiles.length == 2) + } + } + } + + test("multiple plugins with one empty") { + assume(!(Utils.isWindows)) + withTempDir { dir => + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[2,1,1024]") + .set(RESOURCES_DISCOVERY_PLUGIN, Seq(classOf[TestResourceDiscoveryPluginEmpty].getName(), + classOf[TestResourceDiscoveryPluginGPU].getName())) + .set(TestResourceDiscoveryPlugin.TEST_PATH_CONF, dir.getAbsolutePath()) + .set(WORKER_GPU_ID.amountConf, "2") + .set(TASK_GPU_ID.amountConf, "1") + .set(EXECUTOR_GPU_ID.amountConf, "1") 
+ .set(SPARK_RESOURCES_DIR, dir.getName()) + + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + + eventually(timeout(10.seconds), interval(100.millis)) { + val children = dir.listFiles() + assert(children != null) + assert(children.length >= 2) + val gpuFiles = children.filter(f => f.getName().contains(GPU)) + assert(gpuFiles.length == 2) + } + } + } + + test("empty plugin fallback to discovery script") { + assume(!(Utils.isWindows)) + withTempDir { dir => + val scriptPath = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", + """{"name": "gpu","addresses":["5", "6"]}""") + val conf = new SparkConf() + .setAppName(getClass().getName()) + .set(SparkLauncher.SPARK_MASTER, "local-cluster[2,1,1024]") + .set(RESOURCES_DISCOVERY_PLUGIN, Seq(classOf[TestResourceDiscoveryPluginEmpty].getName())) + .set(DRIVER_GPU_ID.discoveryScriptConf, scriptPath) + .set(DRIVER_GPU_ID.amountConf, "2") + .set(SPARK_RESOURCES_DIR, dir.getName()) + + sc = new SparkContext(conf) + TestUtils.waitUntilExecutorsUp(sc, 2, 10000) + + assert(sc.resources.size === 1) + assert(sc.resources.get(GPU).get.addresses === Array("5", "6")) + assert(sc.resources.get(GPU).get.name === "gpu") + } + } +} + +object TestResourceDiscoveryPlugin { + val TEST_PATH_CONF = "spark.nonLocalDiscoveryPlugin.path" + + def writeFile(conf: SparkConf, id: String): Unit = { + val path = conf.get(TEST_PATH_CONF) + val fileName = s"$id - ${UUID.randomUUID.toString}" + Files.write(id, new File(path, fileName), StandardCharsets.UTF_8) + } +} + +private class TestResourceDiscoveryPluginGPU extends ResourceDiscoveryPlugin { + + override def discoverResource( + request: ResourceRequest, + conf: SparkConf): Optional[ResourceInformation] = { + if (request.id.resourceName.equals(GPU)) { + TestResourceDiscoveryPlugin.writeFile(conf, request.id.resourceName) + Optional.of(new ResourceInformation(GPU, Array("0", "1", "2", "3"))) + } else { + Optional.empty() + } + } +} + +private class 
TestResourceDiscoveryPluginEmpty extends ResourceDiscoveryPlugin { + + override def discoverResource( + request: ResourceRequest, + conf: SparkConf): Optional[ResourceInformation] = { + Optional.empty() + } +} + +private class TestResourceDiscoveryPluginFPGA extends ResourceDiscoveryPlugin { + + override def discoverResource( + request: ResourceRequest, + conf: SparkConf): Optional[ResourceInformation] = { + if (request.id.resourceName.equals(FPGA)) { + TestResourceDiscoveryPlugin.writeFile(conf, request.id.resourceName) + Optional.of(new ResourceInformation(FPGA, Array("0", "1", "2", "3"))) + } else { + Optional.empty() + } + } +} diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala new file mode 100644 index 0000000000000..075260317284d --- /dev/null +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileManagerSuite.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.resource + +import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} +import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ + +class ResourceProfileManagerSuite extends SparkFunSuite { + + override def beforeAll() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.beforeAll() + } + } + + override def afterEach() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.afterEach() + } + } + + test("ResourceProfileManager") { + val conf = new SparkConf().set(EXECUTOR_CORES, 4) + val rpmanager = new ResourceProfileManager(conf) + val defaultProf = rpmanager.defaultResourceProfile + assert(defaultProf.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + assert(defaultProf.executorResources.size === 2, + "Executor resources should contain cores and memory by default") + assert(defaultProf.executorResources(ResourceProfile.CORES).amount === 4, + s"Executor resources should have 4 cores") + } + + test("isSupported yarn no dynamic allocation") { + val conf = new SparkConf().setMaster("yarn").set(EXECUTOR_CORES, 4) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + val error = intercept[SparkException] { + rpmanager.isSupported(immrprof) + }.getMessage() + + assert(error.contains("ResourceProfiles are only supported on YARN with dynamic allocation")) + } + + test("isSupported yarn with dynamic allocation") { + val conf = new SparkConf().setMaster("yarn").set(EXECUTOR_CORES, 4) + conf.set(DYN_ALLOCATION_ENABLED, true) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default 
profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + assert(rpmanager.isSupported(immrprof) == true) + } + + test("isSupported yarn with local mode") { + val conf = new SparkConf().setMaster("local").set(EXECUTOR_CORES, 4) + conf.set(RESOURCE_PROFILE_MANAGER_TESTING.key, "true") + val rpmanager = new ResourceProfileManager(conf) + // default profile should always work + val defaultProf = rpmanager.defaultResourceProfile + val rprof = new ResourceProfileBuilder() + val gpuExecReq = + new ExecutorResourceRequests().resource("gpu", 2, "someScript") + val immrprof = rprof.require(gpuExecReq).build + var error = intercept[SparkException] { + rpmanager.isSupported(immrprof) + }.getMessage() + + assert(error.contains("ResourceProfiles are only supported on YARN with dynamic allocation")) + } + + + +} diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala new file mode 100644 index 0000000000000..b2f2c3632e454 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.resource + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.internal.config.{EXECUTOR_CORES, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} +import org.apache.spark.internal.config.Python.PYSPARK_EXECUTOR_MEMORY +import org.apache.spark.resource.TestResourceIDs._ + +class ResourceProfileSuite extends SparkFunSuite { + + override def beforeAll() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.beforeAll() + } + } + + override def afterEach() { + try { + ResourceProfile.clearDefaultProfile() + } finally { + super.afterEach() + } + } + + test("Default ResourceProfile") { + val rprof = ResourceProfile.getOrCreateDefaultProfile(new SparkConf) + assert(rprof.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + assert(rprof.executorResources.size === 2, + "Executor resources should contain cores and memory by default") + assert(rprof.executorResources(ResourceProfile.CORES).amount === 1, + "Executor resources should have 1 core") + assert(rprof.getExecutorCores.get === 1, + "Executor resources should have 1 core") + assert(rprof.executorResources(ResourceProfile.MEMORY).amount === 1024, + "Executor resources should have 1024 memory") + assert(rprof.executorResources.get(ResourceProfile.PYSPARK_MEM) == None, + "pyspark memory empty if not specified") + assert(rprof.executorResources.get(ResourceProfile.OVERHEAD_MEM) == None, + "overhead memory empty if not specified") + assert(rprof.taskResources.size === 1, + "Task resources should just contain cpus by default") + 
assert(rprof.taskResources(ResourceProfile.CPUS).amount === 1, + "Task resources should have 1 cpu") + assert(rprof.getTaskCpus.get === 1, + "Task resources should have 1 cpu") + } + + test("Default ResourceProfile with app level resources specified") { + val conf = new SparkConf + conf.set(PYSPARK_EXECUTOR_MEMORY.key, "2g") + conf.set(EXECUTOR_MEMORY_OVERHEAD.key, "1g") + conf.set(EXECUTOR_MEMORY.key, "4g") + conf.set(EXECUTOR_CORES.key, "4") + conf.set(TASK_GPU_ID.amountConf, "1") + conf.set(EXECUTOR_GPU_ID.amountConf, "1") + conf.set(EXECUTOR_GPU_ID.discoveryScriptConf, "nameOfScript") + val rprof = ResourceProfile.getOrCreateDefaultProfile(conf) + assert(rprof.id === ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val execResources = rprof.executorResources + assert(execResources.size === 5, s"Executor resources should contain cores, pyspark " + + s"memory, memory overhead, memory, and gpu $execResources") + assert(execResources.contains("gpu"), "Executor resources should have gpu") + assert(rprof.executorResources(ResourceProfile.CORES).amount === 4, + "Executor resources should have 4 core") + assert(rprof.getExecutorCores.get === 4, + "Executor resources should have 4 core") + assert(rprof.executorResources(ResourceProfile.MEMORY).amount === 4096, + "Executor resources should have 1024 memory") + assert(rprof.executorResources(ResourceProfile.PYSPARK_MEM).amount == 2048, + "pyspark memory empty if not specified") + assert(rprof.executorResources(ResourceProfile.OVERHEAD_MEM).amount == 1024, + "overhead memory empty if not specified") + assert(rprof.taskResources.size === 2, + "Task resources should just contain cpus and gpu") + assert(rprof.taskResources.contains("gpu"), "Task resources should have gpu") + } + + test("test default profile task gpus fractional") { + val sparkConf = new SparkConf() + .set(EXECUTOR_GPU_ID.amountConf, "2") + .set(TASK_GPU_ID.amountConf, "0.33") + val immrprof = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + 
assert(immrprof.taskResources.get("gpu").get.amount == 0.33) + } + + test("maxTasksPerExecutor cpus") { + val sparkConf = new SparkConf() + .set(EXECUTOR_CORES, 1) + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == "cpus") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 1) + } + + test("maxTasksPerExecutor/limiting no executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing") + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == "gpu") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 2) + assert(immrprof.isCoresLimitKnown == false) + } + + test("maxTasksPerExecutor/limiting no other resource no executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing") + val immrprof = ResourceProfile.getOrCreateDefaultProfile(sparkConf) + assert(immrprof.limitingResource(sparkConf) == "") + assert(immrprof.maxTasksPerExecutor(sparkConf) == 1) + assert(immrprof.isCoresLimitKnown == false) + } + + test("maxTasksPerExecutor/limiting executor cores") { + val sparkConf = new SparkConf().setMaster("spark://testing").set(EXECUTOR_CORES, 2) + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val execReq = + new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(execReq) + val immrprof = new 
ResourceProfile(rprof.executorResources, rprof.taskResources) + assert(immrprof.limitingResource(sparkConf) == ResourceProfile.CPUS) + assert(immrprof.maxTasksPerExecutor(sparkConf) == 2) + assert(immrprof.isCoresLimitKnown == true) + } + + + test("Create ResourceProfile") { + val rprof = new ResourceProfileBuilder() + val taskReq = new TaskResourceRequests().resource("gpu", 1) + val eReq = new ExecutorResourceRequests().resource("gpu", 2, "myscript", "nvidia") + rprof.require(taskReq).require(eReq) + + assert(rprof.executorResources.size === 1) + assert(rprof.executorResources.contains("gpu"), + "Executor resources should have gpu") + assert(rprof.executorResources.get("gpu").get.vendor === "nvidia", + "gpu vendor should be nvidia") + assert(rprof.executorResources.get("gpu").get.discoveryScript === "myscript", + "discoveryScript should be myscript") + assert(rprof.executorResources.get("gpu").get.amount === 2, + "gpu amount should be 2") + + assert(rprof.taskResources.size === 1, "Should have 1 task resource") + assert(rprof.taskResources.contains("gpu"), "Task resources should have gpu") + assert(rprof.taskResources.get("gpu").get.amount === 1, + "Task resources should have 1 gpu") + + val ereqs = new ExecutorResourceRequests() + ereqs.cores(2).memory("4096") + ereqs.memoryOverhead("2048").pysparkMemory("1024") + val treqs = new TaskResourceRequests() + treqs.cpus(1) + + rprof.require(treqs) + rprof.require(ereqs) + + assert(rprof.executorResources.size === 5) + assert(rprof.executorResources(ResourceProfile.CORES).amount === 2, + "Executor resources should have 2 cores") + assert(rprof.executorResources(ResourceProfile.MEMORY).amount === 4096, + "Executor resources should have 4096 memory") + assert(rprof.executorResources(ResourceProfile.OVERHEAD_MEM).amount === 2048, + "Executor resources should have 2048 overhead memory") + assert(rprof.executorResources(ResourceProfile.PYSPARK_MEM).amount === 1024, + "Executor resources should have 1024 pyspark memory") + + 
assert(rprof.taskResources.size === 2) + assert(rprof.taskResources("cpus").amount === 1, "Task resources should have cpu") + } + + test("Test ExecutorResourceRequests memory helpers") { + val rprof = new ResourceProfileBuilder() + val ereqs = new ExecutorResourceRequests() + ereqs.memory("4g") + ereqs.memoryOverhead("2000m").pysparkMemory("512000k") + rprof.require(ereqs) + + assert(rprof.executorResources(ResourceProfile.MEMORY).amount === 4096, + "Executor resources should have 4096 memory") + assert(rprof.executorResources(ResourceProfile.OVERHEAD_MEM).amount === 2000, + "Executor resources should have 2000 overhead memory") + assert(rprof.executorResources(ResourceProfile.PYSPARK_MEM).amount === 500, + "Executor resources should have 512 pyspark memory") + } + + test("Test TaskResourceRequest fractional") { + val rprof = new ResourceProfileBuilder() + val treqs = new TaskResourceRequests().resource("gpu", 0.33) + rprof.require(treqs) + + assert(rprof.taskResources.size === 1, "Should have 1 task resource") + assert(rprof.taskResources.contains("gpu"), "Task resources should have gpu") + assert(rprof.taskResources.get("gpu").get.amount === 0.33, + "Task resources should have 0.33 gpu") + + val fpgaReqs = new TaskResourceRequests().resource("fpga", 4.0) + rprof.require(fpgaReqs) + + assert(rprof.taskResources.size === 2, "Should have 2 task resource") + assert(rprof.taskResources.contains("fpga"), "Task resources should have gpu") + assert(rprof.taskResources.get("fpga").get.amount === 4.0, + "Task resources should have 4.0 gpu") + + var taskError = intercept[AssertionError] { + rprof.require(new TaskResourceRequests().resource("gpu", 1.5)) + }.getMessage() + assert(taskError.contains("The resource amount 1.5 must be either <= 0.5, or a whole number.")) + + taskError = intercept[AssertionError] { + rprof.require(new TaskResourceRequests().resource("gpu", 0.7)) + }.getMessage() + assert(taskError.contains("The resource amount 0.7 must be either <= 0.5, or a whole 
number.")) + } +} diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index c2ecc96db906b..278a72a7192d8 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -19,14 +19,17 @@ package org.apache.spark.resource import java.io.File import java.nio.file.{Files => JavaFiles} +import java.util.Optional import org.json4s.{DefaultFormats, Extraction} import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkFunSuite} import org.apache.spark.TestUtils._ import org.apache.spark.internal.config._ +import org.apache.spark.internal.config.Tests._ import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ +import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.util.Utils class ResourceUtilsSuite extends SparkFunSuite @@ -35,7 +38,7 @@ class ResourceUtilsSuite extends SparkFunSuite test("ResourceID") { val componentName = "spark.test" val resourceName = "p100" - val id = ResourceID(componentName, resourceName) + val id = new ResourceID(componentName, resourceName) val confPrefix = s"$componentName.resource.$resourceName." 
assert(id.confPrefix === confPrefix) assert(id.amountConf === s"${confPrefix}amount") @@ -60,6 +63,20 @@ class ResourceUtilsSuite extends SparkFunSuite } } + test("Resource discoverer amount 0") { + val conf = new SparkConf + assume(!(Utils.isWindows)) + withTempDir { dir => + val scriptPath = createTempScriptWithExpectedOutput(dir, "gpuDiscoverScript", + """{"name": "gpu"}""") + conf.set(EXECUTOR_GPU_ID.amountConf, "0") + conf.set(EXECUTOR_GPU_ID.discoveryScriptConf, scriptPath) + + val res = getOrDiscoverAllResources(conf, SPARK_EXECUTOR_PREFIX, None) + assert(res.isEmpty) + } + } + test("Resource discoverer multiple resource types") { val conf = new SparkConf assume(!(Utils.isWindows)) @@ -74,19 +91,26 @@ class ResourceUtilsSuite extends SparkFunSuite conf.set(EXECUTOR_FPGA_ID.amountConf, "2") conf.set(EXECUTOR_FPGA_ID.discoveryScriptConf, fpgaDiscovery) + // test one with amount 0 to make sure ignored + val fooDiscovery = createTempScriptWithExpectedOutput(dir, "fooDiscoverScript", + """{"name": "foo", "addresses": ["f1", "f2", "f3"]}""") + val fooId = new ResourceID(SPARK_EXECUTOR_PREFIX, "foo") + conf.set(fooId.amountConf, "0") + conf.set(fooId.discoveryScriptConf, fooDiscovery) + val resources = getOrDiscoverAllResources(conf, SPARK_EXECUTOR_PREFIX, None) assert(resources.size === 2) val gpuValue = resources.get(GPU) assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") - assert(gpuValue.get.addresses.deep == Array("0", "1").deep, "should have 0,1 entries") + assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") val fpgaValue = resources.get(FPGA) assert(fpgaValue.nonEmpty, "Should have a gpu entry") assert(fpgaValue.get.name == "fpga", "name should be fpga") assert(fpgaValue.get.addresses.size == 3, "Should have 3 indexes") - assert(fpgaValue.get.addresses.deep == Array("f1", "f2", "f3").deep, + 
assert(fpgaValue.get.addresses.sameElements(Array("f1", "f2", "f3")), "should have f1,f2,f3 entries") } } @@ -107,7 +131,8 @@ class ResourceUtilsSuite extends SparkFunSuite assert(resourcesFromFileOnly(FPGA) === expectedFpgaInfo) val gpuDiscovery = createTempScriptWithExpectedOutput( - dir, "gpuDiscoveryScript", """{"name": "gpu", "addresses": ["0", "1"]}""") + dir, "gpuDiscoveryScript", + """{"name": "gpu", "addresses": ["0", "1"]}""") conf.set(EXECUTOR_GPU_ID.amountConf, "2") conf.set(EXECUTOR_GPU_ID.discoveryScriptConf, gpuDiscovery) val resourcesFromBoth = getOrDiscoverAllResources( @@ -118,6 +143,40 @@ class ResourceUtilsSuite extends SparkFunSuite } } + test("get from resources file and discover resource profile remaining") { + val conf = new SparkConf + val rpId = 1 + assume(!(Utils.isWindows)) + withTempDir { dir => + implicit val formats = DefaultFormats + val fpgaAddrs = Seq("f1", "f2", "f3") + val fpgaAllocation = ResourceAllocation(EXECUTOR_FPGA_ID, fpgaAddrs) + val resourcesFile = createTempJsonFile( + dir, "resources", Extraction.decompose(Seq(fpgaAllocation))) + val resourcesFromFileOnly = getOrDiscoverAllResourcesForResourceProfile( + Some(resourcesFile), + SPARK_EXECUTOR_PREFIX, + ResourceProfile.getOrCreateDefaultProfile(conf), + conf) + val expectedFpgaInfo = new ResourceInformation(FPGA, fpgaAddrs.toArray) + assert(resourcesFromFileOnly(FPGA) === expectedFpgaInfo) + + val gpuDiscovery = createTempScriptWithExpectedOutput( + dir, "gpuDiscoveryScript", + """{"name": "gpu", "addresses": ["0", "1"]}""") + val rpBuilder = new ResourceProfileBuilder() + val ereqs = new ExecutorResourceRequests().resource(GPU, 2, gpuDiscovery) + val treqs = new TaskResourceRequests().resource(GPU, 1) + + val rp = rpBuilder.require(ereqs).require(treqs).build + val resourcesFromBoth = getOrDiscoverAllResourcesForResourceProfile( + Some(resourcesFile), SPARK_EXECUTOR_PREFIX, rp, conf) + val expectedGpuInfo = new ResourceInformation(GPU, Array("0", "1")) + 
assert(resourcesFromBoth(FPGA) === expectedFpgaInfo) + assert(resourcesFromBoth(GPU) === expectedGpuInfo) + } + } + test("list resource ids") { val conf = new SparkConf conf.set(DRIVER_GPU_ID.amountConf, "2") @@ -127,7 +186,7 @@ class ResourceUtilsSuite extends SparkFunSuite conf.set(DRIVER_FPGA_ID.amountConf, "2") val resourcesMap = listResourceIds(conf, SPARK_DRIVER_PREFIX) - .map{ rId => (rId.resourceName, 1)}.toMap + .map { rId => (rId.resourceName, 1) }.toMap assert(resourcesMap.size === 2, "should only have GPU for resource") assert(resourcesMap.get(GPU).nonEmpty, "should have GPU") assert(resourcesMap.get(FPGA).nonEmpty, "should have FPGA") @@ -139,8 +198,8 @@ class ResourceUtilsSuite extends SparkFunSuite var request = parseResourceRequest(conf, DRIVER_GPU_ID) assert(request.id.resourceName === GPU, "should only have GPU for resource") assert(request.amount === 2, "GPU count should be 2") - assert(request.discoveryScript === None, "discovery script should be empty") - assert(request.vendor === None, "vendor should be empty") + assert(request.discoveryScript === Optional.empty(), "discovery script should be empty") + assert(request.vendor === Optional.empty(), "vendor should be empty") val vendor = "nvidia.com" val discoveryScript = "discoveryScriptGPU" @@ -175,7 +234,7 @@ class ResourceUtilsSuite extends SparkFunSuite assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") - assert(gpuValue.get.addresses.deep == Array("0", "1").deep, "should have 0,1 entries") + assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") } } @@ -186,14 +245,14 @@ class ResourceUtilsSuite extends SparkFunSuite val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", """{"name": "fpga", "addresses": ["0", "1"]}""") val request = - ResourceRequest( + new ResourceRequest( DRIVER_GPU_ID, 2, - 
Some(gpuDiscovery), - None) + Optional.of(gpuDiscovery), + Optional.empty[String]) val error = intercept[SparkException] { - discoverResource(request) + discoverResource(conf, request) }.getMessage() assert(error.contains(s"Error running the resource discovery script $gpuDiscovery: " + @@ -201,6 +260,28 @@ class ResourceUtilsSuite extends SparkFunSuite } } + test("Resource discoverer with invalid class") { + val conf = new SparkConf() + .set(RESOURCES_DISCOVERY_PLUGIN, Seq("someinvalidclass")) + assume(!(Utils.isWindows)) + withTempDir { dir => + val gpuDiscovery = createTempScriptWithExpectedOutput(dir, "gpuDiscoveryScript", + """{"name": "fpga", "addresses": ["0", "1"]}""") + val request = + new ResourceRequest( + DRIVER_GPU_ID, + 2, + Optional.of(gpuDiscovery), + Optional.empty[String]) + + val error = intercept[ClassNotFoundException] { + discoverResource(conf, request) + }.getMessage() + + assert(error.contains(s"someinvalidclass")) + } + } + test("Resource discoverer script returns invalid format") { val conf = new SparkConf assume(!(Utils.isWindows)) @@ -209,14 +290,14 @@ class ResourceUtilsSuite extends SparkFunSuite """{"addresses": ["0", "1"]}""") val request = - ResourceRequest( + new ResourceRequest( EXECUTOR_GPU_ID, 2, - Some(gpuDiscovery), - None) + Optional.of(gpuDiscovery), + Optional.empty[String]) val error = intercept[SparkException] { - discoverResource(request) + discoverResource(conf, request) }.getMessage() assert(error.contains("Error parsing JSON into ResourceInformation")) @@ -229,14 +310,14 @@ class ResourceUtilsSuite extends SparkFunSuite val file1 = new File(dir, "bogusfilepath") try { val request = - ResourceRequest( + new ResourceRequest( EXECUTOR_GPU_ID, 2, - Some(file1.getPath()), - None) + Optional.of(file1.getPath()), + Optional.empty[String]) val error = intercept[SparkException] { - discoverResource(request) + discoverResource(conf, request) }.getMessage() assert(error.contains("doesn't exist")) @@ -247,10 +328,11 @@ class 
ResourceUtilsSuite extends SparkFunSuite } test("gpu's specified but not a discovery script") { - val request = ResourceRequest(EXECUTOR_GPU_ID, 2, None, None) + val request = new ResourceRequest(EXECUTOR_GPU_ID, 2, Optional.empty[String], + Optional.empty[String]) val error = intercept[SparkException] { - discoverResource(request) + discoverResource(new SparkConf(), request) }.getMessage() assert(error.contains("User is expecting to use resource: gpu, but " + diff --git a/core/src/test/scala/org/apache/spark/resource/TestResourceIDs.scala b/core/src/test/scala/org/apache/spark/resource/TestResourceIDs.scala index c4509e93104d5..60246f5fad9a8 100644 --- a/core/src/test/scala/org/apache/spark/resource/TestResourceIDs.scala +++ b/core/src/test/scala/org/apache/spark/resource/TestResourceIDs.scala @@ -22,14 +22,14 @@ import org.apache.spark.internal.config.Worker.SPARK_WORKER_PREFIX import org.apache.spark.resource.ResourceUtils.{FPGA, GPU} object TestResourceIDs { - val DRIVER_GPU_ID = ResourceID(SPARK_DRIVER_PREFIX, GPU) - val EXECUTOR_GPU_ID = ResourceID(SPARK_EXECUTOR_PREFIX, GPU) - val TASK_GPU_ID = ResourceID(SPARK_TASK_PREFIX, GPU) - val WORKER_GPU_ID = ResourceID(SPARK_WORKER_PREFIX, GPU) + val DRIVER_GPU_ID = new ResourceID(SPARK_DRIVER_PREFIX, GPU) + val EXECUTOR_GPU_ID = new ResourceID(SPARK_EXECUTOR_PREFIX, GPU) + val TASK_GPU_ID = new ResourceID(SPARK_TASK_PREFIX, GPU) + val WORKER_GPU_ID = new ResourceID(SPARK_WORKER_PREFIX, GPU) - val DRIVER_FPGA_ID = ResourceID(SPARK_DRIVER_PREFIX, FPGA) - val EXECUTOR_FPGA_ID = ResourceID(SPARK_EXECUTOR_PREFIX, FPGA) - val TASK_FPGA_ID = ResourceID(SPARK_TASK_PREFIX, FPGA) - val WORKER_FPGA_ID = ResourceID(SPARK_WORKER_PREFIX, FPGA) + val DRIVER_FPGA_ID = new ResourceID(SPARK_DRIVER_PREFIX, FPGA) + val EXECUTOR_FPGA_ID = new ResourceID(SPARK_EXECUTOR_PREFIX, FPGA) + val TASK_FPGA_ID = new ResourceID(SPARK_TASK_PREFIX, FPGA) + val WORKER_FPGA_ID = new ResourceID(SPARK_WORKER_PREFIX, FPGA) } diff --git 
a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala index 5bdf71be35b3b..c10f2c244e133 100644 --- a/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/RpcEnvSuite.scala @@ -36,7 +36,6 @@ import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SecurityManager, SparkConf, SparkEnv, SparkException, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.config._ -import org.apache.spark.internal.config.Network import org.apache.spark.util.{ThreadUtils, Utils} /** @@ -409,7 +408,7 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { (0 until 10) foreach { _ => new Thread { - override def run() { + override def run(): Unit = { (0 until 100) foreach { _ => endpointRef.send("Hello") } @@ -954,6 +953,40 @@ abstract class RpcEnvSuite extends SparkFunSuite with BeforeAndAfterAll { verify(endpoint, never()).onDisconnected(any()) verify(endpoint, never()).onNetworkError(any(), any()) } + + test("isolated endpoints") { + val latch = new CountDownLatch(1) + val singleThreadedEnv = createRpcEnv( + new SparkConf().set(Network.RPC_NETTY_DISPATCHER_NUM_THREADS, 1), "singleThread", 0) + try { + val blockingEndpoint = singleThreadedEnv.setupEndpoint("blocking", new IsolatedRpcEndpoint { + override val rpcEnv: RpcEnv = singleThreadedEnv + + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case m => + latch.await() + context.reply(m) + } + }) + + val nonBlockingEndpoint = singleThreadedEnv.setupEndpoint("non-blocking", new RpcEndpoint { + override val rpcEnv: RpcEnv = singleThreadedEnv + + override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case m => context.reply(m) + } + }) + + val to = new RpcTimeout(5.seconds, "test-timeout") + val blockingFuture = blockingEndpoint.ask[String]("hi", to) + 
assert(nonBlockingEndpoint.askSync[String]("hello", to) === "hello") + latch.countDown() + assert(ThreadUtils.awaitResult(blockingFuture, 5.seconds) === "hi") + } finally { + latch.countDown() + singleThreadedEnv.shutdown() + } + } } class UnserializableClass diff --git a/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala b/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala index 5e8da3e205ab0..7c65f3b126e3d 100644 --- a/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala +++ b/core/src/test/scala/org/apache/spark/rpc/TestRpcEndpoint.scala @@ -20,6 +20,7 @@ package org.apache.spark.rpc import scala.collection.mutable.ArrayBuffer import org.scalactic.TripleEquals +import org.scalatest.Assertions._ class TestRpcEndpoint extends ThreadSafeRpcEndpoint with TripleEquals { diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala index e5539566e4b6f..c74c728b3e3f3 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala @@ -29,12 +29,9 @@ class InboxSuite extends SparkFunSuite { test("post") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - when(endpointRef.name).thenReturn("hello") - val dispatcher = mock(classOf[Dispatcher]) - val inbox = new Inbox(endpointRef, endpoint) + val inbox = new Inbox("name", endpoint) val message = OneWayMessage(null, "hi") inbox.post(message) inbox.process(dispatcher) @@ -51,10 +48,9 @@ class InboxSuite extends SparkFunSuite { test("post: with reply") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) val dispatcher = mock(classOf[Dispatcher]) - val inbox = new Inbox(endpointRef, endpoint) + val inbox = new Inbox("name", endpoint) val message = RpcMessage(null, "hi", null) inbox.post(message) inbox.process(dispatcher) @@ -65,13 +61,10 @@ class 
InboxSuite extends SparkFunSuite { test("post: multiple threads") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) - when(endpointRef.name).thenReturn("hello") - val dispatcher = mock(classOf[Dispatcher]) val numDroppedMessages = new AtomicInteger(0) - val inbox = new Inbox(endpointRef, endpoint) { + val inbox = new Inbox("name", endpoint) { override def onDrop(message: InboxMessage): Unit = { numDroppedMessages.incrementAndGet() } @@ -107,12 +100,10 @@ class InboxSuite extends SparkFunSuite { test("post: Associated") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) val dispatcher = mock(classOf[Dispatcher]) - val remoteAddress = RpcAddress("localhost", 11111) - val inbox = new Inbox(endpointRef, endpoint) + val inbox = new Inbox("name", endpoint) inbox.post(RemoteProcessConnected(remoteAddress)) inbox.process(dispatcher) @@ -121,12 +112,11 @@ class InboxSuite extends SparkFunSuite { test("post: Disassociated") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) val dispatcher = mock(classOf[Dispatcher]) val remoteAddress = RpcAddress("localhost", 11111) - val inbox = new Inbox(endpointRef, endpoint) + val inbox = new Inbox("name", endpoint) inbox.post(RemoteProcessDisconnected(remoteAddress)) inbox.process(dispatcher) @@ -135,13 +125,12 @@ class InboxSuite extends SparkFunSuite { test("post: AssociationError") { val endpoint = new TestRpcEndpoint - val endpointRef = mock(classOf[NettyRpcEndpointRef]) val dispatcher = mock(classOf[Dispatcher]) val remoteAddress = RpcAddress("localhost", 11111) val cause = new RuntimeException("Oops") - val inbox = new Inbox(endpointRef, endpoint) + val inbox = new Inbox("name", endpoint) inbox.post(RemoteProcessConnectionError(cause, remoteAddress)) inbox.process(dispatcher) diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala 
b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala index 59b4b706bbcdd..378d433cf44f8 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcEnvSuite.scala @@ -22,7 +22,7 @@ import java.util.concurrent.ExecutionException import scala.concurrent.duration._ import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.network.client.TransportClient diff --git a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala index 8d5f04ac7651a..fc8ac38479932 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BarrierTaskContextSuite.scala @@ -26,13 +26,18 @@ import org.apache.spark.internal.config.Tests.TEST_NO_STAGE_RETRY class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { - test("global sync by barrier() call") { + def initLocalClusterSparkContext(): Unit = { val conf = new SparkConf() // Init local cluster here so each barrier task runs in a separated process, thus `barrier()` // call is actually useful. 
.setMaster("local-cluster[4, 1, 1024]") .setAppName("test-cluster") + .set(TEST_NO_STAGE_RETRY, true) sc = new SparkContext(conf) + } + + test("global sync by barrier() call") { + initLocalClusterSparkContext() val rdd = sc.makeRDD(1 to 10, 4) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() @@ -48,10 +53,7 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { } test("support multiple barrier() call within a single task") { - val conf = new SparkConf() - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) + initLocalClusterSparkContext() val rdd = sc.makeRDD(1 to 10, 4) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() @@ -77,12 +79,8 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { } test("throw exception on barrier() call timeout") { - val conf = new SparkConf() - .set("spark.barrier.sync.timeout", "1") - .set(TEST_NO_STAGE_RETRY, true) - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) + initLocalClusterSparkContext() + sc.conf.set("spark.barrier.sync.timeout", "1") val rdd = sc.makeRDD(1 to 10, 4) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() @@ -102,12 +100,8 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { } test("throw exception if barrier() call doesn't happen on every task") { - val conf = new SparkConf() - .set("spark.barrier.sync.timeout", "1") - .set(TEST_NO_STAGE_RETRY, true) - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) + initLocalClusterSparkContext() + sc.conf.set("spark.barrier.sync.timeout", "1") val rdd = sc.makeRDD(1 to 10, 4) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() @@ -125,12 +119,8 @@ class BarrierTaskContextSuite extends SparkFunSuite with 
LocalSparkContext { } test("throw exception if the number of barrier() calls are not the same on every task") { - val conf = new SparkConf() - .set("spark.barrier.sync.timeout", "1") - .set(TEST_NO_STAGE_RETRY, true) - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) + initLocalClusterSparkContext() + sc.conf.set("spark.barrier.sync.timeout", "1") val rdd = sc.makeRDD(1 to 10, 4) val rdd2 = rdd.barrier().mapPartitions { it => val context = BarrierTaskContext.get() @@ -156,10 +146,7 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { assert(error.contains("within 1 second(s)")) } - - def testBarrierTaskKilled(sc: SparkContext, interruptOnCancel: Boolean): Unit = { - sc.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, interruptOnCancel.toString) - + def testBarrierTaskKilled(interruptOnKill: Boolean): Unit = { withTempDir { dir => val killedFlagFile = "barrier.task.killed" val rdd = sc.makeRDD(Seq(0, 1), 2) @@ -181,12 +168,15 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { val listener = new SparkListener { override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { - new Thread { - override def run: Unit = { - Thread.sleep(1000) - sc.killTaskAttempt(taskStart.taskInfo.taskId, interruptThread = false) - } - }.start() + val partitionId = taskStart.taskInfo.index + if (partitionId == 0) { + new Thread { + override def run: Unit = { + Thread.sleep(1000) + sc.killTaskAttempt(taskStart.taskInfo.taskId, interruptThread = interruptOnKill) + } + }.start() + } } } sc.addSparkListener(listener) @@ -201,15 +191,13 @@ class BarrierTaskContextSuite extends SparkFunSuite with LocalSparkContext { } } - test("barrier task killed") { - val conf = new SparkConf() - .set("spark.barrier.sync.timeout", "1") - .set(TEST_NO_STAGE_RETRY, true) - .setMaster("local-cluster[4, 1, 1024]") - .setAppName("test-cluster") - sc = new SparkContext(conf) + test("barrier 
task killed, no interrupt") { + initLocalClusterSparkContext() + testBarrierTaskKilled(interruptOnKill = false) + } - testBarrierTaskKilled(sc, true) - testBarrierTaskKilled(sc, false) + test("barrier task killed, interrupt") { + initLocalClusterSparkContext() + testBarrierTaskKilled(interruptOnKill = true) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index 0fe0e5b78233c..246d4b2f56ec9 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.scheduler -import scala.concurrent.duration._ - import org.apache.spark._ import org.apache.spark.internal.config import org.apache.spark.internal.config.Tests._ diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala index 93a88cc30a20c..a1671a58f0d9b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistTrackerSuite.scala @@ -21,7 +21,7 @@ import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.{never, verify, when} import org.mockito.invocation.InvocationOnMock import org.scalatest.BeforeAndAfterEach -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.internal.config @@ -437,7 +437,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M } test("check blacklist configuration invariants") { - val conf = new SparkConf().setMaster("yarn-cluster") + val conf = new SparkConf().setMaster("yarn").set(config.SUBMIT_DEPLOY_MODE, "cluster") Seq( (2, 2), (2, 3) diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index 3edbbeb9c08f1..7666c6c7810cc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -20,7 +20,6 @@ package org.apache.spark.scheduler import java.util.Properties import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.immutable import scala.collection.mutable import scala.concurrent.duration._ import scala.language.postfixOps @@ -29,13 +28,13 @@ import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.when import org.mockito.invocation.InvocationOnMock import org.scalatest.concurrent.Eventually -import org.scalatest.mockito.MockitoSugar._ +import org.scalatestplus.mockito.MockitoSugar._ import org.apache.spark._ import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Network.RPC_MESSAGE_MAX_SIZE import org.apache.spark.rdd.RDD -import org.apache.spark.resource.ResourceInformation +import org.apache.spark.resource.{ExecutorResourceRequests, ResourceInformation, ResourceProfile, TaskResourceRequests} import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ import org.apache.spark.rpc.{RpcAddress, RpcEndpointRef, RpcEnv} @@ -174,22 +173,24 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo sc.addSparkListener(listener) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("1", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, Map.empty)) + RegisterExecutor("1", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("2", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, 
Map.empty)) + RegisterExecutor("2", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("3", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, Map.empty)) + RegisterExecutor("3", mockEndpointRef, mockAddress.host, 1, logUrls, attributes, + Map.empty, ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) sc.listenerBus.waitUntilEmpty(executorUpTimeout.toMillis) assert(executorAddedCount === 3) } test("extra resources from executor") { - import TestUtils._ - val conf = new SparkConf() - .set(EXECUTOR_CORES, 3) + .set(EXECUTOR_CORES, 1) .set(SCHEDULER_REVIVE_INTERVAL.key, "1m") // don't let it auto revive during test + .set(EXECUTOR_INSTANCES, 0) // avoid errors about duplicate executor registrations .setMaster( "coarseclustermanager[org.apache.spark.scheduler.TestCoarseGrainedSchedulerBackend]") .setAppName("test") @@ -197,6 +198,11 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo conf.set(EXECUTOR_GPU_ID.amountConf, "1") sc = new SparkContext(conf) + val execGpu = new ExecutorResourceRequests().cores(1).resource(GPU, 3) + val taskGpu = new TaskResourceRequests().cpus(1).resource(GPU, 1) + val rp = new ResourceProfile(execGpu.requests, taskGpu.requests) + sc.resourceProfileManager.addResourceProfile(rp) + assert(rp.id > ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val backend = sc.schedulerBackend.asInstanceOf[TestCoarseGrainedSchedulerBackend] val mockEndpointRef = mock[RpcEndpointRef] val mockAddress = mock[RpcAddress] @@ -214,20 +220,25 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo sc.addSparkListener(listener) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("1", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources)) + RegisterExecutor("1", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources, + 
ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("2", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources)) + RegisterExecutor("2", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources, + ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) backend.driverEndpoint.askSync[Boolean]( - RegisterExecutor("3", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources)) + RegisterExecutor("3", mockEndpointRef, mockAddress.host, 1, Map.empty, Map.empty, resources, + rp.id)) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val bytebuffer = java.nio.ByteBuffer.allocate(frameSize - 100) val buffer = new SerializableBuffer(bytebuffer) var execResources = backend.getExecutorAvailableResources("1") - assert(execResources(GPU).availableAddrs.sorted === Array("0", "1", "3")) + var exec3ResourceProfileId = backend.getExecutorResourceProfileId("3") + assert(exec3ResourceProfileId === rp.id) + val taskResources = Map(GPU -> new ResourceInformation(GPU, Array("0"))) var taskDescs: Seq[Seq[TaskDescription]] = Seq(Seq(new TaskDescription(1, 0, "1", "t1", 0, 1, mutable.Map.empty[String, Long], mutable.Map.empty[String, Long], @@ -283,6 +294,7 @@ private class CSMockExternalClusterManager extends ExternalClusterManager { when(ts.applicationAttemptId()).thenReturn(Some("attempt1")) when(ts.schedulingMode).thenReturn(SchedulingMode.FIFO) when(ts.nodeBlacklist()).thenReturn(Set.empty[String]) + when(ts.resourcesReqsPerTask).thenReturn(Seq.empty) ts } diff --git a/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala b/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala index 1be2e2a067115..46e5e6f97b1f1 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CustomShuffledRDD.scala @@ -111,7 +111,7 @@ class CustomShuffledRDD[K, V, C]( .asInstanceOf[Iterator[(K, C)]] } - 
override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() dependency = null } diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 2b3423f9a4d40..e40b63fe13cb1 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -19,13 +19,14 @@ package org.apache.spark.scheduler import java.util.Properties import java.util.concurrent.{CountDownLatch, TimeUnit} -import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} +import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger, AtomicLong, AtomicReference} import scala.annotation.meta.param import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map} import scala.util.control.NonFatal import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} +import org.scalatest.exceptions.TestFailedException import org.scalatest.time.SpanSugar._ import org.apache.spark._ @@ -36,7 +37,7 @@ import org.apache.spark.rdd.{DeterministicLevel, RDD} import org.apache.spark.scheduler.SchedulingMode.SchedulingMode import org.apache.spark.shuffle.{FetchFailedException, MetadataFetchFailedException} import org.apache.spark.storage.{BlockId, BlockManagerId, BlockManagerMaster} -import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, CallSite, LongAccumulator, Utils} +import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, CallSite, LongAccumulator, ThreadUtils, Utils} class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler) extends DAGSchedulerEventProcessLoop(dagScheduler) { @@ -150,7 +151,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi taskSet.tasks.foreach(_.epoch = mapOutputTracker.getEpoch) taskSets += taskSet } - override def cancelTasks(stageId: Int, interruptThread: Boolean) { + override 
def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = { cancelledStages += stageId } override def killTaskAttempt( @@ -166,39 +167,72 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } override def setDAGScheduler(dagScheduler: DAGScheduler) = {} override def defaultParallelism() = 2 + override def executorDecommission(executorId: String) = {} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None } - /** Length of time to wait while draining listener events. */ - val WAIT_TIMEOUT_MILLIS = 10000 - - val submittedStageInfos = new HashSet[StageInfo] - val successfulStages = new HashSet[Int] - val failedStages = new ArrayBuffer[Int] - val stageByOrderOfExecution = new ArrayBuffer[Int] - val endedTasks = new HashSet[Long] - val sparkListener = new SparkListener() { - override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted) { - submittedStageInfos += stageSubmitted.stageInfo + /** + * Listeners which records some information to verify in UTs. Getter-kind methods in this class + * ensures the value is returned after ensuring there's no event to process, as well as the + * value is immutable: prevent showing odd result by race condition. 
+ */ + class EventInfoRecordingListener extends SparkListener { + private val _submittedStageInfos = new HashSet[StageInfo] + private val _successfulStages = new HashSet[Int] + private val _failedStages = new ArrayBuffer[Int] + private val _stageByOrderOfExecution = new ArrayBuffer[Int] + private val _endedTasks = new HashSet[Long] + + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + _submittedStageInfos += stageSubmitted.stageInfo } - override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) { + override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { val stageInfo = stageCompleted.stageInfo - stageByOrderOfExecution += stageInfo.stageId + _stageByOrderOfExecution += stageInfo.stageId if (stageInfo.failureReason.isEmpty) { - successfulStages += stageInfo.stageId + _successfulStages += stageInfo.stageId } else { - failedStages += stageInfo.stageId + _failedStages += stageInfo.stageId } } override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { - endedTasks += taskEnd.taskInfo.taskId + _endedTasks += taskEnd.taskInfo.taskId + } + + def submittedStageInfos: Set[StageInfo] = { + waitForListeners() + _submittedStageInfos.toSet + } + + def successfulStages: Set[Int] = { + waitForListeners() + _successfulStages.toSet } + + def failedStages: List[Int] = { + waitForListeners() + _failedStages.toList + } + + def stageByOrderOfExecution: List[Int] = { + waitForListeners() + _stageByOrderOfExecution.toList + } + + def endedTasks: Set[Long] = { + waitForListeners() + _endedTasks.toSet + } + + private def waitForListeners(): Unit = sc.listenerBus.waitUntilEmpty() } + var sparkListener: EventInfoRecordingListener = null + var mapOutputTracker: MapOutputTrackerMaster = null var broadcastManager: BroadcastManager = null var securityMgr: SecurityManager = null @@ -212,14 +246,14 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi */ val cacheLocations = new 
HashMap[(Int, Int), Seq[BlockManagerId]] // stub out BlockManagerMaster.getLocations to use our cacheLocations - val blockManagerMaster = new BlockManagerMaster(null, conf, true) { + val blockManagerMaster = new BlockManagerMaster(null, null, conf, true) { override def getLocations(blockIds: Array[BlockId]): IndexedSeq[Seq[BlockManagerId]] = { blockIds.map { _.asRDDId.map(id => (id.rddId -> id.splitIndex)).flatMap(key => cacheLocations.get(key)). getOrElse(Seq()) }.toIndexedSeq } - override def removeExecutor(execId: String) { + override def removeExecutor(execId: String): Unit = { // don't need to propagate to the driver, which we don't have } } @@ -247,10 +281,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi private def init(testConf: SparkConf): Unit = { sc = new SparkContext("local[2]", "DAGSchedulerSuite", testConf) - submittedStageInfos.clear() - successfulStages.clear() - failedStages.clear() - endedTasks.clear() + sparkListener = new EventInfoRecordingListener failure = null sc.addSparkListener(sparkListener) taskSets.clear() @@ -286,7 +317,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } } - override def afterAll() { + override def afterAll(): Unit = { super.afterAll() } @@ -303,7 +334,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi * After processing the event, submit waiting stages as is done on most iterations of the * DAGScheduler event loop. */ - private def runEvent(event: DAGSchedulerEvent) { + private def runEvent(event: DAGSchedulerEvent): Unit = { dagEventProcessLoopTester.post(event) } @@ -316,7 +347,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi it.next.asInstanceOf[Tuple2[_, _]]._1 /** Send the given CompletionEvent messages for the tasks in the TaskSet. 
*/ - private def complete(taskSet: TaskSet, results: Seq[(TaskEndReason, Any)]) { + private def complete(taskSet: TaskSet, results: Seq[(TaskEndReason, Any)]): Unit = { assert(taskSet.tasks.size >= results.size) for ((result, i) <- results.zipWithIndex) { if (i < taskSet.tasks.size) { @@ -328,7 +359,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi private def completeWithAccumulator( accumId: Long, taskSet: TaskSet, - results: Seq[(TaskEndReason, Any)]) { + results: Seq[(TaskEndReason, Any)]): Unit = { assert(taskSet.tasks.size >= results.size) for ((result, i) <- results.zipWithIndex) { if (i < taskSet.tasks.size) { @@ -363,19 +394,18 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } /** Sends TaskSetFailed to the scheduler. */ - private def failed(taskSet: TaskSet, message: String) { + private def failed(taskSet: TaskSet, message: String): Unit = { runEvent(TaskSetFailed(taskSet, message, None)) } /** Sends JobCancelled to the DAG scheduler. 
*/ - private def cancel(jobId: Int) { + private def cancel(jobId: Int): Unit = { runEvent(JobCancelled(jobId, None)) } test("[SPARK-3353] parent stage should have lower stage id") { - stageByOrderOfExecution.clear() sc.parallelize(1 to 10).map(x => (x, x)).reduceByKey(_ + _, 4).count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + val stageByOrderOfExecution = sparkListener.stageByOrderOfExecution assert(stageByOrderOfExecution.length === 2) assert(stageByOrderOfExecution(0) < stageByOrderOfExecution(1)) } @@ -455,18 +485,22 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // map stage1 completes successfully, with one task on each executor complete(taskSets(0), Seq( (Success, - MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2))), + MapStatus( + BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2), mapTaskId = 5)), (Success, - MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2))), - (Success, makeMapStatus("hostB", 1)) + MapStatus( + BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2), mapTaskId = 6)), + (Success, makeMapStatus("hostB", 1, mapTaskId = 7)) )) // map stage2 completes successfully, with one task on each executor complete(taskSets(1), Seq( (Success, - MapStatus(BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2))), + MapStatus( + BlockManagerId("exec-hostA1", "hostA", 12345), Array.fill[Long](1)(2), mapTaskId = 8)), (Success, - MapStatus(BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2))), - (Success, makeMapStatus("hostB", 1)) + MapStatus( + BlockManagerId("exec-hostA2", "hostA", 12345), Array.fill[Long](1)(2), mapTaskId = 9)), + (Success, makeMapStatus("hostB", 1, mapTaskId = 10)) )) // make sure our test setup is correct val initialMapStatus1 = mapOutputTracker.shuffleStatuses(firstShuffleId).mapStatuses @@ -474,16 +508,19 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with TimeLi assert(initialMapStatus1.count(_ != null) === 3) assert(initialMapStatus1.map{_.location.executorId}.toSet === Set("exec-hostA1", "exec-hostA2", "exec-hostB")) + assert(initialMapStatus1.map{_.mapId}.toSet === Set(5, 6, 7)) val initialMapStatus2 = mapOutputTracker.shuffleStatuses(secondShuffleId).mapStatuses // val initialMapStatus1 = mapOutputTracker.mapStatuses.get(0).get assert(initialMapStatus2.count(_ != null) === 3) assert(initialMapStatus2.map{_.location.executorId}.toSet === Set("exec-hostA1", "exec-hostA2", "exec-hostB")) + assert(initialMapStatus2.map{_.mapId}.toSet === Set(8, 9, 10)) // reduce stage fails with a fetch failure from one host complete(taskSets(2), Seq( - (FetchFailed(BlockManagerId("exec-hostA2", "hostA", 12345), firstShuffleId, 0, 0, "ignored"), + (FetchFailed(BlockManagerId("exec-hostA2", "hostA", 12345), + firstShuffleId, 0L, 0, 0, "ignored"), null) )) @@ -618,9 +655,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi submit(unserializableRdd, Array(0)) assert(failure.getMessage.startsWith( "Job aborted due to stage failure: Task not serializable:")) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.contains(0)) - assert(failedStages.size === 1) + assert(sparkListener.failedStages === Seq(0)) assertDataStructuresEmpty() } @@ -628,9 +663,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi submit(new MyRDD(sc, 1, Nil), Array(0)) failed(taskSets(0), "some failure") assert(failure.getMessage === "Job aborted due to stage failure: some failure") - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.contains(0)) - assert(failedStages.size === 1) + assert(sparkListener.failedStages === Seq(0)) assertDataStructuresEmpty() } @@ -639,9 +672,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val jobId = submit(rdd, Array(0)) cancel(jobId) assert(failure.getMessage === 
s"Job $jobId cancelled ") - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.contains(0)) - assert(failedStages.size === 1) + assert(sparkListener.failedStages === Seq(0)) assertDataStructuresEmpty() } @@ -656,7 +687,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi override def submitTasks(taskSet: TaskSet): Unit = { taskSets += taskSet } - override def cancelTasks(stageId: Int, interruptThread: Boolean) { + override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = { throw new UnsupportedOperationException } override def killTaskAttempt( @@ -677,6 +708,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], blockManagerId: BlockManagerId, executorUpdates: Map[(Int, Int), ExecutorMetrics]): Boolean = true + override def executorDecommission(executorId: String): Unit = {} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None @@ -699,9 +731,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(results === Map(0 -> 42)) assertDataStructuresEmpty() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.isEmpty) - assert(successfulStages.contains(0)) + assert(sparkListener.failedStages.isEmpty) + assert(sparkListener.successfulStages.contains(0)) } test("run trivial shuffle") { @@ -732,7 +763,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // the 2nd ResultTask failed complete(taskSets(1), Seq( (Success, 42), - (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null))) // this will get called // 
blockManagerMaster.removeExecutor("exec-hostA") // ask the scheduler to try it again @@ -788,14 +819,49 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } } + test("SPARK-28967 properties must be cloned before posting to listener bus for 0 partition") { + val properties = new Properties() + val func = (context: TaskContext, it: Iterator[(_)]) => 1 + val resultHandler = (taskIndex: Int, result: Int) => {} + val assertionError = new AtomicReference[TestFailedException]( + new TestFailedException("Listener didn't receive expected JobStart event", 0)) + val listener = new SparkListener() { + override def onJobStart(event: SparkListenerJobStart): Unit = { + try { + // spark.job.description can be implicitly set for 0 partition jobs. + // So event.properties and properties can be different. See SPARK-29997. + event.properties.remove(SparkContext.SPARK_JOB_DESCRIPTION) + properties.remove(SparkContext.SPARK_JOB_DESCRIPTION) + + assert(event.properties.equals(properties), "Expected same content of properties, " + + s"but got properties with different content. 
props in caller ${properties} /" + + s" props in event ${event.properties}") + assert(event.properties.ne(properties), "Expected instance with different identity, " + + "but got same instance.") + assertionError.set(null) + } catch { + case e: TestFailedException => assertionError.set(e) + } + } + } + sc.addSparkListener(listener) + + // 0 partition + val testRdd = new MyRDD(sc, 0, Nil) + val waiter = scheduler.submitJob(testRdd, func, Seq.empty, CallSite.empty, + resultHandler, properties) + sc.listenerBus.waitUntilEmpty() + assert(assertionError.get() === null) + } + // Helper function to validate state when creating tests for task failures - private def checkStageId(stageId: Int, attempt: Int, stageAttempt: TaskSet) { + private def checkStageId(stageId: Int, attempt: Int, stageAttempt: TaskSet): Unit = { assert(stageAttempt.stageId === stageId) assert(stageAttempt.stageAttemptId == attempt) } // Helper functions to extract commonly used code in Fetch Failure test cases - private def setupStageAbortTest(sc: SparkContext) { + private def setupStageAbortTest(sc: SparkContext): Unit = { sc.listenerBus.addToSharedQueue(new EndListener()) ended = false jobResult = null @@ -849,7 +915,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val stageAttempt = taskSets.last checkStageId(stageId, attemptIdx, stageAttempt) complete(stageAttempt, stageAttempt.tasks.zipWithIndex.map { case (task, idx) => - (FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0, idx, "ignored"), null) + (FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0L, 0, idx, "ignored"), null) }.toSeq) } @@ -902,7 +968,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi completeNextResultStageWithSuccess(1, 1) // Confirm job finished successfully - sc.listenerBus.waitUntilEmpty(1000) + sc.listenerBus.waitUntilEmpty() assert(ended) assert(results === (0 until parts).map { idx => idx -> 42 }.toMap) 
assertDataStructuresEmpty() @@ -939,7 +1005,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } else { // Stage should have been aborted and removed from running stages assertDataStructuresEmpty() - sc.listenerBus.waitUntilEmpty(1000) + sc.listenerBus.waitUntilEmpty() assert(ended) jobResult match { case JobFailed(reason) => @@ -1061,7 +1127,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi completeNextResultStageWithSuccess(2, 1) assertDataStructuresEmpty() - sc.listenerBus.waitUntilEmpty(1000) + sc.listenerBus.waitUntilEmpty() assert(ended) assert(results === Map(0 -> 42)) } @@ -1082,19 +1148,17 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The first result task fails, with a fetch failure for the output from the first mapper. runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.contains(1)) + assert(sparkListener.failedStages.contains(1)) // The second ResultTask fails, with a fetch failure for the output from the second mapper. runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 1, 1, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 1L, 1, 1, "ignored"), null)) // The SparkListener should not receive redundant failure events. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.size == 1) + assert(sparkListener.failedStages.size === 1) } test("Retry all the tasks on a resubmitted attempt of a barrier stage caused by FetchFailure") { @@ -1111,7 +1175,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The first result task fails, with a fetch failure for the output from the first mapper. 
runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) assert(mapOutputTracker.findMissingPartitions(shuffleId) === Some(Seq(0, 1))) @@ -1122,7 +1186,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Complete the result stage. completeNextResultStageWithSuccess(1, 1) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() assertDataStructuresEmpty() } @@ -1141,7 +1205,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi taskSets(0).tasks(1), TaskKilled("test"), null)) - assert(failedStages === Seq(0)) + assert(sparkListener.failedStages === Seq(0)) assert(mapOutputTracker.findMissingPartitions(shuffleId) === Some(Seq(0, 1))) scheduler.resubmitFailedStages() @@ -1151,7 +1215,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Complete the result stage. completeNextResultStageWithSuccess(1, 0) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() assertDataStructuresEmpty() } @@ -1177,7 +1241,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi null)) // Assert the stage has been cancelled. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() assert(failure.getMessage.startsWith("Job aborted due to stage failure: Could not recover " + "from a failed barrier ResultStage.")) } @@ -1195,11 +1259,10 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val mapStageId = 0 def countSubmittedMapStageAttempts(): Int = { - submittedStageInfos.count(_.stageId == mapStageId) + sparkListener.submittedStageInfos.count(_.stageId == mapStageId) } // The map stage should have been submitted. 
- sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 1) complete(taskSets(0), Seq( @@ -1214,14 +1277,12 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The first result task fails, with a fetch failure for the output from the first mapper. runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.contains(1)) + assert(sparkListener.failedStages.contains(1)) // Trigger resubmission of the failed map stage. runEvent(ResubmitFailedStages) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) // Another attempt for the map stage should have been submitted, resulting in 2 total attempts. assert(countSubmittedMapStageAttempts() === 2) @@ -1229,7 +1290,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The second ResultTask fails, with a fetch failure for the output from the second mapper. runEvent(makeCompletionEvent( taskSets(1).tasks(1), - FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1, 1, "ignored"), + FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1L, 1, 1, "ignored"), null)) // Another ResubmitFailedStages event should not result in another attempt for the map @@ -1238,7 +1299,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // shouldn't effect anything -- our calling it just makes *SURE* it gets called between the // desired event and our check. 
runEvent(ResubmitFailedStages) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 2) } @@ -1256,14 +1316,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi submit(reduceRdd, Array(0, 1)) def countSubmittedReduceStageAttempts(): Int = { - submittedStageInfos.count(_.stageId == 1) + sparkListener.submittedStageInfos.count(_.stageId == 1) } def countSubmittedMapStageAttempts(): Int = { - submittedStageInfos.count(_.stageId == 0) + sparkListener.submittedStageInfos.count(_.stageId == 0) } // The map stage should have been submitted. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 1) // Complete the map stage. @@ -1272,13 +1331,12 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi (Success, makeMapStatus("hostB", 2)))) // The reduce stage should have been submitted. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedReduceStageAttempts() === 1) // The first result task fails, with a fetch failure for the output from the first mapper. runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) // Trigger resubmission of the failed map stage and finish the re-started map task. @@ -1287,14 +1345,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Because the map stage finished, another attempt for the reduce stage should have been // submitted, resulting in 2 total attempts for each the map and the reduce stage. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 2) assert(countSubmittedReduceStageAttempts() === 2) // A late FetchFailed arrives from the second task in the original reduce stage. 
runEvent(makeCompletionEvent( taskSets(1).tasks(1), - FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1, 1, "ignored"), + FetchFailed(makeBlockManagerId("hostB"), shuffleId, 1L, 1, 1, "ignored"), null)) // Running ResubmitFailedStages shouldn't result in any more attempts for the map stage, because @@ -1317,10 +1374,9 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi runEvent(makeCompletionEvent( taskSets(0).tasks(1), Success, 42, Seq.empty, Array.empty, createFakeTaskInfoWithId(1))) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) // verify stage exists assert(scheduler.stageIdToStage.contains(0)) - assert(endedTasks.size == 2) + assert(sparkListener.endedTasks.size === 2) // finish other 2 tasks runEvent(makeCompletionEvent( @@ -1329,8 +1385,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi runEvent(makeCompletionEvent( taskSets(0).tasks(3), Success, 42, Seq.empty, Array.empty, createFakeTaskInfoWithId(3))) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(endedTasks.size == 4) + assert(sparkListener.endedTasks.size === 4) // verify the stage is done assert(!scheduler.stageIdToStage.contains(0)) @@ -1340,15 +1395,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi runEvent(makeCompletionEvent( taskSets(0).tasks(3), Success, 42, Seq.empty, Array.empty, createFakeTaskInfoWithId(5))) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(endedTasks.size == 5) + assert(sparkListener.endedTasks.size === 5) // make sure non successful tasks also send out event runEvent(makeCompletionEvent( taskSets(0).tasks(3), UnknownReason, 42, Seq.empty, Array.empty, createFakeTaskInfoWithId(6))) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(endedTasks.size == 6) + assert(sparkListener.endedTasks.size === 6) } test("ignore late map task completions") { @@ -1421,8 +1474,7 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with TimeLi // Listener bus should get told about the map stage failing, but not the reduce stage // (since the reduce stage hasn't been started yet). - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(failedStages.toSet === Set(0)) + assert(sparkListener.failedStages.toSet === Set(0)) assertDataStructuresEmpty() } @@ -1494,7 +1546,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi runEvent(ExecutorLost("exec-hostA", ExecutorKilled)) runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(null, firstShuffleId, 2, 0, "Fetch failed"), + FetchFailed(null, firstShuffleId, 2L, 2, 0, "Fetch failed"), null)) // so we resubmit stage 0, which completes happily @@ -1650,7 +1702,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // listener for all jobs, and here we want to capture the failure for each job separately. class FailureRecordingJobListener() extends JobListener { var failureMessage: String = _ - override def taskSucceeded(index: Int, result: Any) {} + override def taskSucceeded(index: Int, result: Any): Unit = {} override def jobFailed(exception: Exception): Unit = { failureMessage = exception.getMessage } } val listener1 = new FailureRecordingJobListener() @@ -1665,9 +1717,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(cancelledStages.toSet === Set(0, 2)) // Make sure the listeners got told about both failed stages. 
- sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) - assert(successfulStages.isEmpty) - assert(failedStages.toSet === Set(0, 2)) + assert(sparkListener.successfulStages.isEmpty) + assert(sparkListener.failedStages.toSet === Set(0, 2)) assert(listener1.failureMessage === s"Job aborted due to stage failure: $stageFailureMessage") assert(listener2.failureMessage === s"Job aborted due to stage failure: $stageFailureMessage") @@ -1754,7 +1805,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // lets say there is a fetch failure in this task set, which makes us go back and // run stage 0, attempt 1 complete(taskSets(1), Seq( - (FetchFailed(makeBlockManagerId("hostA"), shuffleDep1.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), + shuffleDep1.shuffleId, 0L, 0, 0, "ignored"), null))) scheduler.resubmitFailedStages() // stage 0, attempt 1 should have the properties of job2 @@ -1835,7 +1887,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi (Success, makeMapStatus("hostC", 1)))) // fail the third stage because hostA went down complete(taskSets(2), Seq( - (FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), + shuffleDepTwo.shuffleId, 0L, 0, 0, "ignored"), null))) // TODO assert this: // blockManagerMaster.removeExecutor("exec-hostA") // have DAGScheduler try again @@ -1866,7 +1919,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi (Success, makeMapStatus("hostB", 1)))) // pretend stage 2 failed because hostA went down complete(taskSets(2), Seq( - (FetchFailed(makeBlockManagerId("hostA"), shuffleDepTwo.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), + shuffleDepTwo.shuffleId, 0L, 0, 0, "ignored"), null))) // TODO assert this: // blockManagerMaster.removeExecutor("exec-hostA") // DAGScheduler should notice the cached copy of 
the second shuffle and try to get it rerun. @@ -2227,7 +2281,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi submit(reduceRdd, Array(0, 1)) complete(taskSets(1), Seq( (Success, 42), - (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null))) // Ask the scheduler to try it again; TaskSet 2 will rerun the map task that we couldn't fetch // from, then TaskSet 3 will run the reduce stage scheduler.resubmitFailedStages() @@ -2286,7 +2340,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(taskSets(1).stageId === 1) complete(taskSets(1), Seq( (Success, makeMapStatus("hostA", rdd2.partitions.length)), - (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0L, 0, 0, "ignored"), null))) scheduler.resubmitFailedStages() assert(listener2.results.size === 0) // Second stage listener should not have a result yet @@ -2312,7 +2366,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(taskSets(4).stageId === 2) complete(taskSets(4), Seq( (Success, 52), - (FetchFailed(makeBlockManagerId("hostD"), dep2.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostD"), dep2.shuffleId, 0L, 0, 0, "ignored"), null))) scheduler.resubmitFailedStages() // TaskSet 5 will rerun stage 1's lost task, then TaskSet 6 will rerun stage 2 @@ -2350,7 +2404,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(taskSets(1).stageId === 1) complete(taskSets(1), Seq( (Success, makeMapStatus("hostC", rdd2.partitions.length)), - (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0, 0, "ignored"), null))) + (FetchFailed(makeBlockManagerId("hostA"), dep1.shuffleId, 0L, 0, 0, "ignored"), null))) scheduler.resubmitFailedStages() // Stage1 
listener should not have a result yet assert(listener2.results.size === 0) @@ -2485,7 +2539,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi rdd1.map { case (x, _) if (x == 1) => throw new FetchFailedException( - BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0, 0, "test") + BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0L, 0, 0, "test") case (x, _) => x }.count() } @@ -2498,7 +2552,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi rdd1.map { case (x, _) if (x == 1) && FailThisAttempt._fail.getAndSet(false) => throw new FetchFailedException( - BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0, 0, "test") + BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0L, 0, 0, "test") } } @@ -2552,7 +2606,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(taskSets(1).stageId === 1 && taskSets(1).stageAttemptId === 0) runEvent(makeCompletionEvent( taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleIdA, 0, 0, + FetchFailed(makeBlockManagerId("hostA"), shuffleIdA, 0L, 0, 0, "Fetch failure of task: stageId=1, stageAttempt=0, partitionId=0"), result = null)) @@ -2628,7 +2682,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi sc.parallelize(1 to tasks, tasks).foreach { _ => accum.add(1L) } - sc.listenerBus.waitUntilEmpty(1000) + sc.listenerBus.waitUntilEmpty() assert(foundCount.get() === tasks) } } @@ -2641,11 +2695,10 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val mapStageId = 0 def countSubmittedMapStageAttempts(): Int = { - submittedStageInfos.count(_.stageId == mapStageId) + sparkListener.submittedStageInfos.count(_.stageId == mapStageId) } // The map stage should have been submitted. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 1) // The first map task fails with TaskKilled. 
@@ -2653,7 +2706,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi taskSets(0).tasks(0), TaskKilled("test"), null)) - assert(failedStages === Seq(0)) + assert(sparkListener.failedStages === Seq(0)) // The second map task fails with TaskKilled. runEvent(makeCompletionEvent( @@ -2663,7 +2716,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Trigger resubmission of the failed map stage. runEvent(ResubmitFailedStages) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) // Another attempt for the map stage should have been submitted, resulting in 2 total attempts. assert(countSubmittedMapStageAttempts() === 2) @@ -2677,11 +2729,10 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi val mapStageId = 0 def countSubmittedMapStageAttempts(): Int = { - submittedStageInfos.count(_.stageId == mapStageId) + sparkListener.submittedStageInfos.count(_.stageId == mapStageId) } // The map stage should have been submitted. - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 1) // The first map task fails with TaskKilled. @@ -2689,11 +2740,10 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi taskSets(0).tasks(0), TaskKilled("test"), null)) - assert(failedStages === Seq(0)) + assert(sparkListener.failedStages === Seq(0)) // Trigger resubmission of the failed map stage. runEvent(ResubmitFailedStages) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) // Another attempt for the map stage should have been submitted, resulting in 2 total attempts. assert(countSubmittedMapStageAttempts() === 2) @@ -2706,11 +2756,10 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The second map task failure doesn't trigger stage retry. 
runEvent(ResubmitFailedStages) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(countSubmittedMapStageAttempts() === 2) } - test("SPARK-23207: retry all the succeeding stages when the map stage is indeterminate") { + private def constructIndeterminateStageFetchFailed(): (Int, Int) = { val shuffleMapRdd1 = new MyRDD(sc, 2, Nil, indeterminate = true) val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, new HashPartitioner(2)) @@ -2738,14 +2787,152 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The first task of the final stage failed with fetch failure runEvent(makeCompletionEvent( taskSets(2).tasks(0), - FetchFailed(makeBlockManagerId("hostC"), shuffleId2, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostC"), shuffleId2, 0L, 0, 0, "ignored"), + null)) + (shuffleId1, shuffleId2) + } + + test("SPARK-25341: abort stage while using old fetch protocol") { + // reset the test context with using old fetch protocol + afterEach() + val conf = new SparkConf() + conf.set(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL.key, "true") + init(conf) + // Construct the scenario of indeterminate stage fetch failed. + constructIndeterminateStageFetchFailed() + // The job should fail because Spark can't rollback the shuffle map stage while + // using old protocol. + assert(failure != null && failure.getMessage.contains( + "Spark can only do this while using the new shuffle block fetching protocol")) + } + + test("SPARK-25341: retry all the succeeding stages when the map stage is indeterminate") { + val (shuffleId1, shuffleId2) = constructIndeterminateStageFetchFailed() + + // Check status for all failedStages + val failedStages = scheduler.failedStages.toSeq + assert(failedStages.map(_.id) == Seq(1, 2)) + // Shuffle blocks of "hostC" is lost, so first task of the `shuffleMapRdd2` needs to retry. 
+ assert(failedStages.collect { + case stage: ShuffleMapStage if stage.shuffleDep.shuffleId == shuffleId2 => stage + }.head.findMissingPartitions() == Seq(0)) + // The result stage is still waiting for its 2 tasks to complete + assert(failedStages.collect { + case stage: ResultStage => stage + }.head.findMissingPartitions() == Seq(0, 1)) + + scheduler.resubmitFailedStages() + + // The first task of the `shuffleMapRdd2` failed with fetch failure + runEvent(makeCompletionEvent( + taskSets(3).tasks(0), + FetchFailed(makeBlockManagerId("hostA"), shuffleId1, 0L, 0, 0, "ignored"), + null)) + + val newFailedStages = scheduler.failedStages.toSeq + assert(newFailedStages.map(_.id) == Seq(0, 1)) + + scheduler.resubmitFailedStages() + + // First shuffle map stage resubmitted and reran all tasks. + assert(taskSets(4).stageId == 0) + assert(taskSets(4).stageAttemptId == 1) + assert(taskSets(4).tasks.length == 2) + + // Finish all stage. + complete(taskSets(4), Seq( + (Success, makeMapStatus("hostA", 2)), + (Success, makeMapStatus("hostB", 2)))) + assert(mapOutputTracker.findMissingPartitions(shuffleId1) === Some(Seq.empty)) + + complete(taskSets(5), Seq( + (Success, makeMapStatus("hostC", 2)), + (Success, makeMapStatus("hostD", 2)))) + assert(mapOutputTracker.findMissingPartitions(shuffleId2) === Some(Seq.empty)) + + complete(taskSets(6), Seq((Success, 11), (Success, 12))) + + // Job successful ended. + assert(results === Map(0 -> 11, 1 -> 12)) + results.clear() + assertDataStructuresEmpty() + } + + test("SPARK-25341: continuous indeterminate stage roll back") { + // shuffleMapRdd1/2/3 are all indeterminate. 
+ val shuffleMapRdd1 = new MyRDD(sc, 2, Nil, indeterminate = true) + val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, new HashPartitioner(2)) + val shuffleId1 = shuffleDep1.shuffleId + + val shuffleMapRdd2 = new MyRDD( + sc, 2, List(shuffleDep1), tracker = mapOutputTracker, indeterminate = true) + val shuffleDep2 = new ShuffleDependency(shuffleMapRdd2, new HashPartitioner(2)) + val shuffleId2 = shuffleDep2.shuffleId + + val shuffleMapRdd3 = new MyRDD( + sc, 2, List(shuffleDep2), tracker = mapOutputTracker, indeterminate = true) + val shuffleDep3 = new ShuffleDependency(shuffleMapRdd3, new HashPartitioner(2)) + val shuffleId3 = shuffleDep3.shuffleId + val finalRdd = new MyRDD(sc, 2, List(shuffleDep3), tracker = mapOutputTracker) + + submit(finalRdd, Array(0, 1), properties = new Properties()) + + // Finish the first 2 shuffle map stages. + complete(taskSets(0), Seq( + (Success, makeMapStatus("hostA", 2)), + (Success, makeMapStatus("hostB", 2)))) + assert(mapOutputTracker.findMissingPartitions(shuffleId1) === Some(Seq.empty)) + + complete(taskSets(1), Seq( + (Success, makeMapStatus("hostB", 2)), + (Success, makeMapStatus("hostD", 2)))) + assert(mapOutputTracker.findMissingPartitions(shuffleId2) === Some(Seq.empty)) + + // Executor lost on hostB, both of stage 0 and 1 should be reran. 
+ runEvent(makeCompletionEvent( + taskSets(2).tasks(0), + FetchFailed(makeBlockManagerId("hostB"), shuffleId2, 0L, 0, 0, "ignored"), null)) + mapOutputTracker.removeOutputsOnHost("hostB") + + assert(scheduler.failedStages.toSeq.map(_.id) == Seq(1, 2)) + scheduler.resubmitFailedStages() + + def checkAndCompleteRetryStage( + taskSetIndex: Int, + stageId: Int, + shuffleId: Int): Unit = { + assert(taskSets(taskSetIndex).stageId == stageId) + assert(taskSets(taskSetIndex).stageAttemptId == 1) + assert(taskSets(taskSetIndex).tasks.length == 2) + complete(taskSets(taskSetIndex), Seq( + (Success, makeMapStatus("hostA", 2)), + (Success, makeMapStatus("hostB", 2)))) + assert(mapOutputTracker.findMissingPartitions(shuffleId) === Some(Seq.empty)) + } + + // Check all indeterminate stage roll back. + checkAndCompleteRetryStage(3, 0, shuffleId1) + checkAndCompleteRetryStage(4, 1, shuffleId2) + checkAndCompleteRetryStage(5, 2, shuffleId3) - // The second shuffle map stage need to rerun, the job will abort for the indeterminate - // stage rerun. - // TODO: After we support re-generate shuffle file(SPARK-25341), this test will be extended. - assert(failure != null && failure.getMessage - .contains("Spark cannot rollback the ShuffleMapStage 1")) + // Result stage success, all job ended. 
+ complete(taskSets(6), Seq((Success, 11), (Success, 12))) + assert(results === Map(0 -> 11, 1 -> 12)) + results.clear() + assertDataStructuresEmpty() + } + + test("SPARK-29042: Sampled RDD with unordered input should be indeterminate") { + val shuffleMapRdd1 = new MyRDD(sc, 2, Nil, indeterminate = false) + + val shuffleDep1 = new ShuffleDependency(shuffleMapRdd1, new HashPartitioner(2)) + val shuffleMapRdd2 = new MyRDD(sc, 2, List(shuffleDep1), tracker = mapOutputTracker) + + assert(shuffleMapRdd2.outputDeterministicLevel == DeterministicLevel.UNORDERED) + + val sampledRdd = shuffleMapRdd2.sample(true, 0.3, 1000L) + assert(sampledRdd.outputDeterministicLevel == DeterministicLevel.INDETERMINATE) } private def assertResultStageFailToRollback(mapRdd: MyRDD): Unit = { @@ -2766,7 +2953,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Fail the second task with FetchFailed. runEvent(makeCompletionEvent( taskSets.last.tasks(1), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) // The job should fail because Spark can't rollback the result stage. @@ -2809,7 +2996,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // Fail the second task with FetchFailed. 
runEvent(makeCompletionEvent( taskSets.last.tasks(1), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored"), null)) assert(failure == null, "job should not fail") @@ -2856,33 +3043,6 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assert(latch.await(10, TimeUnit.SECONDS)) } - test("SPARK-28699: abort stage if parent stage is indeterminate stage") { - val shuffleMapRdd = new MyRDD(sc, 2, Nil, indeterminate = true) - - val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(2)) - val shuffleId = shuffleDep.shuffleId - val finalRdd = new MyRDD(sc, 2, List(shuffleDep), tracker = mapOutputTracker) - - submit(finalRdd, Array(0, 1)) - - // Finish the first shuffle map stage. - complete(taskSets(0), Seq( - (Success, makeMapStatus("hostA", 2)), - (Success, makeMapStatus("hostB", 2)))) - assert(mapOutputTracker.findMissingPartitions(shuffleId) === Some(Seq.empty)) - - runEvent(makeCompletionEvent( - taskSets(1).tasks(0), - FetchFailed(makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored"), - null)) - - // Shuffle blocks of "hostA" is lost, so first task of the `shuffleMapRdd` needs to retry. - // The result stage is still waiting for its 2 tasks to complete. - // Because of shuffleMapRdd is indeterminate, this job will be abort. - assert(failure != null && failure.getMessage - .contains("Spark cannot rollback the ShuffleMapStage 0")) - } - test("Completions in zombie tasksets update status of non-zombie taskset") { val parts = 4 val shuffleMapRdd = new MyRDD(sc, parts, Nil) @@ -2899,7 +3059,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // The second task of the shuffle map stage failed with FetchFailed. 
runEvent(makeCompletionEvent( taskSets(0).tasks(1), - FetchFailed(makeBlockManagerId("hostB"), shuffleDep.shuffleId, 0, 0, "ignored"), + FetchFailed(makeBlockManagerId("hostB"), shuffleDep.shuffleId, 0L, 0, 0, "ignored"), null)) scheduler.resubmitFailedStages() @@ -2938,7 +3098,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi * Assert that the supplied TaskSet has exactly the given hosts as its preferred locations. * Note that this checks only the host and not the executor ID. */ - private def assertLocations(taskSet: TaskSet, hosts: Seq[Seq[String]]) { + private def assertLocations(taskSet: TaskSet, hosts: Seq[Seq[String]]): Unit = { assert(hosts.size === taskSet.tasks.size) for ((taskLocs, expectedLocs) <- taskSet.tasks.map(_.preferredLocations).zip(hosts)) { assert(taskLocs.map(_.host).toSet === expectedLocs.toSet) @@ -2989,8 +3149,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi } object DAGSchedulerSuite { - def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2): MapStatus = - MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes)) + def makeMapStatus(host: String, reduces: Int, sizes: Byte = 2, mapTaskId: Long = -1): MapStatus = + MapStatus(makeBlockManagerId(host), Array.fill[Long](reduces)(sizes), mapTaskId) def makeBlockManagerId(host: String): BlockManagerId = BlockManagerId("exec-" + host, host, 12345) diff --git a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala index a83ca594ee908..61ea21fa86c5a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/EventLoggingListenerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler -import java.io.{File, FileOutputStream, InputStream, IOException} +import java.io.{File, InputStream} import java.util.Arrays 
import scala.collection.immutable.Map @@ -32,15 +32,16 @@ import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.{EventLogFileReader, SingleEventLogFileWriter} +import org.apache.spark.deploy.history.EventLogTestHelper._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.Logging -import org.apache.spark.internal.config._ import org.apache.spark.io._ import org.apache.spark.metrics.{ExecutorMetricType, MetricsSystem} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.util.{JsonProtocol, Utils} - /** * Test whether EventLoggingListener logs events properly. * @@ -51,8 +52,6 @@ import org.apache.spark.util.{JsonProtocol, Utils} class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfter with Logging { - import EventLoggingListenerSuite._ - private val fileSystem = Utils.getHadoopFileSystem("/", SparkHadoopUtil.get.newConfiguration(new SparkConf())) private var testDir: File = _ @@ -68,40 +67,6 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit Utils.deleteRecursively(testDir) } - test("Verify log file exist") { - // Verify logging directory exists - val conf = getLoggingConf(testDirPath) - val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf) - eventLogger.start() - - val logPath = new Path(eventLogger.logPath + EventLoggingListener.IN_PROGRESS) - assert(fileSystem.exists(logPath)) - val logStatus = fileSystem.getFileStatus(logPath) - assert(!logStatus.isDirectory) - - // Verify log is renamed after stop() - eventLogger.stop() - assert(!fileSystem.getFileStatus(new Path(eventLogger.logPath)).isDirectory) - } - - test("Basic event logging") { - testEventLogging() - } - - test("spark.eventLog.compression.codec overrides spark.io.compression.codec") { - val 
conf = new SparkConf - conf.set(EVENT_LOG_COMPRESS, true) - - // The default value is `spark.io.compression.codec`. - val e = new EventLoggingListener("test", None, testDirPath.toUri(), conf) - assert(e.compressionCodecName.contains("lz4")) - - // `spark.eventLog.compression.codec` overrides `spark.io.compression.codec`. - conf.set(EVENT_LOG_COMPRESSION_CODEC, "zstd") - val e2 = new EventLoggingListener("test", None, testDirPath.toUri(), conf) - assert(e2.compressionCodecName.contains("zstd")) - } - test("Basic event logging with compression") { CompressionCodec.ALL_COMPRESSION_CODECS.foreach { codec => testEventLogging(compressionCodec = Some(CompressionCodec.getShortName(codec))) @@ -131,35 +96,6 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit assert(redactedProps(key) == "*********(redacted)") } - test("Log overwriting") { - val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None) - val logPath = new Path(logUri).toUri.getPath - // Create file before writing the event log - new FileOutputStream(new File(logPath)).close() - // Expected IOException, since we haven't enabled log overwrite. - intercept[IOException] { testEventLogging() } - // Try again, but enable overwriting. 
- testEventLogging(extraConf = Map(EVENT_LOG_OVERWRITE.key -> "true")) - } - - test("Event log name") { - val baseDirUri = Utils.resolveURI("/base-dir") - // without compression - assert(s"${baseDirUri.toString}/app1" === EventLoggingListener.getLogPath( - baseDirUri, "app1", None)) - // with compression - assert(s"${baseDirUri.toString}/app1.lzf" === - EventLoggingListener.getLogPath(baseDirUri, "app1", None, Some("lzf"))) - // illegal characters in app ID - assert(s"${baseDirUri.toString}/a-fine-mind_dollar_bills__1" === - EventLoggingListener.getLogPath(baseDirUri, - "a fine:mind$dollar{bills}.1", None)) - // illegal characters in app ID with compression - assert(s"${baseDirUri.toString}/a-fine-mind_dollar_bills__1.lz4" === - EventLoggingListener.getLogPath(baseDirUri, - "a fine:mind$dollar{bills}.1", None, Some("lz4"))) - } - test("Executor metrics update") { testStageExecutorMetricsEventLogging() } @@ -168,8 +104,6 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit * Actual test logic * * ----------------- */ - import EventLoggingListenerSuite._ - /** * Test basic event logging functionality. 
* @@ -178,7 +112,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit */ private def testEventLogging( compressionCodec: Option[String] = None, - extraConf: Map[String, String] = Map()) { + extraConf: Map[String, String] = Map()): Unit = { val conf = getLoggingConf(testDirPath, compressionCodec) extraConf.foreach { case (k, v) => conf.set(k, v) } val logName = compressionCodec.map("test-" + _).getOrElse("test") @@ -198,7 +132,8 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit eventLogger.stop() // Verify file contains exactly the two events logged - val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) + val logPath = eventLogger.logWriter.logPath + val logData = EventLogFileReader.openEventLog(new Path(logPath), fileSystem) try { val lines = readLines(logData) val logStart = SparkListenerLogStart(SPARK_VERSION) @@ -218,7 +153,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit * Test end-to-end event logging functionality in an application. * This runs a simple Spark job and asserts that the expected events are logged when expected. */ - private def testApplicationEventLogging(compressionCodec: Option[String] = None) { + private def testApplicationEventLogging(compressionCodec: Option[String] = None): Unit = { // Set defaultFS to something that would cause an exception, to make sure we don't run // into SPARK-6688. 
val conf = getLoggingConf(testDirPath, compressionCodec) @@ -226,9 +161,10 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit sc = new SparkContext("local-cluster[2,2,1024]", "test", conf) assert(sc.eventLogger.isDefined) val eventLogger = sc.eventLogger.get - val eventLogPath = eventLogger.logPath + + val eventLogPath = eventLogger.logWriter.logPath val expectedLogDir = testDir.toURI() - assert(eventLogPath === EventLoggingListener.getLogPath( + assert(eventLogPath === SingleEventLogFileWriter.getLogPath( expectedLogDir, sc.applicationId, None, compressionCodec.map(CompressionCodec.getShortName))) // Begin listening for events that trigger asserts @@ -243,7 +179,8 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit eventExistenceListener.assertAllCallbacksInvoked() // Make sure expected events exist in the log file. - val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) + val logData = EventLogFileReader.openEventLog(new Path(eventLogger.logWriter.logPath), + fileSystem) val eventSet = mutable.Set( SparkListenerApplicationStart, SparkListenerBlockManagerAdded, @@ -284,7 +221,7 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit * from SparkListenerTaskEnd events for tasks belonging to the stage are * logged in a StageExecutorMetrics event for each executor at stage completion. */ - private def testStageExecutorMetricsEventLogging() { + private def testStageExecutorMetricsEventLogging(): Unit = { val conf = getLoggingConf(testDirPath, None) val logName = "stageExecutorMetrics-test" val eventLogger = new EventLoggingListener(logName, None, testDirPath.toUri(), conf) @@ -466,7 +403,8 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit // Verify the log file contains the expected events. 
// Posted events should be logged, except for ExecutorMetricsUpdate events -- these // are consolidated, and the peak values for each stage are logged at stage end. - val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem) + val logData = EventLogFileReader.openEventLog(new Path(eventLogger.logWriter.logPath), + fileSystem) try { val lines = readLines(logData) val logStart = SparkListenerLogStart(SPARK_VERSION) @@ -501,12 +439,14 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit private def createStageSubmittedEvent(stageId: Int) = { SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } private def createStageCompletedEvent(stageId: Int) = { SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) } private def createExecutorAddedEvent(executorId: Int) = { @@ -621,19 +561,19 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit var jobEnded = false var appEnded = false - override def onJobStart(jobStart: SparkListenerJobStart) { + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { jobStarted = true } - override def onJobEnd(jobEnd: SparkListenerJobEnd) { + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { jobEnded = true } - override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd) { + override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { appEnded = true } - def assertAllCallbacksInvoked() { + def assertAllCallbacksInvoked(): Unit = { assert(jobStarted, "JobStart callback not invoked!") assert(jobEnded, "JobEnd callback not invoked!") assert(appEnded, "ApplicationEnd callback not 
invoked!") @@ -641,24 +581,3 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit } } - - -object EventLoggingListenerSuite { - - /** Get a SparkConf with event logging enabled. */ - def getLoggingConf(logDir: Path, compressionCodec: Option[String] = None): SparkConf = { - val conf = new SparkConf - conf.set(EVENT_LOG_ENABLED, true) - conf.set(EVENT_LOG_BLOCK_UPDATES, true) - conf.set(EVENT_LOG_TESTING, true) - conf.set(EVENT_LOG_DIR, logDir.toString) - compressionCodec.foreach { codec => - conf.set(EVENT_LOG_COMPRESS, true) - conf.set(EVENT_LOG_COMPRESSION_CODEC, codec) - } - conf.set(EVENT_LOG_STAGE_EXECUTOR_METRICS, true) - conf - } - - def getUniqueApplicationId: String = "test-" + System.currentTimeMillis -} diff --git a/core/src/test/scala/org/apache/spark/scheduler/ExecutorResourceInfoSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ExecutorResourceInfoSuite.scala index 0109d1f82a453..388d4e25a06cf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ExecutorResourceInfoSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ExecutorResourceInfoSuite.scala @@ -26,7 +26,7 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { test("Track Executor Resource information") { // Init Executor Resource. - val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3")) + val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3"), 1) assert(info.availableAddrs.sorted sameElements Seq("0", "1", "2", "3")) assert(info.assignedAddrs.isEmpty) @@ -43,7 +43,7 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { test("Don't allow acquire address that is not available") { // Init Executor Resource. - val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3")) + val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3"), 1) // Acquire some addresses. 
info.acquire(Seq("0", "1")) assert(!info.availableAddrs.contains("1")) @@ -56,7 +56,7 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { test("Don't allow acquire address that doesn't exist") { // Init Executor Resource. - val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3")) + val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3"), 1) assert(!info.availableAddrs.contains("4")) // Acquire an address that doesn't exist val e = intercept[SparkException] { @@ -67,7 +67,7 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { test("Don't allow release address that is not assigned") { // Init Executor Resource. - val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3")) + val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3"), 1) // Acquire addresses info.acquire(Array("0", "1")) assert(!info.assignedAddrs.contains("2")) @@ -80,7 +80,7 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { test("Don't allow release address that doesn't exist") { // Init Executor Resource. 
- val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3")) + val info = new ExecutorResourceInfo(GPU, ArrayBuffer("0", "1", "2", "3"), 1) assert(!info.assignedAddrs.contains("4")) // Release an address that doesn't exist val e = intercept[SparkException] { @@ -88,4 +88,28 @@ class ExecutorResourceInfoSuite extends SparkFunSuite { } assert(e.getMessage.contains("Try to release an address that doesn't exist.")) } + + test("Ensure that we can acquire the same fractions of a resource from an executor") { + val slotSeq = Seq(10, 9, 8, 7, 6, 5, 4, 3, 2, 1) + val addresses = ArrayBuffer("0", "1", "2", "3") + slotSeq.foreach { slots => + val info = new ExecutorResourceInfo(GPU, addresses, slots) + for (_ <- 0 until slots) { + addresses.foreach(addr => info.acquire(Seq(addr))) + } + + // assert that each address was assigned `slots` times + info.assignedAddrs + .groupBy(identity) + .mapValues(_.size) + .foreach(x => assert(x._2 == slots)) + + addresses.foreach { addr => + assertThrows[SparkException] { + info.acquire(Seq(addr)) + } + assert(!info.availableAddrs.contains(addr)) + } + } + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala index 73e88c4a0fda6..9f593e0039adc 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ExternalClusterManagerSuite.scala @@ -67,9 +67,9 @@ private class DummyExternalClusterManager extends ExternalClusterManager { private class DummySchedulerBackend extends SchedulerBackend { var initialized = false - def start() {} - def stop() {} - def reviveOffers() {} + def start(): Unit = {} + def stop(): Unit = {} + def reviveOffers(): Unit = {} def defaultParallelism(): Int = 1 def maxNumConcurrentTasks(): Int = 0 } @@ -89,6 +89,7 @@ private class DummyTaskScheduler extends TaskScheduler { override def 
notifyPartitionCompletion(stageId: Int, partitionId: Int): Unit = {} override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {} override def defaultParallelism(): Int = 2 + override def executorDecommission(executorId: String): Unit = {} override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {} override def workerRemoved(workerId: String, host: String, message: String): Unit = {} override def applicationAttemptId(): Option[String] = None diff --git a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala index b29d32f7b35c5..8cb6268f85d36 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/FakeTask.scala @@ -42,22 +42,30 @@ object FakeTask { * locations for each task (given as varargs) if this sequence is not empty. */ def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { - createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*) + createTaskSet(numTasks, stageId = 0, stageAttemptId = 0, priority = 0, prefLocs: _*) } - def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { - createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*) + def createTaskSet( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + prefLocs: Seq[TaskLocation]*): TaskSet = { + createTaskSet(numTasks, stageId, stageAttemptId, priority = 0, prefLocs: _*) } - def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): - TaskSet = { + def createTaskSet( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + priority: Int, + prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new FakeTask(stageId, i, if (prefLocs.size != 0) 
prefLocs(i) else Nil) } - new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) + new TaskSet(tasks, stageId, stageAttemptId, priority = priority, null) } def createShuffleMapTaskSet( @@ -65,6 +73,15 @@ object FakeTask { stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { + createShuffleMapTaskSet(numTasks, stageId, stageAttemptId, priority = 0, prefLocs: _*) + } + + def createShuffleMapTaskSet( + numTasks: Int, + stageId: Int, + stageAttemptId: Int, + priority: Int, + prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } @@ -74,17 +91,18 @@ object FakeTask { }, prefLocs(i), new Properties, SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) } - new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) + new TaskSet(tasks, stageId, stageAttemptId, priority = priority, null) } def createBarrierTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { - createBarrierTaskSet(numTasks, stageId = 0, stageAttempId = 0, prefLocs: _*) + createBarrierTaskSet(numTasks, stageId = 0, stageAttemptId = 0, priority = 0, prefLocs: _*) } def createBarrierTaskSet( numTasks: Int, stageId: Int, - stageAttempId: Int, + stageAttemptId: Int, + priority: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") @@ -92,6 +110,6 @@ object FakeTask { val tasks = Array.tabulate[Task[_]](numTasks) { i => new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil, isBarrier = true) } - new TaskSet(tasks, stageId, stageAttempId, priority = 0, null) + new TaskSet(tasks, stageId, stageAttemptId, priority = priority, null) } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala 
index c1e7fb9a1db16..23cc416f8572f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -61,7 +61,7 @@ class MapStatusSuite extends SparkFunSuite { stddev <- Seq(0.0, 0.01, 0.5, 1.0) ) { val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean) - val status = MapStatus(BlockManagerId("a", "b", 10), sizes) + val status = MapStatus(BlockManagerId("a", "b", 10), sizes, -1) val status1 = compressAndDecompressMapStatus(status) for (i <- 0 until numSizes) { if (sizes(i) != 0) { @@ -75,7 +75,7 @@ class MapStatusSuite extends SparkFunSuite { test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) { val sizes = Array.fill[Long](2001)(150L) - val status = MapStatus(null, sizes) + val status = MapStatus(null, sizes, -1) assert(status.isInstanceOf[HighlyCompressedMapStatus]) assert(status.getSizeForBlock(10) === 150L) assert(status.getSizeForBlock(50) === 150L) @@ -87,10 +87,12 @@ class MapStatusSuite extends SparkFunSuite { val sizes = Array.tabulate[Long](3000) { i => i.toLong } val avg = sizes.sum / sizes.count(_ != 0) val loc = BlockManagerId("a", "b", 10) - val status = MapStatus(loc, sizes) + val mapTaskAttemptId = 5 + val status = MapStatus(loc, sizes, mapTaskAttemptId) val status1 = compressAndDecompressMapStatus(status) assert(status1.isInstanceOf[HighlyCompressedMapStatus]) assert(status1.location == loc) + assert(status1.mapId == mapTaskAttemptId) for (i <- 0 until 3000) { val estimate = status1.getSizeForBlock(i) if (sizes(i) > 0) { @@ -109,7 +111,7 @@ class MapStatusSuite extends SparkFunSuite { val smallBlockSizes = sizes.filter(n => n > 0 && n < threshold) val avg = smallBlockSizes.sum / smallBlockSizes.length val loc = BlockManagerId("a", "b", 10) - val status = MapStatus(loc, sizes) + val status = MapStatus(loc, sizes, 5) val status1 = compressAndDecompressMapStatus(status) 
assert(status1.isInstanceOf[HighlyCompressedMapStatus]) assert(status1.location == loc) @@ -165,7 +167,7 @@ class MapStatusSuite extends SparkFunSuite { SparkEnv.set(env) // Value of element in sizes is equal to the corresponding index. val sizes = (0L to 2000L).toArray - val status1 = MapStatus(BlockManagerId("exec-0", "host-0", 100), sizes) + val status1 = MapStatus(BlockManagerId("exec-0", "host-0", 100), sizes, 5) val arrayStream = new ByteArrayOutputStream(102400) val objectOutputStream = new ObjectOutputStream(arrayStream) assert(status1.isInstanceOf[HighlyCompressedMapStatus]) diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala index 848f702935536..7d063c3b3ac53 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorIntegrationSuite.scala @@ -22,7 +22,6 @@ import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.time.{Seconds, Span} import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite, TaskContext} -import org.apache.spark.util.Utils /** * Integration tests for the OutputCommitCoordinator. 
diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index d6964063c118e..728b9d65054ec 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -158,7 +158,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { def resultHandler(x: Int, y: Unit): Unit = {} val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully, - 0 until rdd.partitions.size, resultHandler, () => Unit) + 0 until rdd.partitions.size, resultHandler, () => ()) // It's an error if the job completes successfully even though no committer was authorized, // so throw an exception if the job was allowed to complete. intercept[TimeoutException] { @@ -251,10 +251,10 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { // stage so that we can check the state of the output committer. 
val retriedStage = sc.parallelize(1 to 100, 10) .map { i => (i % 10, i) } - .reduceByKey { case (_, _) => + .reduceByKey { (_, _) => val ctx = TaskContext.get() if (ctx.stageAttemptNumber() == 0) { - throw new FetchFailedException(SparkEnv.get.blockManager.blockManagerId, 1, 1, 1, + throw new FetchFailedException(SparkEnv.get.blockManager.blockManagerId, 1, 1L, 1, 1, new Exception("Failure for test.")) } else { ctx.stageId() @@ -288,7 +288,7 @@ private case class OutputCommitFunctions(tempDirPath: String) { // Mock output committer that simulates a failed commit (after commit is authorized) private def failingOutputCommitter = new FakeOutputCommitter { - override def commitTask(taskAttemptContext: TaskAttemptContext) { + override def commitTask(taskAttemptContext: TaskAttemptContext): Unit = { throw new RuntimeException } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index d65b5cbfc094e..e6fbf9b09d43d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.scheduler import java.io._ -import java.net.URI +import java.nio.charset.StandardCharsets import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.ArrayBuffer @@ -30,6 +30,8 @@ import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.EventLogFileReader +import org.apache.spark.deploy.history.EventLogTestHelper._ import org.apache.spark.io.{CompressionCodec, LZ4CompressionCodec} import org.apache.spark.util.{JsonProtocol, JsonProtocolSuite, Utils} @@ -52,17 +54,18 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp test("Simple replay") { val logFilePath = getFilePath(testDir, "events.txt") val fstream = 
fileSystem.create(logFilePath) + val fwriter = new OutputStreamWriter(fstream, StandardCharsets.UTF_8) val applicationStart = SparkListenerApplicationStart("Greatest App (N)ever", None, 125L, "Mickey", None) val applicationEnd = SparkListenerApplicationEnd(1000L) - Utils.tryWithResource(new PrintWriter(fstream)) { writer => + Utils.tryWithResource(new PrintWriter(fwriter)) { writer => // scalastyle:off println writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd)))) // scalastyle:on println } - val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) + val conf = getLoggingConf(logFilePath) val logData = fileSystem.open(logFilePath) val eventMonster = new EventBufferingListener try { @@ -87,8 +90,9 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp test("Replay compressed inprogress log file succeeding on partial read") { val buffered = new ByteArrayOutputStream val codec = new LZ4CompressionCodec(new SparkConf()) - val compstream = codec.compressedOutputStream(buffered) - Utils.tryWithResource(new PrintWriter(compstream)) { writer => + val compstream = codec.compressedContinuousOutputStream(buffered) + val cwriter = new OutputStreamWriter(compstream, StandardCharsets.UTF_8) + Utils.tryWithResource(new PrintWriter(cwriter)) { writer => val applicationStart = SparkListenerApplicationStart("AppStarts", None, 125L, "Mickey", None) @@ -107,14 +111,14 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp } // Read the compressed .inprogress file and verify only first event was parsed. - val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) + val conf = getLoggingConf(logFilePath) val replayer = new ReplayListenerBus() val eventMonster = new EventBufferingListener replayer.addListener(eventMonster) // Verify the replay returns the events given the input maybe truncated. 
- val logData = EventLoggingListener.openEventLog(logFilePath, fileSystem) + val logData = EventLogFileReader.openEventLog(logFilePath, fileSystem) Utils.tryWithResource(new EarlyEOFInputStream(logData, buffered.size - 10)) { failingStream => replayer.replay(failingStream, logFilePath.toString, true) @@ -123,7 +127,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp } // Verify the replay throws the EOF exception since the input may not be truncated. - val logData2 = EventLoggingListener.openEventLog(logFilePath, fileSystem) + val logData2 = EventLogFileReader.openEventLog(logFilePath, fileSystem) Utils.tryWithResource(new EarlyEOFInputStream(logData2, buffered.size - 10)) { failingStream2 => intercept[EOFException] { replayer.replay(failingStream2, logFilePath.toString, false) @@ -134,10 +138,11 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp test("Replay incompatible event log") { val logFilePath = getFilePath(testDir, "incompatible.txt") val fstream = fileSystem.create(logFilePath) + val fwriter = new OutputStreamWriter(fstream, StandardCharsets.UTF_8) val applicationStart = SparkListenerApplicationStart("Incompatible App", None, 125L, "UserUsingIncompatibleVersion", None) val applicationEnd = SparkListenerApplicationEnd(1000L) - Utils.tryWithResource(new PrintWriter(fstream)) { writer => + Utils.tryWithResource(new PrintWriter(fwriter)) { writer => // scalastyle:off println writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) writer.println("""{"Event":"UnrecognizedEventOnlyForTest","Timestamp":1477593059313}""") @@ -145,7 +150,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp // scalastyle:on println } - val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) + val conf = getLoggingConf(logFilePath) val logData = fileSystem.open(logFilePath) val eventMonster = new EventBufferingListener try { @@ -184,14 +189,14 @@ class 
ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp * event to the corresponding event replayed from the event logs. This test makes the * assumption that the event logging behavior is correct (tested in a separate suite). */ - private def testApplicationReplay(codecName: Option[String] = None) { + private def testApplicationReplay(codecName: Option[String] = None): Unit = { val logDir = new File(testDir.getAbsolutePath, "test-replay") // Here, it creates `Path` from the URI instead of the absolute path for the explicit file // scheme so that the string representation of this `Path` has leading file scheme correctly. val logDirPath = new Path(logDir.toURI) fileSystem.mkdirs(logDirPath) - val conf = EventLoggingListenerSuite.getLoggingConf(logDirPath, codecName) + val conf = getLoggingConf(logDirPath, codecName) sc = new SparkContext("local-cluster[2,1,1024]", "Test replay", conf) // Run a few jobs @@ -208,7 +213,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp assert(!eventLog.isDirectory) // Replay events - val logData = EventLoggingListener.openEventLog(eventLog.getPath(), fileSystem) + val logData = EventLogFileReader.openEventLog(eventLog.getPath(), fileSystem) val eventMonster = new EventBufferingListener try { val replayer = new ReplayListenerBus() @@ -242,7 +247,7 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp private[scheduler] val loggedEvents = new ArrayBuffer[JValue] - override def onEvent(event: SparkListenerEvent) { + override def onEvent(event: SparkListenerEvent): Unit = { val eventJson = JsonProtocol.sparkEventToJson(event) loggedEvents += eventJson } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala index 96706536fe53c..dff8975a4fe49 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/scheduler/SchedulerIntegrationSuite.scala @@ -26,6 +26,7 @@ import scala.concurrent.duration.{Duration, SECONDS} import scala.reflect.ClassTag import org.scalactic.TripleEquals +import org.scalatest.Assertions import org.scalatest.Assertions.AssertionsHelper import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ @@ -463,7 +464,7 @@ class MockRDD( override def toString: String = "MockRDD " + id } -object MockRDD extends AssertionsHelper with TripleEquals { +object MockRDD extends AssertionsHelper with TripleEquals with Assertions { /** * make sure all the shuffle dependencies have a consistent number of output partitions * (mostly to make sure the test setup makes sense, not that Spark itself would get this wrong) @@ -621,7 +622,7 @@ class BasicSchedulerIntegrationSuite extends SchedulerIntegrationSuite[SingleCor backend.taskSuccess(taskDescription, DAGSchedulerSuite.makeMapStatus("hostA", 10)) case (1, 0, 0) => val fetchFailed = FetchFailed( - DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0, 0, "ignored") + DAGSchedulerSuite.makeBlockManagerId("hostA"), shuffleId, 0L, 0, 0, "ignored") backend.taskFailed(taskDescription, fetchFailed) case (1, _, partition) => backend.taskSuccess(taskDescription, 42 + partition) diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 8903e1054f53d..d4e8d63b54e5f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.scheduler -import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} +import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.util.concurrent.Semaphore import scala.collection.JavaConverters._ @@ -38,9 +38,6 @@ class SparkListenerSuite extends SparkFunSuite 
with LocalSparkContext with Match import LiveListenerBus._ - /** Length of time to wait while draining listener events. */ - val WAIT_TIMEOUT_MILLIS = 10000 - val jobCompletionTime = 1421191296660L private val mockSparkContext: SparkContext = Mockito.mock(classOf[SparkContext]) @@ -65,7 +62,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match sc.listenerBus.addToSharedQueue(listener) sc.listenerBus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() sc.stop() assert(listener.sparkExSeen) @@ -86,7 +83,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } // Five messages should be marked as received and queued, but no messages should be posted to - // listeners yet because the the listener bus hasn't been started. + // listeners yet because the listener bus hasn't been started. 
assert(bus.metrics.numEventsPosted.getCount === 5) assert(bus.queuedEvents.size === 5) @@ -97,7 +94,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Starting listener bus should flush all buffered events bus.start(mockSparkContext, mockMetricsSystem) Mockito.verify(mockMetricsSystem).registerSource(bus.metrics) - bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + bus.waitUntilEmpty() assert(counter.count === 5) assert(sharedQueueSize(bus) === 0) assert(eventProcessingTimeCount(bus) === 5) @@ -159,7 +156,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match assert(!drained) new Thread("ListenerBusStopper") { - override def run() { + override def run(): Unit = { stopperStarted.release() // stop() will block until notify() is called below bus.stop() @@ -209,7 +206,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 1) - // Allow the the remaining events to be processed so we can stop the listener bus: + // Allow the remaining events to be processed so we can stop the listener bus: listenerWait.release(2) bus.stop() } @@ -223,7 +220,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match rdd2.setName("Target RDD") rdd2.count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be {1} val (stageInfo, taskInfoMetrics) = listener.stageInfos.head @@ -231,8 +228,8 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match stageInfo.rddInfos.forall(_.numPartitions == 4) should be {true} stageInfo.rddInfos.exists(_.name == "Target RDD") should be {true} stageInfo.numTasks should be {4} - stageInfo.submissionTime should be ('defined) - stageInfo.completionTime should be ('defined) + stageInfo.submissionTime should be (Symbol("defined")) + stageInfo.completionTime should be (Symbol("defined")) 
taskInfoMetrics.length should be {4} } @@ -248,7 +245,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match rdd3.setName("Trois") rdd1.count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be {1} val stageInfo1 = listener.stageInfos.keys.find(_.stageId == 0).get stageInfo1.rddInfos.size should be {1} // ParallelCollectionRDD @@ -257,7 +254,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match listener.stageInfos.clear() rdd2.count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be {1} val stageInfo2 = listener.stageInfos.keys.find(_.stageId == 1).get stageInfo2.rddInfos.size should be {3} @@ -266,7 +263,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match listener.stageInfos.clear() rdd3.count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be {2} // Shuffle map stage + result stage val stageInfo3 = listener.stageInfos.keys.find(_.stageId == 3).get stageInfo3.rddInfos.size should be {1} // ShuffledRDD @@ -282,7 +279,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val rdd2 = rdd1.map(_.toString) sc.runJob(rdd2, (items: Iterator[String]) => items.size, Seq(0, 1)) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be {1} val (stageInfo, _) = listener.stageInfos.head @@ -310,7 +307,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val numSlices = 16 val d = sc.parallelize(0 to 10000, numSlices).map(w) d.count() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be (1) val d2 = d.map { i => w(i) -> i * 2 }.setName("shuffle input 1") @@ -321,7 +318,7 @@ 
class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match d4.setName("A Cogroup") d4.collectAsMap() - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() listener.stageInfos.size should be (4) listener.stageInfos.foreach { case (stageInfo, taskInfoMetrics) => /** @@ -372,7 +369,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match .reduce { case (x, y) => x } assert(result === 1.to(maxRpcMessageSize).toArray) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() val TASK_INDEX = 0 assert(listener.startedTasks.contains(TASK_INDEX)) assert(listener.startedGettingResultTasks.contains(TASK_INDEX)) @@ -388,7 +385,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val result = sc.parallelize(Seq(1), 1).map(2 * _).reduce { case (x, y) => x } assert(result === 2) - sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + sc.listenerBus.waitUntilEmpty() val TASK_INDEX = 0 assert(listener.startedTasks.contains(TASK_INDEX)) assert(listener.startedGettingResultTasks.isEmpty) @@ -443,7 +440,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Post events to all listeners, and wait until the queue is drained (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } - bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + bus.waitUntilEmpty() // The exception should be caught, and the event should be propagated to other listeners assert(jobCounter1.count === 5) @@ -513,7 +510,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // after we post one event, both interrupting listeners should get removed, and the // event log queue should be removed bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + bus.waitUntilEmpty() assert(bus.activeQueues() === Set(SHARED_QUEUE, APP_STATUS_QUEUE)) 
assert(bus.findListenersByClass[BasicJobCounter]().size === 2) assert(bus.findListenersByClass[InterruptingListener]().size === 0) @@ -522,7 +519,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // posting more events should be fine, they'll just get processed from the OK queue. (0 until 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } - bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) + bus.waitUntilEmpty() assert(counter1.count === 6) assert(counter2.count === 6) @@ -532,6 +529,47 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match } } + Seq(true, false).foreach { throwInterruptedException => + val suffix = if (throwInterruptedException) "throw interrupt" else "set Thread interrupted" + test(s"SPARK-30285: Fix deadlock in AsyncEventQueue.removeListenerOnError: $suffix") { + val LISTENER_BUS_STOP_WAITING_TIMEOUT_MILLIS = 10 * 1000L // 10 seconds + val bus = new LiveListenerBus(new SparkConf(false)) + val counter1 = new BasicJobCounter() + val counter2 = new BasicJobCounter() + val interruptingListener = new DelayInterruptingJobCounter(throwInterruptedException, 3) + bus.addToSharedQueue(counter1) + bus.addToSharedQueue(interruptingListener) + bus.addToEventLogQueue(counter2) + assert(bus.activeQueues() === Set(SHARED_QUEUE, EVENT_LOG_QUEUE)) + assert(bus.findListenersByClass[BasicJobCounter]().size === 2) + assert(bus.findListenersByClass[DelayInterruptingJobCounter]().size === 1) + + bus.start(mockSparkContext, mockMetricsSystem) + + (0 until 5).foreach { jobId => + bus.post(SparkListenerJobEnd(jobId, jobCompletionTime, JobSucceeded)) + } + + // Call bus.stop in a separate thread, otherwise we will block here until bus is stopped + val stoppingThread = new Thread(() => { + bus.stop() + }) + stoppingThread.start() + // Notify interrupting listener starts to work + interruptingListener.sleep = false + // Wait for bus to stop + 
stoppingThread.join(LISTENER_BUS_STOP_WAITING_TIMEOUT_MILLIS) + + // Stopping has been finished + assert(stoppingThread.isAlive === false) + // All queues are removed + assert(bus.activeQueues() === Set.empty) + assert(counter1.count === 5) + assert(counter2.count === 5) + assert(interruptingListener.count === 3) + } + } + test("event queue size can be configued through spark conf") { // configure the shared queue size to be 1, event log queue size to be 2, // and listner bus event queue size to be 5 @@ -563,7 +601,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match /** * Assert that the given list of numbers has an average that is greater than zero. */ - private def checkNonZeroAvg(m: Iterable[Long], msg: String) { + private def checkNonZeroAvg(m: Iterable[Long], msg: String): Unit = { assert(m.sum / m.size.toDouble > 0.0, msg) } @@ -574,7 +612,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val stageInfos = mutable.Map[StageInfo, Seq[(TaskInfo, TaskMetrics)]]() var taskInfoMetrics = mutable.Buffer[(TaskInfo, TaskMetrics)]() - override def onTaskEnd(task: SparkListenerTaskEnd) { + override def onTaskEnd(task: SparkListenerTaskEnd): Unit = { val info = task.taskInfo val metrics = task.taskMetrics if (info != null && metrics != null) { @@ -582,7 +620,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match } } - override def onStageCompleted(stage: SparkListenerStageCompleted) { + override def onStageCompleted(stage: SparkListenerStageCompleted): Unit = { stageInfos(stage.stageInfo) = taskInfoMetrics taskInfoMetrics = mutable.Buffer.empty } @@ -606,7 +644,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match notify() } - override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult) { + override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = { startedGettingResultTasks += 
taskGettingResult.taskInfo.index } } @@ -630,6 +668,35 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match } } } + + /** + * A simple listener that works as follows: + * 1. sleep and wait when `sleep` is true + * 2. when `sleep` is false, start to work: + * if it is interruptOnJobId, interrupt + * else count SparkListenerJobEnd numbers + */ + private class DelayInterruptingJobCounter( + val throwInterruptedException: Boolean, + val interruptOnJobId: Int) extends SparkListener { + @volatile var sleep = true + var count = 0 + + override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { + while (sleep) { + Thread.sleep(10) + } + if (interruptOnJobId == jobEnd.jobId) { + if (throwInterruptedException) { + throw new InterruptedException("got interrupted") + } else { + Thread.currentThread().interrupt() + } + } else { + count += 1 + } + } + } } // These classes can't be declared inside of the SparkListenerSuite class because we don't want diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala index a6576e0d1c520..c84735c9665a7 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala @@ -57,7 +57,7 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext private class SaveExecutorInfo extends SparkListener { val addedExecutorInfo = mutable.Map[String, ExecutorInfo]() - override def onExecutorAdded(executor: SparkListenerExecutorAdded) { + override def onExecutorAdded(executor: SparkListenerExecutorAdded): Unit = { addedExecutorInfo(executor.executorId) = executor.executorInfo } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala index 
c16b552d20891..394a2a9fbf7cb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala @@ -176,7 +176,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark if (stageAttemptNumber < 2) { // Throw FetchFailedException to explicitly trigger stage resubmission. A normal exception // will only trigger task resubmission in the same stage. - throw new FetchFailedException(null, 0, 0, 0, "Fake") + throw new FetchFailedException(null, 0, 0L, 0, 0, "Fake") } Seq(stageAttemptNumber).iterator }.collect() diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala index ae464352da440..2efe6da5e986f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskResultGetterSuite.scala @@ -25,18 +25,19 @@ import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.util.control.NonFatal -import com.google.common.util.concurrent.MoreExecutors import org.mockito.ArgumentCaptor import org.mockito.ArgumentMatchers.{any, anyLong} import org.mockito.Mockito.{spy, times, verify} +import org.scalatest.Assertions._ import org.scalatest.BeforeAndAfter import org.scalatest.concurrent.Eventually._ import org.apache.spark._ +import org.apache.spark.TaskState.TaskState import org.apache.spark.TestUtils.JavaSourceFromString import org.apache.spark.internal.config.Network.RPC_MESSAGE_MAX_SIZE import org.apache.spark.storage.TaskResultBlockId -import org.apache.spark.util.{MutableURLClassLoader, RpcUtils, Utils} +import org.apache.spark.util.{MutableURLClassLoader, RpcUtils, ThreadUtils, Utils} /** @@ -52,7 +53,7 @@ private class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: Task @volatile var removeBlockSuccessfully = false override 
def enqueueSuccessfulTask( - taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer) { + taskSetManager: TaskSetManager, tid: Long, serializedData: ByteBuffer): Unit = { if (!removedResult) { // Only remove the result once, since we'd like to test the case where the task eventually // succeeds. @@ -78,6 +79,16 @@ private class ResultDeletingTaskResultGetter(sparkEnv: SparkEnv, scheduler: Task } } +private class DummyTaskSchedulerImpl(sc: SparkContext) + extends TaskSchedulerImpl(sc, 1, true) { + override def handleFailedTask( + taskSetManager: TaskSetManager, + tid: Long, + taskState: TaskState, + reason: TaskFailedReason): Unit = { + // do nothing + } +} /** * A [[TaskResultGetter]] that stores the [[DirectTaskResult]]s it receives from executors @@ -87,7 +98,7 @@ private class MyTaskResultGetter(env: SparkEnv, scheduler: TaskSchedulerImpl) extends TaskResultGetter(env, scheduler) { // Use the current thread so we can access its results synchronously - protected override val getTaskResultExecutor = MoreExecutors.sameThreadExecutor() + protected override val getTaskResultExecutor = ThreadUtils.sameThreadExecutorService // DirectTaskResults that we receive from the executors private val _taskResults = new ArrayBuffer[DirectTaskResult[_]] @@ -130,6 +141,31 @@ class TaskResultGetterSuite extends SparkFunSuite with BeforeAndAfter with Local "Expect result to be removed from the block manager.") } + test("handling total size of results larger than maxResultSize") { + sc = new SparkContext("local", "test", conf) + val scheduler = new DummyTaskSchedulerImpl(sc) + val spyScheduler = spy(scheduler) + val resultGetter = new TaskResultGetter(sc.env, spyScheduler) + scheduler.taskResultGetter = resultGetter + val myTsm = new TaskSetManager(spyScheduler, FakeTask.createTaskSet(2), 1) { + // always returns false + override def canFetchMoreResults(size: Long): Boolean = false + } + val indirectTaskResult = IndirectTaskResult(TaskResultBlockId(0), 0) + val 
directTaskResult = new DirectTaskResult(ByteBuffer.allocate(0), Nil, Array()) + val ser = sc.env.closureSerializer.newInstance() + val serializedIndirect = ser.serialize(indirectTaskResult) + val serializedDirect = ser.serialize(directTaskResult) + resultGetter.enqueueSuccessfulTask(myTsm, 0, serializedDirect) + resultGetter.enqueueSuccessfulTask(myTsm, 1, serializedIndirect) + eventually(timeout(1.second)) { + verify(spyScheduler, times(1)).handleFailedTask( + myTsm, 0, TaskState.KILLED, TaskKilled("Tasks result size has exceeded maxResultSize")) + verify(spyScheduler, times(1)).handleFailedTask( + myTsm, 1, TaskState.KILLED, TaskKilled("Tasks result size has exceeded maxResultSize")) + } + } + test("task retried if result missing from block manager") { // Set the maximum number of task failures to > 0, so that the task set isn't aborted // after the result is missing. diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index cac6285e58417..e7ecf847ff4f4 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -26,7 +26,7 @@ import org.mockito.ArgumentMatchers.{any, anyInt, anyString, eq => meq} import org.mockito.Mockito.{atLeast, atMost, never, spy, times, verify, when} import org.scalatest.BeforeAndAfterEach import org.scalatest.concurrent.Eventually -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.internal.Logging @@ -36,9 +36,9 @@ import org.apache.spark.resource.TestResourceIDs._ import org.apache.spark.util.ManualClock class FakeSchedulerBackend extends SchedulerBackend { - def start() {} - def stop() {} - def reviveOffers() {} + def start(): Unit = {} + def stop(): Unit = {} + def reviveOffers(): Unit = {} def defaultParallelism(): Int = 1 
def maxNumConcurrentTasks(): Int = 0 } @@ -228,19 +228,19 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B taskScheduler.taskSetManagerForAttempt(taskset.stageId, taskset.stageAttemptId).get.isZombie } - val attempt1 = FakeTask.createTaskSet(1, 0) + val attempt1 = FakeTask.createTaskSet(1, stageId = 0, stageAttemptId = 0) taskScheduler.submitTasks(attempt1) // The first submitted taskset is active assert(!isTasksetZombie(attempt1)) - val attempt2 = FakeTask.createTaskSet(1, 1) + val attempt2 = FakeTask.createTaskSet(1, stageId = 0, stageAttemptId = 1) taskScheduler.submitTasks(attempt2) // The first submitted taskset is zombie now assert(isTasksetZombie(attempt1)) // The newly submitted taskset is active assert(!isTasksetZombie(attempt2)) - val attempt3 = FakeTask.createTaskSet(1, 2) + val attempt3 = FakeTask.createTaskSet(1, stageId = 0, stageAttemptId = 2) taskScheduler.submitTasks(attempt3) // The first submitted taskset remains zombie assert(isTasksetZombie(attempt1)) @@ -255,7 +255,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B val numFreeCores = 1 val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", numFreeCores)) - val attempt1 = FakeTask.createTaskSet(10) + val attempt1 = FakeTask.createTaskSet(10, stageId = 0, stageAttemptId = 0) // submit attempt 1, offer some resources, some tasks get scheduled taskScheduler.submitTasks(attempt1) @@ -271,7 +271,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(0 === taskDescriptions2.length) // if we schedule another attempt for the same stage, it should get scheduled - val attempt2 = FakeTask.createTaskSet(10, 1) + val attempt2 = FakeTask.createTaskSet(10, stageId = 0, stageAttemptId = 1) // submit attempt 2, offer some resources, some tasks get scheduled taskScheduler.submitTasks(attempt2) @@ -287,7 +287,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B 
val numFreeCores = 10 val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", numFreeCores)) - val attempt1 = FakeTask.createTaskSet(10) + val attempt1 = FakeTask.createTaskSet(10, stageId = 0, stageAttemptId = 0) // submit attempt 1, offer some resources, some tasks get scheduled taskScheduler.submitTasks(attempt1) @@ -303,7 +303,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(0 === taskDescriptions2.length) // submit attempt 2 - val attempt2 = FakeTask.createTaskSet(10, 1) + val attempt2 = FakeTask.createTaskSet(10, stageId = 0, stageAttemptId = 1) taskScheduler.submitTasks(attempt2) // attempt 1 finished (this can happen even if it was marked zombie earlier -- all tasks were @@ -497,7 +497,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B test("abort stage when all executors are blacklisted and we cannot acquire new executor") { taskScheduler = setupSchedulerWithMockTaskSetBlacklist() - val taskSet = FakeTask.createTaskSet(numTasks = 10, stageAttemptId = 0) + val taskSet = FakeTask.createTaskSet(numTasks = 10) taskScheduler.submitTasks(taskSet) val tsm = stageToMockTaskSetManager(0) @@ -539,7 +539,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B config.UNSCHEDULABLE_TASKSET_TIMEOUT.key -> "0") // We have only 1 task remaining with 1 executor - val taskSet = FakeTask.createTaskSet(numTasks = 1, stageAttemptId = 0) + val taskSet = FakeTask.createTaskSet(numTasks = 1) taskScheduler.submitTasks(taskSet) val tsm = stageToMockTaskSetManager(0) @@ -571,7 +571,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B config.UNSCHEDULABLE_TASKSET_TIMEOUT.key -> "10") // We have only 1 task remaining with 1 executor - val taskSet = FakeTask.createTaskSet(numTasks = 1, stageAttemptId = 0) + val taskSet = FakeTask.createTaskSet(numTasks = 1) taskScheduler.submitTasks(taskSet) val tsm = stageToMockTaskSetManager(0) @@ 
-910,7 +910,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B test("SPARK-16106 locality levels updated if executor added to existing host") { val taskScheduler = setupScheduler() - taskScheduler.submitTasks(FakeTask.createTaskSet(2, 0, + taskScheduler.submitTasks(FakeTask.createTaskSet(2, stageId = 0, stageAttemptId = 0, (0 until 2).map { _ => Seq(TaskLocation("host0", "executor2")) }: _* )) @@ -948,7 +948,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B test("scheduler checks for executors that can be expired from blacklist") { taskScheduler = setupScheduler() - taskScheduler.submitTasks(FakeTask.createTaskSet(1, 0)) + taskScheduler.submitTasks(FakeTask.createTaskSet(1, stageId = 0, stageAttemptId = 0)) taskScheduler.resourceOffers(IndexedSeq( new WorkerOffer("executor0", "host0", 1) )).flatten @@ -962,8 +962,8 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B taskScheduler.initialize(new FakeSchedulerBackend) // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks. new DAGScheduler(sc, taskScheduler) { - override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} - override def executorAdded(execId: String, host: String) {} + override def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = {} + override def executorAdded(execId: String, host: String): Unit = {} } val e0Offers = IndexedSeq(WorkerOffer("executor0", "host0", 1)) @@ -993,8 +993,8 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B taskScheduler.initialize(new FakeSchedulerBackend) // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks. 
new DAGScheduler(sc, taskScheduler) { - override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} - override def executorAdded(execId: String, host: String) {} + override def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = {} + override def executorAdded(execId: String, host: String): Unit = {} } val e0Offers = IndexedSeq(WorkerOffer("executor0", "host0", 1)) @@ -1044,8 +1044,8 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B } // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks. new DAGScheduler(sc, taskScheduler) { - override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} - override def executorAdded(execId: String, host: String) {} + override def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = {} + override def executorAdded(execId: String, host: String): Unit = {} } taskScheduler.initialize(new FakeSchedulerBackend) @@ -1084,8 +1084,8 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B } // Need to initialize a DAGScheduler for the taskScheduler to use for callbacks. new DAGScheduler(sc, taskScheduler) { - override def taskStarted(task: Task[_], taskInfo: TaskInfo) {} - override def executorAdded(execId: String, host: String) {} + override def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = {} + override def executorAdded(execId: String, host: String): Unit = {} } taskScheduler.initialize(new FakeSchedulerBackend) // make an offer on the preferred host so the scheduler knows its alive. 
This is necessary @@ -1154,6 +1154,29 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B assert(3 === taskDescriptions.length) } + test("SPARK-29263: barrier TaskSet can't schedule when higher prio taskset takes the slots") { + val taskCpus = 2 + val taskScheduler = setupSchedulerWithMaster( + s"local[$taskCpus]", + config.CPUS_PER_TASK.key -> taskCpus.toString) + + val numFreeCores = 3 + val workerOffers = IndexedSeq( + new WorkerOffer("executor0", "host0", numFreeCores, Some("192.168.0.101:49625")), + new WorkerOffer("executor1", "host1", numFreeCores, Some("192.168.0.101:49627")), + new WorkerOffer("executor2", "host2", numFreeCores, Some("192.168.0.101:49629"))) + val barrier = FakeTask.createBarrierTaskSet(3, stageId = 0, stageAttemptId = 0, priority = 1) + val highPrio = FakeTask.createTaskSet(1, stageId = 1, stageAttemptId = 0, priority = 0) + + // submit highPrio and barrier taskSet + taskScheduler.submitTasks(highPrio) + taskScheduler.submitTasks(barrier) + val taskDescriptions = taskScheduler.resourceOffers(workerOffers).flatten + // it schedules the highPrio task first, and then will not have enough slots to schedule + // the barrier taskset + assert(1 === taskDescriptions.length) + } + test("cancelTasks shall kill all the running tasks and fail the stage") { val taskScheduler = setupScheduler() @@ -1169,7 +1192,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B } }) - val attempt1 = FakeTask.createTaskSet(10, 0) + val attempt1 = FakeTask.createTaskSet(10) taskScheduler.submitTasks(attempt1) val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 1), @@ -1200,7 +1223,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B } }) - val attempt1 = FakeTask.createTaskSet(10, 0) + val attempt1 = FakeTask.createTaskSet(10) taskScheduler.submitTasks(attempt1) val workerOffers = IndexedSeq(new WorkerOffer("executor0", "host0", 1), diff --git 
a/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala index b3bc76687ce1b..ed97a4c206ca3 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetBlacklistSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.scheduler import org.mockito.ArgumentMatchers.isA import org.mockito.Mockito.{never, verify} import org.scalatest.BeforeAndAfterEach -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.config diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index fedfa083e8d8f..b740e357903a2 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -21,16 +21,22 @@ import java.util.{Properties, Random} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration._ +import org.apache.hadoop.fs.FileAlreadyExistsException import org.mockito.ArgumentMatchers.{any, anyBoolean, anyInt, anyString} import org.mockito.Mockito._ import org.mockito.invocation.InvocationOnMock +import org.scalatest.Assertions._ +import org.scalatest.PrivateMethodTester +import org.scalatest.concurrent.Eventually import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.resource.ResourceUtils._ import org.apache.spark.resource.TestResourceIDs._ +import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockManagerId import org.apache.spark.util.{AccumulatorV2, 
ManualClock} @@ -38,7 +44,7 @@ import org.apache.spark.util.{AccumulatorV2, ManualClock} class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler) extends DAGScheduler(sc) { - override def taskStarted(task: Task[_], taskInfo: TaskInfo) { + override def taskStarted(task: Task[_], taskInfo: TaskInfo): Unit = { taskScheduler.startedTasks += taskInfo.index } @@ -48,13 +54,13 @@ class FakeDAGScheduler(sc: SparkContext, taskScheduler: FakeTaskScheduler) result: Any, accumUpdates: Seq[AccumulatorV2[_, _]], metricPeaks: Array[Long], - taskInfo: TaskInfo) { + taskInfo: TaskInfo): Unit = { taskScheduler.endedTasks(taskInfo.index) = reason } - override def executorAdded(execId: String, host: String) {} + override def executorAdded(execId: String, host: String): Unit = {} - override def executorLost(execId: String, reason: ExecutorLossReason) {} + override def executorLost(execId: String, reason: ExecutorLossReason): Unit = {} override def taskSetFailed( taskSet: TaskSet, @@ -74,13 +80,13 @@ object FakeRackUtil { var numBatchInvocation = 0 var numSingleHostInvocation = 0 - def cleanUp() { + def cleanUp(): Unit = { hostToRack.clear() numBatchInvocation = 0 numSingleHostInvocation = 0 } - def assignHostToRack(host: String, rack: String) { + def assignHostToRack(host: String, rack: String): Unit = { hostToRack(host) = rack } @@ -124,10 +130,10 @@ class FakeTaskScheduler(sc: SparkContext, liveExecutors: (String, String)* /* ex dagScheduler = new FakeDAGScheduler(sc, this) - def removeExecutor(execId: String) { + def removeExecutor(execId: String): Unit = { executors -= execId val host = executorIdToHost.get(execId) - assert(host != None) + assert(host.isDefined) val hostId = host.get val executorsOnHost = hostToExecutors(hostId) executorsOnHost -= execId @@ -149,7 +155,7 @@ class FakeTaskScheduler(sc: SparkContext, liveExecutors: (String, String)* /* ex hostsByRack.get(rack) != None } - def addExecutor(execId: String, host: String) { + def addExecutor(execId: 
String, host: String): Unit = { executors.put(execId, host) val executorsOnHost = hostToExecutors.getOrElseUpdate(host, new mutable.HashSet[String]) executorsOnHost += execId @@ -177,7 +183,12 @@ class LargeTask(stageId: Int) extends Task[Array[Byte]](stageId, 0, 0) { override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]() } -class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logging { +class TaskSetManagerSuite + extends SparkFunSuite + with LocalSparkContext + with PrivateMethodTester + with Eventually + with Logging { import TaskLocality.{ANY, PROCESS_LOCAL, NO_PREF, NODE_LOCAL, RACK_LOCAL} private val conf = new SparkConf @@ -1262,7 +1273,7 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg // now fail those tasks tsmSpy.handleFailedTask(taskDescs(0).taskId, TaskState.FAILED, - FetchFailed(BlockManagerId(taskDescs(0).executorId, "host1", 12345), 0, 0, 0, "ignored")) + FetchFailed(BlockManagerId(taskDescs(0).executorId, "host1", 12345), 0, 0L, 0, 0, "ignored")) tsmSpy.handleFailedTask(taskDescs(1).taskId, TaskState.FAILED, ExecutorLostFailure(taskDescs(1).executorId, exitCausedByApp = false, reason = None)) tsmSpy.handleFailedTask(taskDescs(2).taskId, TaskState.FAILED, @@ -1302,7 +1313,7 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg // Fail the task with fetch failure tsm.handleFailedTask(taskDescs(0).taskId, TaskState.FAILED, - FetchFailed(BlockManagerId(taskDescs(0).executorId, "host1", 12345), 0, 0, 0, "ignored")) + FetchFailed(BlockManagerId(taskDescs(0).executorId, "host1", 12345), 0, 0L, 0, 0, "ignored")) assert(blacklistTracker.isNodeBlacklisted("host1")) } @@ -1775,4 +1786,208 @@ class TaskSetManagerSuite extends SparkFunSuite with LocalSparkContext with Logg assert(!manager.checkSpeculatableTasks(0)) assert(manager.resourceOffer("exec1", "host1", ANY).isEmpty) } + + private def testSpeculationDurationSetup( + speculationThresholdOpt: 
Option[String], + speculationQuantile: Double, + numTasks: Int, + numExecutorCores: Int, + numCoresPerTask: Int): (TaskSetManager, ManualClock) = { + sc = new SparkContext("local", "test") + sc.conf.set(config.SPECULATION_ENABLED, true) + sc.conf.set(config.SPECULATION_QUANTILE.key, speculationQuantile.toString) + // Set the number of slots per executor + sc.conf.set(config.EXECUTOR_CORES.key, numExecutorCores.toString) + sc.conf.set(config.CPUS_PER_TASK.key, numCoresPerTask.toString) + if (speculationThresholdOpt.isDefined) { + sc.conf.set(config.SPECULATION_TASK_DURATION_THRESHOLD.key, speculationThresholdOpt.get) + } + sched = new FakeTaskScheduler(sc, ("exec1", "host1"), ("exec2", "host2")) + // Create a task set with the given number of tasks + val taskSet = FakeTask.createTaskSet(numTasks) + val clock = new ManualClock() + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES, clock = clock) + manager.isZombie = false + + // Offer resources for the task to start + for (i <- 1 to numTasks) { + manager.resourceOffer(s"exec$i", s"host$i", NO_PREF) + } + (manager, clock) + } + + private def testSpeculationDurationThreshold( + speculationThresholdProvided: Boolean, + numTasks: Int, + numSlots: Int): Unit = { + val (manager, clock) = testSpeculationDurationSetup( + // Set the threshold to be 60 minutes + if (speculationThresholdProvided) Some("60min") else None, + // Set the quantile to be 1.0 so that regular speculation would not be triggered + speculationQuantile = 1.0, + numTasks, + numSlots, + numCoresPerTask = 1 + ) + + // if the time threshold has not been exceeded, no speculative run should be triggered + clock.advance(1000*60*60) + assert(!manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == 0) + + // Now the task should have been running for 60 minutes and 1 second + clock.advance(1000) + if (speculationThresholdProvided && numSlots >= numTasks) { + assert(manager.checkSpeculatableTasks(0)) + 
assert(sched.speculativeTasks.size == numTasks) + // Should not submit duplicated tasks + assert(!manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == numTasks) + } else { + // If the feature flag is turned off, or the stage contains too many tasks + assert(!manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == 0) + } + } + + Seq(1, 2).foreach { numTasks => + test("SPARK-29976 when a speculation time threshold is provided, should speculative " + + s"run the task even if there are not enough successful runs, total tasks: $numTasks") { + testSpeculationDurationThreshold(true, numTasks, numTasks) + } + + test("SPARK-29976: when the speculation time threshold is not provided," + + s"don't speculative run if there are not enough successful runs, total tasks: $numTasks") { + testSpeculationDurationThreshold(false, numTasks, numTasks) + } + } + + test("SPARK-29976 when a speculation time threshold is provided, should not speculative " + + "if there are too many tasks in the stage even though time threshold is provided") { + testSpeculationDurationThreshold(true, 2, 1) + } + + test("SPARK-29976 Regular speculation configs should still take effect even when a " + + "threshold is provided") { + val (manager, clock) = testSpeculationDurationSetup( + Some("60min"), + speculationQuantile = 0.5, + numTasks = 2, + numExecutorCores = 2, + numCoresPerTask = 1 + ) + + // Task duration can't be 0, advance 1 sec + clock.advance(1000) + // Mark one of the task succeeded, which should satisfy the quantile + manager.handleSuccessfulTask(0, createTaskResult(0)) + // Advance 1 more second so the remaining task takes longer than medium but doesn't satisfy the + // duration threshold yet + clock.advance(1000) + assert(manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == 1) + } + + test("SPARK-30417 when spark.task.cpus is greater than spark.executor.cores due to " + + "standalone settings, speculate if there is only one task in 
the stage") { + val (manager, clock) = testSpeculationDurationSetup( + Some("60min"), + // Set the quantile to be 1.0 so that regular speculation would not be triggered + speculationQuantile = 1.0, + numTasks = 1, + numExecutorCores = 1, + numCoresPerTask = 2 + ) + + clock.advance(1000*60*60) + assert(!manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == 0) + // Now the task should have been running for 60 minutes and 1 second + clock.advance(1000) + assert(manager.checkSpeculatableTasks(0)) + assert(sched.speculativeTasks.size == 1) + } + + test("TaskOutputFileAlreadyExistException lead to task set abortion") { + sc = new SparkContext("local", "test") + sched = new FakeTaskScheduler(sc, ("exec1", "host1")) + val taskSet = FakeTask.createTaskSet(1) + val manager = new TaskSetManager(sched, taskSet, MAX_TASK_FAILURES) + assert(sched.taskSetsFailed.isEmpty) + + val offerResult = manager.resourceOffer("exec1", "host1", ANY) + assert(offerResult.isDefined, + "Expect resource offer on iteration 0 to return a task") + assert(offerResult.get.index === 0) + val reason = new ExceptionFailure( + new TaskOutputFileAlreadyExistException( + new FileAlreadyExistsException("file already exists")), + Seq.empty[AccumulableInfo]) + manager.handleFailedTask(offerResult.get.taskId, TaskState.FAILED, reason) + assert(sched.taskSetsFailed.contains(taskSet.id)) + } + + test("SPARK-30359: don't clean executorsPendingToRemove " + + "at the beginning of CoarseGrainedSchedulerBackend.reset") { + val conf = new SparkConf() + // use local-cluster mode in order to get CoarseGrainedSchedulerBackend + .setMaster("local-cluster[2, 1, 2048]") + // allow to set up at most two executors + .set("spark.cores.max", "2") + .setAppName("CoarseGrainedSchedulerBackend.reset") + sc = new SparkContext(conf) + val sched = sc.taskScheduler + val backend = sc.schedulerBackend.asInstanceOf[CoarseGrainedSchedulerBackend] + + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) + + val tasks = 
Array.tabulate[Task[_]](2)(partition => new FakeLongTasks(stageId = 0, partition)) + val taskSet: TaskSet = new TaskSet(tasks, stageId = 0, stageAttemptId = 0, priority = 0, null) + val stageId = taskSet.stageId + val stageAttemptId = taskSet.stageAttemptId + sched.submitTasks(taskSet) + val taskSetManagers = + PrivateMethod[mutable.HashMap[Int, mutable.HashMap[Int, TaskSetManager]]]( + Symbol("taskSetsByStageIdAndAttempt")) + // get the TaskSetManager + val manager = sched.invokePrivate(taskSetManagers()).get(stageId).get(stageAttemptId) + + val (task0, task1) = eventually(timeout(10.seconds), interval(100.milliseconds)) { + (manager.taskInfos(0), manager.taskInfos(1)) + } + + val (taskId0, index0, exec0) = (task0.taskId, task0.index, task0.executorId) + val (taskId1, index1, exec1) = (task1.taskId, task1.index, task1.executorId) + // set up two running tasks + assert(manager.taskInfos(taskId0).running) + assert(manager.taskInfos(taskId1).running) + + val numFailures = PrivateMethod[Array[Int]](Symbol("numFailures")) + // no task failures yet + assert(manager.invokePrivate(numFailures())(index0) === 0) + assert(manager.invokePrivate(numFailures())(index1) === 0) + + // let exec1 count task failures but exec0 doesn't + backend.executorsPendingToRemove(exec0) = true + backend.executorsPendingToRemove(exec1) = false + + backend.reset() + + eventually(timeout(10.seconds), interval(100.milliseconds)) { + // executorsPendingToRemove should eventually be empty after reset() + assert(backend.executorsPendingToRemove.isEmpty) + assert(manager.invokePrivate(numFailures())(index0) === 0) + assert(manager.invokePrivate(numFailures())(index1) === 1) + } + } +} + +class FakeLongTasks(stageId: Int, partitionId: Int) extends FakeTask(stageId, partitionId) { + + override def runTask(context: TaskContext): Int = { + while (true) { + Thread.sleep(10000) + } + 0 + } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala 
b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala new file mode 100644 index 0000000000000..15733b0d932ec --- /dev/null +++ b/core/src/test/scala/org/apache/spark/scheduler/WorkerDecommissionSuite.scala @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.scheduler + +import java.util.concurrent.Semaphore + +import scala.concurrent.TimeoutException +import scala.concurrent.duration._ + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} +import org.apache.spark.internal.config +import org.apache.spark.scheduler.cluster.StandaloneSchedulerBackend +import org.apache.spark.util.{RpcUtils, SerializableBuffer, ThreadUtils} + +class WorkerDecommissionSuite extends SparkFunSuite with LocalSparkContext { + + override def beforeEach(): Unit = { + val conf = new SparkConf().setAppName("test").setMaster("local") + .set(config.Worker.WORKER_DECOMMISSION_ENABLED, true) + + sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) + } + + test("verify task with no decommissioning works as expected") { + val input = sc.parallelize(1 to 10) + input.count() + val sleepyRdd = input.mapPartitions{ x => + Thread.sleep(100) + x + } + assert(sleepyRdd.count() === 10) + } + + test("verify a task with all workers decommissioned succeeds") { + val input = sc.parallelize(1 to 10) + // Do a count to wait for the executors to be registered. + input.count() + val sleepyRdd = input.mapPartitions{ x => + Thread.sleep(50) + x + } + // Listen for the job + val sem = new Semaphore(0) + sc.addSparkListener(new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + sem.release() + } + }) + // Start the task. + val asyncCount = sleepyRdd.countAsync() + // Wait for the job to have started + sem.acquire(1) + // Decommission all the executors, this should not halt the current task. + // decom.sh message passing is tested manually. 
+ val sched = sc.schedulerBackend.asInstanceOf[StandaloneSchedulerBackend] + val execs = sched.getExecutorIds() + execs.foreach(execId => sched.decommissionExecutor(execId)) + val asyncCountResult = ThreadUtils.awaitResult(asyncCount, 2.seconds) + assert(asyncCountResult === 10) + // Try and launch task after decommissioning, this should fail + val postDecommissioned = input.map(x => x) + val postDecomAsyncCount = postDecommissioned.countAsync() + val thrown = intercept[java.util.concurrent.TimeoutException]{ + val result = ThreadUtils.awaitResult(postDecomAsyncCount, 2.seconds) + } + assert(postDecomAsyncCount.isCompleted === false, + "After exec decommission new task could not launch") + } +} diff --git a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala index d3feb35537b34..3596a9ebb1f5a 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/dynalloc/ExecutorMonitorSuite.scala @@ -27,15 +27,18 @@ import org.mockito.Mockito.{doAnswer, mock, when} import org.apache.spark._ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config._ +import org.apache.spark.resource.ResourceProfile.{DEFAULT_RESOURCE_PROFILE_ID, UNKNOWN_RESOURCE_PROFILE_ID} +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ +import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.storage._ import org.apache.spark.util.ManualClock class ExecutorMonitorSuite extends SparkFunSuite { - private val idleTimeoutMs = TimeUnit.SECONDS.toMillis(60L) - private val storageTimeoutMs = TimeUnit.SECONDS.toMillis(120L) - private val shuffleTimeoutMs = TimeUnit.SECONDS.toMillis(240L) + private val idleTimeoutNs = TimeUnit.SECONDS.toNanos(60L) + private val storageTimeoutNs = TimeUnit.SECONDS.toNanos(120L) + private val 
shuffleTimeoutNs = TimeUnit.SECONDS.toNanos(240L) private val conf = new SparkConf() .set(DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT.key, "60s") @@ -47,6 +50,9 @@ class ExecutorMonitorSuite extends SparkFunSuite { private var client: ExecutorAllocationClient = _ private var clock: ManualClock = _ + private val execInfo = new ExecutorInfo("host1", 1, Map.empty, + Map.empty, Map.empty, DEFAULT_RESOURCE_PROFILE_ID) + // List of known executors. Allows easily mocking which executors are alive without // having to use mockito APIs directly in each test. private val knownExecs = mutable.HashSet[String]() @@ -64,10 +70,12 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("basic executor timeout") { knownExecs += "1" - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) assert(monitor.executorCount === 1) assert(monitor.isExecutorIdle("1")) assert(monitor.timedOutExecutors(idleDeadline) === Seq("1")) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 1) + assert(monitor.getResourceProfileId("1") === DEFAULT_RESOURCE_PROFILE_ID) } test("SPARK-4951, SPARK-26927: handle out of order task start events") { @@ -75,26 +83,38 @@ class ExecutorMonitorSuite extends SparkFunSuite { monitor.onTaskStart(SparkListenerTaskStart(1, 1, taskInfo("1", 1))) assert(monitor.executorCount === 1) + assert(monitor.executorCountWithResourceProfile(UNKNOWN_RESOURCE_PROFILE_ID) === 1) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) assert(monitor.executorCount === 1) + assert(monitor.executorCountWithResourceProfile(UNKNOWN_RESOURCE_PROFILE_ID) === 0) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 1) + assert(monitor.getResourceProfileId("1") === DEFAULT_RESOURCE_PROFILE_ID) - 
monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", execInfo)) assert(monitor.executorCount === 2) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 2) + assert(monitor.getResourceProfileId("2") === DEFAULT_RESOURCE_PROFILE_ID) monitor.onExecutorRemoved(SparkListenerExecutorRemoved(clock.getTimeMillis(), "2", null)) assert(monitor.executorCount === 1) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 1) knownExecs -= "2" monitor.onTaskStart(SparkListenerTaskStart(1, 1, taskInfo("2", 2))) assert(monitor.executorCount === 1) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 1) + + monitor.onExecutorRemoved(SparkListenerExecutorRemoved(clock.getTimeMillis(), "1", null)) + assert(monitor.executorCount === 0) + assert(monitor.executorCountWithResourceProfile(DEFAULT_RESOURCE_PROFILE_ID) === 0) } test("track tasks running on executor") { knownExecs += "1" - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onTaskStart(SparkListenerTaskStart(1, 1, taskInfo("1", 1))) assert(!monitor.isExecutorIdle("1")) @@ -111,13 +131,13 @@ class ExecutorMonitorSuite extends SparkFunSuite { monitor.onTaskEnd(SparkListenerTaskEnd(1, 1, "foo", Success, taskInfo("1", 1), new ExecutorMetrics, null)) assert(monitor.isExecutorIdle("1")) - assert(monitor.timedOutExecutors(clock.getTimeMillis()).isEmpty) - assert(monitor.timedOutExecutors(clock.getTimeMillis() + idleTimeoutMs + 1) === Seq("1")) + assert(monitor.timedOutExecutors(clock.nanoTime()).isEmpty) + assert(monitor.timedOutExecutors(clock.nanoTime() + idleTimeoutNs + 1) === Seq("1")) } test("use appropriate time out depending on whether blocks are stored") { knownExecs += "1" - 
monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) assert(monitor.isExecutorIdle("1")) assert(monitor.timedOutExecutors(idleDeadline) === Seq("1")) @@ -139,7 +159,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { } test("keeps track of stored blocks for each rdd and split") { - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onBlockUpdated(rddUpdate(1, 0, "1")) assert(monitor.timedOutExecutors(idleDeadline).isEmpty) @@ -166,27 +186,27 @@ class ExecutorMonitorSuite extends SparkFunSuite { // originally went idle. clock.setTime(idleDeadline) monitor.onUnpersistRDD(SparkListenerUnpersistRDD(2)) - assert(monitor.timedOutExecutors(clock.getTimeMillis()) === Seq("1")) + assert(monitor.timedOutExecutors(clock.nanoTime()) === Seq("1")) } test("handle timeouts correctly with multiple executors") { knownExecs ++= Set("1", "2", "3") // start exec 1 at 0s (should idle time out at 60s) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) assert(monitor.isExecutorIdle("1")) // start exec 2 at 30s, store a block (should idle time out at 150s) clock.setTime(TimeUnit.SECONDS.toMillis(30)) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", execInfo)) monitor.onBlockUpdated(rddUpdate(1, 0, "2")) assert(monitor.isExecutorIdle("2")) assert(!monitor.timedOutExecutors(idleDeadline).contains("2")) // start exec 3 at 60s (should idle timeout at 120s, exec 1 should time out) clock.setTime(TimeUnit.SECONDS.toMillis(60)) - 
monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", null)) - assert(monitor.timedOutExecutors(clock.getTimeMillis()) === Seq("1")) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", execInfo)) + assert(monitor.timedOutExecutors(clock.nanoTime()) === Seq("1")) // store block on exec 3 (should now idle time out at 180s) monitor.onBlockUpdated(rddUpdate(1, 0, "3")) @@ -196,16 +216,16 @@ class ExecutorMonitorSuite extends SparkFunSuite { // advance to 140s, remove block from exec 3 (time out immediately) clock.setTime(TimeUnit.SECONDS.toMillis(140)) monitor.onBlockUpdated(rddUpdate(1, 0, "3", level = StorageLevel.NONE)) - assert(monitor.timedOutExecutors(clock.getTimeMillis()).toSet === Set("1", "3")) + assert(monitor.timedOutExecutors(clock.nanoTime()).toSet === Set("1", "3")) // advance to 150s, now exec 2 should time out clock.setTime(TimeUnit.SECONDS.toMillis(150)) - assert(monitor.timedOutExecutors(clock.getTimeMillis()).toSet === Set("1", "2", "3")) + assert(monitor.timedOutExecutors(clock.nanoTime()).toSet === Set("1", "2", "3")) } test("SPARK-27677: don't track blocks stored on disk when using shuffle service") { // First make sure that blocks on disk are counted when no shuffle service is available. 
- monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onBlockUpdated(rddUpdate(1, 0, "1", level = StorageLevel.DISK_ONLY)) assert(monitor.timedOutExecutors(idleDeadline).isEmpty) assert(monitor.timedOutExecutors(storageDeadline) === Seq("1")) @@ -213,7 +233,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { conf.set(SHUFFLE_SERVICE_ENABLED, true).set(SHUFFLE_SERVICE_FETCH_RDD_ENABLED, true) monitor = new ExecutorMonitor(conf, client, null, clock) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onBlockUpdated(rddUpdate(1, 0, "1", level = StorageLevel.MEMORY_ONLY)) monitor.onBlockUpdated(rddUpdate(1, 1, "1", level = StorageLevel.MEMORY_ONLY)) assert(monitor.timedOutExecutors(idleDeadline).isEmpty) @@ -236,25 +256,28 @@ class ExecutorMonitorSuite extends SparkFunSuite { test("track executors pending for removal") { knownExecs ++= Set("1", "2", "3") - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", null)) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", null)) + val execInfoRp1 = new ExecutorInfo("host1", 1, Map.empty, + Map.empty, Map.empty, 1) + + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", execInfo)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "3", execInfoRp1)) clock.setTime(idleDeadline) - assert(monitor.timedOutExecutors().toSet === Set("1", "2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("1", 0), ("2", 0), ("3", 1))) assert(monitor.pendingRemovalCount === 0) // Notify 
that only a subset of executors was killed, to mimic the case where the scheduler // refuses to kill an executor that is busy for whatever reason the monitor hasn't detected yet. monitor.executorsKilled(Seq("1")) - assert(monitor.timedOutExecutors().toSet === Set("2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0), ("3", 1))) assert(monitor.pendingRemovalCount === 1) // Check the timed out executors again so that we're sure they're still timed out when no // events happen. This ensures that the monitor doesn't lose track of them. - assert(monitor.timedOutExecutors().toSet === Set("2", "3")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0), ("3", 1))) monitor.onTaskStart(SparkListenerTaskStart(1, 1, taskInfo("2", 1))) - assert(monitor.timedOutExecutors().toSet === Set("3")) + assert(monitor.timedOutExecutors().toSet === Set(("3", 1))) monitor.executorsKilled(Seq("3")) assert(monitor.pendingRemovalCount === 2) @@ -263,7 +286,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { new ExecutorMetrics, null)) assert(monitor.timedOutExecutors().isEmpty) clock.advance(idleDeadline) - assert(monitor.timedOutExecutors().toSet === Set("2")) + assert(monitor.timedOutExecutors().toSet === Set(("2", 0))) } test("shuffle block tracking") { @@ -286,7 +309,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { monitor.onJobStart(SparkListenerJobStart(1, clock.getTimeMillis(), Seq(stage1, stage2))) monitor.onJobStart(SparkListenerJobStart(2, clock.getTimeMillis(), Seq(stage3, stage4))) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) assert(monitor.timedOutExecutors(idleDeadline) === Seq("1")) // First a failed task, to make sure it does not count. 
@@ -342,7 +365,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { throw new IllegalStateException("No event should be sent.") } } - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.shuffleCleaned(0) } @@ -351,8 +374,8 @@ class ExecutorMonitorSuite extends SparkFunSuite { conf.set(DYN_ALLOCATION_SHUFFLE_TRACKING, true).set(SHUFFLE_SERVICE_ENABLED, false) monitor = new ExecutorMonitor(conf, client, bus, clock) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "2", execInfo)) // Two separate jobs with separate shuffles. The first job will only run tasks on // executor 1, the second on executor 2. 
Ensures that jobs finishing don't affect @@ -401,7 +424,7 @@ class ExecutorMonitorSuite extends SparkFunSuite { val stage = stageInfo(1, shuffleId = 0) monitor.onJobStart(SparkListenerJobStart(1, clock.getTimeMillis(), Seq(stage))) clock.advance(1000L) - monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", null)) + monitor.onExecutorAdded(SparkListenerExecutorAdded(clock.getTimeMillis(), "1", execInfo)) monitor.onTaskStart(SparkListenerTaskStart(1, 0, taskInfo("1", 1))) monitor.onTaskEnd(SparkListenerTaskEnd(1, 0, "foo", Success, taskInfo("1", 1), new ExecutorMetrics, null)) @@ -410,13 +433,14 @@ class ExecutorMonitorSuite extends SparkFunSuite { assert(monitor.timedOutExecutors(idleDeadline).isEmpty) } - private def idleDeadline: Long = clock.getTimeMillis() + idleTimeoutMs + 1 - private def storageDeadline: Long = clock.getTimeMillis() + storageTimeoutMs + 1 - private def shuffleDeadline: Long = clock.getTimeMillis() + shuffleTimeoutMs + 1 + private def idleDeadline: Long = clock.nanoTime() + idleTimeoutNs + 1 + private def storageDeadline: Long = clock.nanoTime() + storageTimeoutNs + 1 + private def shuffleDeadline: Long = clock.nanoTime() + shuffleTimeoutNs + 1 private def stageInfo(id: Int, shuffleId: Int = -1): StageInfo = { new StageInfo(id, 0, s"stage$id", 1, Nil, Nil, "", - shuffleDepId = if (shuffleId >= 0) Some(shuffleId) else None) + shuffleDepId = if (shuffleId >= 0) Some(shuffleId) else None, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } private def taskInfo( diff --git a/core/src/test/scala/org/apache/spark/security/EncryptionFunSuite.scala b/core/src/test/scala/org/apache/spark/security/EncryptionFunSuite.scala index be6b8a6b5b108..213f0ba2ec180 100644 --- a/core/src/test/scala/org/apache/spark/security/EncryptionFunSuite.scala +++ b/core/src/test/scala/org/apache/spark/security/EncryptionFunSuite.scala @@ -27,7 +27,7 @@ trait EncryptionFunSuite { * Runs a test twice, initializing a SparkConf object 
with encryption off, then on. It's ok * for the test to modify the provided SparkConf. */ - final protected def encryptionTest(name: String)(fn: SparkConf => Unit) { + final protected def encryptionTest(name: String)(fn: SparkConf => Unit): Unit = { encryptionTestHelper(name) { case (name, conf) => test(name)(fn(conf)) } diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala index 2915b99dcfb60..953b651c72a83 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerBenchmark.scala @@ -25,6 +25,7 @@ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Kryo._ +import org.apache.spark.launcher.SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS import org.apache.spark.serializer.KryoTest._ import org.apache.spark.util.ThreadUtils @@ -71,6 +72,9 @@ object KryoSerializerBenchmark extends BenchmarkBase { def createSparkContext(usePool: Boolean): SparkContext = { val conf = new SparkConf() + // SPARK-29282 This is for consistency between JDK8 and JDK11. 
+ conf.set(EXECUTOR_EXTRA_JAVA_OPTIONS, + "-XX:+UseParallelGC -XX:-UseDynamicNumberOfGCThreads") conf.set(SERIALIZER, "org.apache.spark.serializer.KryoSerializer") conf.set(KRYO_USER_REGISTRATORS, classOf[MyRegistrator].getName) conf.set(KRYO_USE_POOL, usePool) diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala index 5d76c096d46ac..d4fafab4a5d64 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala @@ -56,7 +56,7 @@ object KryoDistributedTest { class MyCustomClass class AppJarRegistrator extends KryoRegistrator { - override def registerClasses(k: Kryo) { + override def registerClasses(k: Kryo): Unit = { k.register(Utils.classForName(AppJarRegistrator.customClassName, noSparkClassLoader = true)) } diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index 2442670b6d3f0..4c47a67ee9ffc 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -34,10 +34,11 @@ import org.roaringbitmap.RoaringBitmap import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.internal.config._ import org.apache.spark.internal.config.Kryo._ +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage import org.apache.spark.scheduler.HighlyCompressedMapStatus import org.apache.spark.serializer.KryoTest._ import org.apache.spark.storage.BlockManagerId -import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.ThreadUtils class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set(SERIALIZER, 
"org.apache.spark.serializer.KryoSerializer") @@ -86,7 +87,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set(KRYO_REGISTRATION_REQUIRED, true) val ser = new KryoSerializer(conf).newInstance() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) } check(1) @@ -119,7 +120,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set(KRYO_REGISTRATION_REQUIRED, true) val ser = new KryoSerializer(conf).newInstance() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) } check((1, 1)) @@ -146,7 +147,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { conf.set(KRYO_REGISTRATION_REQUIRED, true) val ser = new KryoSerializer(conf).newInstance() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) } check(List[Int]()) @@ -173,7 +174,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { test("Bug: SPARK-10251") { val ser = new KryoSerializer(conf.clone.set(KRYO_REGISTRATION_REQUIRED, true)) .newInstance() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) } check((1, 3)) @@ -202,7 +203,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { test("ranges") { val ser = new KryoSerializer(conf).newInstance() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) // Check that very long ranges don't get written one element at a time assert(ser.serialize(t).limit() < 200) @@ -238,7 +239,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { test("custom registrator") { val ser = new KryoSerializer(conf).newInstance() - def check[T: ClassTag](t: T) { + def check[T: 
ClassTag](t: T): Unit = { assert(ser.deserialize[T](ser.serialize(t)) === t) } @@ -274,19 +275,19 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { } test("kryo with parallelize for specialized tuples") { - assert (sc.parallelize( Array((1, 11), (2, 22), (3, 33)) ).count === 3) + assert(sc.parallelize(Seq((1, 11), (2, 22), (3, 33))).count === 3) } test("kryo with parallelize for primitive arrays") { - assert (sc.parallelize( Array(1, 2, 3) ).count === 3) + assert(sc.parallelize(Array(1, 2, 3)).count === 3) } test("kryo with collect for specialized tuples") { - assert (sc.parallelize( Array((1, 11), (2, 22), (3, 33)) ).collect().head === ((1, 11))) + assert(sc.parallelize(Seq((1, 11), (2, 22), (3, 33))).collect().head === ((1, 11))) } test("kryo with SerializableHyperLogLog") { - assert(sc.parallelize( Array(1, 2, 3, 2, 3, 3, 2, 3, 1) ).countApproxDistinct(0.01) === 3) + assert(sc.parallelize(Array(1, 2, 3, 2, 3, 3, 2, 3, 1)).countApproxDistinct(0.01) === 3) } test("kryo with reduce") { @@ -350,8 +351,31 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { val ser = new KryoSerializer(conf).newInstance() val denseBlockSizes = new Array[Long](5000) val sparseBlockSizes = Array[Long](0L, 1L, 0L, 2L) + var mapTaskId = 0 Seq(denseBlockSizes, sparseBlockSizes).foreach { blockSizes => - ser.serialize(HighlyCompressedMapStatus(BlockManagerId("exec-1", "host", 1234), blockSizes)) + mapTaskId += 1 + ser.serialize(HighlyCompressedMapStatus( + BlockManagerId("exec-1", "host", 1234), blockSizes, mapTaskId)) + } + } + + test("registration of TaskCommitMessage") { + val conf = new SparkConf(false) + conf.set(KRYO_REGISTRATION_REQUIRED, true) + + // HadoopMapReduceCommitProtocol.commitTask() returns a TaskCommitMessage containing a complex + // structure. 
+ + val ser = new KryoSerializer(conf).newInstance() + val addedAbsPathFiles = Map("test1" -> "test1", "test2" -> "test2") + val partitionPaths = Set("test3") + + val taskCommitMessage1 = new TaskCommitMessage(addedAbsPathFiles -> partitionPaths) + val taskCommitMessage2 = new TaskCommitMessage(Map.empty -> Set.empty) + Seq(taskCommitMessage1, taskCommitMessage2).foreach { taskCommitMessage => + val obj1 = ser.deserialize[TaskCommitMessage](ser.serialize(taskCommitMessage)).obj + val obj2 = taskCommitMessage.obj + assert(obj1 == obj2) } } @@ -460,7 +484,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { val tests = mutable.ListBuffer[Future[Boolean]]() - def check[T: ClassTag](t: T) { + def check[T: ClassTag](t: T): Unit = { tests += Future { val serializerInstance = ser.newInstance() serializerInstance.deserialize[T](serializerInstance.serialize(t)) === t @@ -579,7 +603,7 @@ object KryoTest { } class MyRegistrator extends KryoRegistrator { - override def registerClasses(k: Kryo) { + override def registerClasses(k: Kryo): Unit = { k.register(classOf[CaseClass]) k.register(classOf[ClassWithNoArgConstructor]) k.register(classOf[ClassWithoutNoArgConstructor]) @@ -588,7 +612,7 @@ object KryoTest { } class RegistratorWithoutAutoReset extends KryoRegistrator { - override def registerClasses(k: Kryo) { + override def registerClasses(k: Kryo): Unit = { k.setAutoReset(false) } } diff --git a/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala index 126ba0e8b1e93..65f3793c421fa 100644 --- a/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/UnsafeKryoSerializerSuite.scala @@ -23,12 +23,12 @@ class UnsafeKryoSerializerSuite extends KryoSerializerSuite { // This test suite should run all tests in KryoSerializerSuite with kryo unsafe. 
- override def beforeAll() { + override def beforeAll(): Unit = { conf.set(KRYO_USE_UNSAFE, true) super.beforeAll() } - override def afterAll() { + override def afterAll(): Unit = { conf.set(KRYO_USE_UNSAFE, false) super.afterAll() } diff --git a/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala index 6d2ef17a7a790..a82f86a11c77e 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/BlockStoreShuffleReaderSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.shuffle import java.io.{ByteArrayOutputStream, InputStream} import java.nio.ByteBuffer +import org.mockito.ArgumentMatchers.{eq => meq} import org.mockito.Mockito.{mock, when} import org.apache.spark._ @@ -95,19 +96,20 @@ class BlockStoreShuffleReaderSuite extends SparkFunSuite with LocalSparkContext // Setup the blockManager mock so the buffer gets returned when the shuffle code tries to // fetch shuffle data. val shuffleBlockId = ShuffleBlockId(shuffleId, mapId, reduceId) - when(blockManager.getBlockData(shuffleBlockId)).thenReturn(managedBuffer) + when(blockManager.getLocalBlockData(meq(shuffleBlockId))).thenReturn(managedBuffer) managedBuffer } // Make a mocked MapOutputTracker for the shuffle reader to use to determine what // shuffle data to read. val mapOutputTracker = mock(classOf[MapOutputTracker]) - when(mapOutputTracker.getMapSizesByExecutorId(shuffleId, reduceId, reduceId + 1)).thenReturn { + when(mapOutputTracker.getMapSizesByExecutorId( + shuffleId, reduceId, reduceId + 1)).thenReturn { // Test a scenario where all data is local, to avoid creating a bunch of additional mocks // for the code to read data over the network. 
val shuffleBlockIdsAndSizes = (0 until numMaps).map { mapId => val shuffleBlockId = ShuffleBlockId(shuffleId, mapId, reduceId) - (shuffleBlockId, byteOutputStream.size().toLong) + (shuffleBlockId, byteOutputStream.size().toLong, mapId) } Seq((localBlockManagerId, shuffleBlockIdsAndSizes)).toIterator } @@ -118,7 +120,7 @@ class BlockStoreShuffleReaderSuite extends SparkFunSuite with LocalSparkContext when(dependency.serializer).thenReturn(serializer) when(dependency.aggregator).thenReturn(None) when(dependency.keyOrdering).thenReturn(None) - new BaseShuffleHandle(shuffleId, numMaps, dependency) + new BaseShuffleHandle(shuffleId, dependency) } val serializerManager = new SerializerManager( @@ -129,15 +131,15 @@ class BlockStoreShuffleReaderSuite extends SparkFunSuite with LocalSparkContext val taskContext = TaskContext.empty() val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val blocksByAddress = mapOutputTracker.getMapSizesByExecutorId( + shuffleId, reduceId, reduceId + 1) val shuffleReader = new BlockStoreShuffleReader( shuffleHandle, - reduceId, - reduceId + 1, + blocksByAddress, taskContext, metrics, serializerManager, - blockManager, - mapOutputTracker) + blockManager) assert(shuffleReader.read().length === keyValuePairsPerMap * numMaps) diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleDriverComponentsSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleDriverComponentsSuite.scala new file mode 100644 index 0000000000000..3d70ff1fed29f --- /dev/null +++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleDriverComponentsSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.shuffle + +import java.util.{Map => JMap} +import java.util.concurrent.atomic.AtomicBoolean + +import com.google.common.collect.ImmutableMap +import org.scalatest.Assertions._ +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.internal.config.SHUFFLE_IO_PLUGIN_CLASS +import org.apache.spark.shuffle.api.{ShuffleDataIO, ShuffleDriverComponents, ShuffleExecutorComponents, ShuffleMapOutputWriter} +import org.apache.spark.shuffle.sort.io.LocalDiskShuffleDataIO + +class ShuffleDriverComponentsSuite + extends SparkFunSuite with LocalSparkContext with BeforeAndAfterEach { + + test("test serialization of shuffle initialization conf to executors") { + val testConf = new SparkConf() + .setAppName("testing") + .set(ShuffleDataIOUtils.SHUFFLE_SPARK_CONF_PREFIX + "test-plugin-key", "user-set-value") + .set(ShuffleDataIOUtils.SHUFFLE_SPARK_CONF_PREFIX + "test-user-key", "user-set-value") + .setMaster("local-cluster[2,1,1024]") + .set(SHUFFLE_IO_PLUGIN_CLASS, "org.apache.spark.shuffle.TestShuffleDataIO") + + sc = new SparkContext(testConf) + + val out = sc.parallelize(Seq((1, "one"), (2, "two"), (3, "three")), 3) + .groupByKey() + .foreach { _ => + if (!TestShuffleExecutorComponentsInitialized.initialized.get()) { + throw new RuntimeException("TestShuffleExecutorComponents 
wasn't initialized") + } + } + } +} + +class TestShuffleDataIO(sparkConf: SparkConf) extends ShuffleDataIO { + private val delegate = new LocalDiskShuffleDataIO(sparkConf) + + override def driver(): ShuffleDriverComponents = new TestShuffleDriverComponents() + + override def executor(): ShuffleExecutorComponents = + new TestShuffleExecutorComponentsInitialized(delegate.executor()) +} + +class TestShuffleDriverComponents extends ShuffleDriverComponents { + override def initializeApplication(): JMap[String, String] = { + ImmutableMap.of("test-plugin-key", "plugin-set-value") + } + + override def cleanupApplication(): Unit = {} +} + +object TestShuffleExecutorComponentsInitialized { + val initialized = new AtomicBoolean(false) +} + +class TestShuffleExecutorComponentsInitialized(delegate: ShuffleExecutorComponents) + extends ShuffleExecutorComponents { + + override def initializeExecutor( + appId: String, + execId: String, + extraConfigs: JMap[String, String]): Unit = { + delegate.initializeExecutor(appId, execId, extraConfigs) + assert(extraConfigs.get("test-plugin-key") == "plugin-set-value", extraConfigs) + assert(extraConfigs.get("test-user-key") == "user-set-value") + TestShuffleExecutorComponentsInitialized.initialized.set(true) + } + + override def createMapOutputWriter( + shuffleId: Int, + mapTaskId: Long, + numPartitions: Int): ShuffleMapOutputWriter = { + delegate.createMapOutputWriter(shuffleId, mapTaskId, numPartitions) + } +} diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala index b9f81fa0d0a06..f8474022867f4 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriterSuite.scala @@ -25,7 +25,7 @@ import scala.collection.mutable.ArrayBuffer import org.mockito.{Mock, MockitoAnnotations} 
import org.mockito.Answers.RETURNS_SMART_NULLS -import org.mockito.ArgumentMatchers.{any, anyInt} +import org.mockito.ArgumentMatchers.{any, anyInt, anyLong} import org.mockito.Mockito._ import org.scalatest.BeforeAndAfterEach @@ -65,7 +65,6 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte taskMetrics = new TaskMetrics shuffleHandle = new BypassMergeSortShuffleHandle[Int, Int]( shuffleId = 0, - numMaps = 2, dependency = dependency ) val memoryManager = new TestMemoryManager(conf) @@ -78,7 +77,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte when(taskContext.taskMemoryManager()).thenReturn(taskMemoryManager) when(blockResolver.writeIndexFileAndCommit( - anyInt, anyInt, any(classOf[Array[Long]]), any(classOf[File]))) + anyInt, anyLong, any(classOf[Array[Long]]), any(classOf[File]))) .thenAnswer { invocationOnMock => val tmp = invocationOnMock.getArguments()(3).asInstanceOf[File] if (tmp != null) { @@ -139,8 +138,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte val writer = new BypassMergeSortShuffleWriter[Int, Int]( blockManager, shuffleHandle, - 0, // MapId - 0L, // MapTaskAttemptId + 0L, // MapId conf, taskContext.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) @@ -166,8 +164,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte val writer = new BypassMergeSortShuffleWriter[Int, Int]( blockManager, shuffleHandle, - 0, // MapId - 0L, + 0L, // MapId transferConf, taskContext.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) @@ -202,8 +199,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite with BeforeAndAfte val writer = new BypassMergeSortShuffleWriter[Int, Int]( blockManager, shuffleHandle, - 0, // MapId - 0L, + 0L, // MapId conf, taskContext.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) @@ -224,8 +220,7 @@ class BypassMergeSortShuffleWriterSuite extends SparkFunSuite 
with BeforeAndAfte val writer = new BypassMergeSortShuffleWriter[Int, Int]( blockManager, shuffleHandle, - 0, // MapId - 0L, + 0L, // MapId conf, taskContext.taskMetrics().shuffleWriteMetrics, shuffleExecutorComponents) diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala index 8b955c98f7953..49055ab71c3fe 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/ShuffleExternalSorterSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.shuffle.sort import java.lang.{Long => JLong} import org.mockito.Mockito.when -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.executor.{ShuffleWriteMetrics, TaskMetrics} diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala index 0dd6040808f9e..4c5694fcf0305 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/SortShuffleWriterSuite.scala @@ -57,7 +57,7 @@ class SortShuffleWriterSuite extends SparkFunSuite with SharedSparkContext with when(dependency.serializer).thenReturn(serializer) when(dependency.aggregator).thenReturn(None) when(dependency.keyOrdering).thenReturn(None) - new BaseShuffleHandle(shuffleId, numMaps = numMaps, dependency) + new BaseShuffleHandle(shuffleId, dependency) } shuffleExecutorComponents = new LocalDiskShuffleExecutorComponents( conf, blockManager, shuffleBlockResolver) diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala index 
5156cc2cc47a6..f92455912f510 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/io/LocalDiskShuffleMapOutputWriterSuite.scala @@ -23,7 +23,7 @@ import java.nio.file.Files import java.util.Arrays import org.mockito.Answers.RETURNS_SMART_NULLS -import org.mockito.ArgumentMatchers.{any, anyInt} +import org.mockito.ArgumentMatchers.{any, anyInt, anyLong} import org.mockito.Mock import org.mockito.Mockito.when import org.mockito.MockitoAnnotations @@ -73,9 +73,9 @@ class LocalDiskShuffleMapOutputWriterSuite extends SparkFunSuite with BeforeAndA conf = new SparkConf() .set("spark.app.id", "example.spark.app") .set("spark.shuffle.unsafe.file.output.buffer", "16k") - when(blockResolver.getDataFile(anyInt, anyInt)).thenReturn(mergedOutputFile) + when(blockResolver.getDataFile(anyInt, anyLong)).thenReturn(mergedOutputFile) when(blockResolver.writeIndexFileAndCommit( - anyInt, anyInt, any(classOf[Array[Long]]), any(classOf[File]))) + anyInt, anyLong, any(classOf[Array[Long]]), any(classOf[File]))) .thenAnswer { invocationOnMock => partitionSizesInMergedFile = invocationOnMock.getArguments()(2).asInstanceOf[Array[Long]] val tmp: File = invocationOnMock.getArguments()(3).asInstanceOf[File] diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 4b71a4844bde1..24eb1685f577a 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -30,18 +30,21 @@ import org.apache.spark._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.config.Status._ import org.apache.spark.metrics.ExecutorMetricType +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import 
org.apache.spark.scheduler.cluster._ +import org.apache.spark.status.ListenerEventsTestHelper._ import org.apache.spark.status.api.v1 import org.apache.spark.storage._ import org.apache.spark.util.Utils class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { - private val conf = new SparkConf() .set(LIVE_ENTITY_UPDATE_PERIOD, 0L) .set(ASYNC_TRACKING_ENABLED, false) + private val twoReplicaMemAndDiskLevel = StorageLevel(true, true, false, true, 2) + private var time: Long = _ private var testDir: File = _ private var store: ElementTrackingStore = _ @@ -149,10 +152,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a job with 2 stages / 4 tasks each time += 1 val stages = Seq( - new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(2, 0, "stage2", 4, Nil, Seq(1), "details2")) + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(2, 0, "stage2", 4, Nil, Seq(1), "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) val jobProps = new Properties() + jobProps.setProperty(SparkContext.SPARK_JOB_DESCRIPTION, "jobDescription") jobProps.setProperty(SparkContext.SPARK_JOB_GROUP_ID, "jobGroup") jobProps.setProperty(SparkContext.SPARK_SCHEDULER_POOL, "schedPool") @@ -161,7 +167,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { check[JobDataWrapper](1) { job => assert(job.info.jobId === 1) assert(job.info.name === stages.last.name) - assert(job.info.description === None) + assert(job.info.description === Some("jobDescription")) assert(job.info.status === JobExecutionStatus.RUNNING) assert(job.info.submissionTime === Some(new Date(time))) assert(job.info.jobGroup === Some("jobGroup")) @@ -521,7 +527,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // - Re-submit stage 2, all tasks, and succeed them and the stage. 
val oldS2 = stages.last val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptNumber + 1, oldS2.name, oldS2.numTasks, - oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) + oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics, + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 newS2.submissionTime = Some(time) @@ -572,8 +579,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // change the stats of the already finished job. time += 1 val j2Stages = Seq( - new StageInfo(3, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(4, 0, "stage2", 4, Nil, Seq(3), "details2")) + new StageInfo(3, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(4, 0, "stage2", 4, Nil, Seq(3), "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) j2Stages.last.submissionTime = Some(time) listener.onJobStart(SparkListenerJobStart(2, time, j2Stages, null)) assert(store.count(classOf[JobDataWrapper]) === 2) @@ -697,10 +706,20 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val rdd2b1 = RddBlock(2, 1, 5L, 6L) val level = StorageLevel.MEMORY_AND_DISK + // Submit a stage for the first RDD before it's marked for caching, to make sure later + // the listener picks up the correct storage level. + val rdd1Info = new RDDInfo(rdd1b1.rddId, "rdd1", 2, StorageLevel.NONE, false, Nil) + val stage0 = new StageInfo(0, 0, "stage0", 4, Seq(rdd1Info), Nil, "details0", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage0, new Properties())) + listener.onStageCompleted(SparkListenerStageCompleted(stage0)) + assert(store.count(classOf[RDDStorageInfoWrapper]) === 0) + // Submit a stage and make sure the RDDs are recorded. 
- val rdd1Info = new RDDInfo(rdd1b1.rddId, "rdd1", 2, level, false, Nil) + rdd1Info.storageLevel = level val rdd2Info = new RDDInfo(rdd2b1.rddId, "rdd2", 1, level, false, Nil) - val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info, rdd2Info), Nil, "details1") + val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info, rdd2Info), Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => @@ -763,6 +782,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(part.memoryUsed === rdd1b1.memSize * 2) assert(part.diskUsed === rdd1b1.diskSize * 2) assert(part.executors === Seq(bm1.executorId, bm2.executorId)) + assert(part.storageLevel === twoReplicaMemAndDiskLevel.description) } check[ExecutorSummaryWrapper](bm2.executorId) { exec => @@ -800,9 +820,30 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(exec.info.diskUsed === rdd1b1.diskSize + rdd1b2.diskSize) } - // Remove block 1 from bm 1. + // Evict block 1 from memory in bm 1. Note that because of SPARK-29319, the disk size + // is reported as "0" here to avoid double-counting; the current behavior of the block + // manager is to provide the actual disk size of the block. 
+ listener.onBlockUpdated(SparkListenerBlockUpdated( + BlockUpdatedInfo(bm1, rdd1b1.blockId, StorageLevel.DISK_ONLY, + rdd1b1.memSize, 0L))) + + check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => + assert(wrapper.info.numCachedPartitions === 2L) + assert(wrapper.info.memoryUsed === rdd1b1.memSize + rdd1b2.memSize) + assert(wrapper.info.diskUsed === 2 * rdd1b1.diskSize + rdd1b2.diskSize) + assert(wrapper.info.dataDistribution.get.size === 2L) + assert(wrapper.info.partitions.get.size === 2L) + } + + check[ExecutorSummaryWrapper](bm1.executorId) { exec => + assert(exec.info.rddBlocks === 2L) + assert(exec.info.memoryUsed === rdd1b2.memSize) + assert(exec.info.diskUsed === rdd1b1.diskSize + rdd1b2.diskSize) + } + + // Remove block 1 from bm 1; note memSize = 0 due to the eviction above. listener.onBlockUpdated(SparkListenerBlockUpdated( - BlockUpdatedInfo(bm1, rdd1b1.blockId, StorageLevel.NONE, rdd1b1.memSize, rdd1b1.diskSize))) + BlockUpdatedInfo(bm1, rdd1b1.blockId, StorageLevel.NONE, 0, rdd1b1.diskSize))) check[RDDStorageInfoWrapper](rdd1b1.rddId) { wrapper => assert(wrapper.info.numCachedPartitions === 2L) @@ -985,9 +1026,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // data is not deleted. time += 1 val stages = Seq( - new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), - new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2"), - new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3")) + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID), + new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) // Graph data is generated by the job start event, so fire it. 
listener.onJobStart(SparkListenerJobStart(4, time, stages, null)) @@ -1035,7 +1079,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 0) - val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") + val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 attempt2.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(attempt2, new Properties())) @@ -1106,9 +1151,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") - val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Start stage 1 and stage 2 time += 1 @@ -1139,8 +1187,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_STAGES, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, 
"details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Sart job 1 time += 1 @@ -1160,7 +1210,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) // Submit stage 3 and verify stage 2 is evicted - val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 stage3.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(stage3, new Properties())) @@ -1175,7 +1226,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val testConf = conf.clone().set(MAX_RETAINED_TASKS_PER_STAGE, 2) val listener = new AppStatusListener(store, testConf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) stage1.submissionTime = Some(time) listener.onStageSubmitted(SparkListenerStageSubmitted(stage1, new Properties())) @@ -1210,9 +1262,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val listener = new AppStatusListener(store, testConf, true) val appStore = new AppStatusStore(store) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") - val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage3 = new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3", + resourceProfileId = 
ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) time += 1 stage1.submissionTime = Some(time) @@ -1241,8 +1296,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { test("SPARK-24415: update metrics for tasks that finish late") { val listener = new AppStatusListener(store, conf, true) - val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1") - val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2") + val stage1 = new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + val stage2 = new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Start job listener.onJobStart(SparkListenerJobStart(1, time, Seq(stage1, stage2), null)) @@ -1307,7 +1364,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { listener.onExecutorAdded(createExecutorAddedEvent(1)) listener.onExecutorAdded(createExecutorAddedEvent(2)) - val stage = new StageInfo(1, 0, "stage", 4, Nil, Nil, "details") + val stage = new StageInfo(1, 0, "stage", 4, Nil, Nil, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onJobStart(SparkListenerJobStart(1, time, Seq(stage), null)) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) @@ -1544,7 +1602,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Submit a stage and make sure the RDDs are recorded. val rdd1Info = new RDDInfo(rdd1b1.rddId, "rdd1", 2, level, false, Nil) - val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info), Nil, "details1") + val stage = new StageInfo(1, 0, "stage1", 4, Seq(rdd1Info), Nil, "details1", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) listener.onStageSubmitted(SparkListenerStageSubmitted(stage, new Properties())) // Add partition 1 replicated on two block managers. 
@@ -1571,7 +1630,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(dist.memoryRemaining === maxMemory - dist.memoryUsed) val part1 = wrapper.info.partitions.get.find(_.blockName === rdd1b1.blockId.name).get - assert(part1.storageLevel === level.description) + assert(part1.storageLevel === twoReplicaMemAndDiskLevel.description) assert(part1.memoryUsed === 2 * rdd1b1.memSize) assert(part1.diskUsed === 2 * rdd1b1.diskSize) assert(part1.executors === Seq(bm1.executorId, bm2.executorId)) @@ -1624,6 +1683,30 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("clean up used memory when BlockManager added") { + val listener = new AppStatusListener(store, conf, true) + // Add block manager at the first time + val driver = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "localhost", 42) + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded( + time, driver, 42L, Some(43L), Some(44L))) + // Update the memory metrics + listener.updateExecutorMemoryDiskInfo( + listener.liveExecutors(SparkContext.DRIVER_IDENTIFIER), + StorageLevel.MEMORY_AND_DISK, + 10L, + 10L + ) + // Re-add the same block manager again + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded( + time, driver, 42L, Some(43L), Some(44L))) + + check[ExecutorSummaryWrapper](SparkContext.DRIVER_IDENTIFIER) { d => + val memoryMetrics = d.info.memoryMetrics.get + assert(memoryMetrics.usedOffHeapStorageMemory == 0) + assert(memoryMetrics.usedOnHeapStorageMemory == 0) + } + } + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) @@ -1661,40 +1744,4 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { def blockId: BlockId = RDDBlockId(rddId, partId) } - - /** Create a stage submitted event for the specified stage Id. 
*/ - private def createStageSubmittedEvent(stageId: Int) = { - SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) - } - - /** Create a stage completed event for the specified stage Id. */ - private def createStageCompletedEvent(stageId: Int) = { - SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, - Seq.empty, Seq.empty, "details")) - } - - /** Create an executor added event for the specified executor Id. */ - private def createExecutorAddedEvent(executorId: Int) = { - SparkListenerExecutorAdded(0L, executorId.toString, - new ExecutorInfo("host1", 1, Map.empty, Map.empty)) - } - - /** Create an executor added event for the specified executor Id. */ - private def createExecutorRemovedEvent(executorId: Int) = { - SparkListenerExecutorRemoved(10L, executorId.toString, "test") - } - - /** Create an executor metrics update event, with the specified executor metrics values. */ - private def createExecutorMetricsUpdateEvent( - stageId: Int, - executorId: Int, - executorMetrics: Array[Long]): SparkListenerExecutorMetricsUpdate = { - val taskMetrics = TaskMetrics.empty - taskMetrics.incDiskBytesSpilled(111) - taskMetrics.incMemoryBytesSpilled(222) - val accum = Array((333L, 1, 1, taskMetrics.accumulators().map(AccumulatorSuite.makeInfo))) - val executorUpdates = Map((stageId, 0) -> new ExecutorMetrics(executorMetrics)) - SparkListenerExecutorMetricsUpdate(executorId.toString, accum, executorUpdates) - } } diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala index 165fdb71cc78b..735e51942626f 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -18,7 +18,9 @@ package org.apache.spark.status import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.util.Distribution 
+import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler.{TaskInfo, TaskLocality} +import org.apache.spark.util.{Distribution, Utils} import org.apache.spark.util.kvstore._ class AppStatusStoreSuite extends SparkFunSuite { @@ -76,42 +78,61 @@ class AppStatusStoreSuite extends SparkFunSuite { assert(store.count(classOf[CachedQuantile]) === 2) } - private def createLiveStore(inMemoryStore: InMemoryStore): AppStatusStore = { + private def createAppStore(disk: Boolean, live: Boolean): AppStatusStore = { val conf = new SparkConf() - val store = new ElementTrackingStore(inMemoryStore, conf) - val listener = new AppStatusListener(store, conf, true, None) - new AppStatusStore(store, listener = Some(listener)) - } + if (live) { + return AppStatusStore.createLiveStore(conf) + } - test("SPARK-28638: only successful tasks have taskSummary when with in memory kvstore") { - val store = new InMemoryStore() - (0 until 5).foreach { i => store.write(newTaskData(i, status = "FAILED")) } - Seq(new AppStatusStore(store), createLiveStore(store)).foreach { appStore => - val summary = appStore.taskSummary(stageId, attemptId, uiQuantiles) - assert(summary.size === 0) + val store: KVStore = if (disk) { + val testDir = Utils.createTempDir() + val diskStore = KVUtils.open(testDir, getClass.getName) + new ElementTrackingStore(diskStore, conf) + } else { + new ElementTrackingStore(new InMemoryStore, conf) } + new AppStatusStore(store) } - test("SPARK-28638: summary should contain successful tasks only when with in memory kvstore") { - val store = new InMemoryStore() + Seq( + "disk" -> createAppStore(disk = true, live = false), + "in memory" -> createAppStore(disk = false, live = false), + "in memory live" -> createAppStore(disk = false, live = true) + ).foreach { case (hint, appStore) => + test(s"SPARK-26260: summary should contain only successful tasks' metrics (store = $hint)") { + val store = appStore.store + + // Success and failed tasks metrics + for (i <- 0 to 5) { 
+ if (i % 2 == 0) { + writeTaskDataToStore(i, store, "FAILED") + } else { + writeTaskDataToStore(i, store, "SUCCESS") + } + } - for (i <- 0 to 5) { - if (i % 2 == 1) { - store.write(newTaskData(i, status = "FAILED")) - } else { - store.write(newTaskData(i)) + // Running tasks metrics (-1 = no metrics reported, positive = metrics have been reported) + Seq(-1, 6).foreach { metric => + writeTaskDataToStore(metric, store, "RUNNING") } - } - Seq(new AppStatusStore(store), createLiveStore(store)).foreach { appStore => + /** + * Following are the tasks metrics, + * 1, 3, 5 => Success + * 0, 2, 4 => Failed + * -1, 6 => Running + * + * Task summary will consider (1, 3, 5) only + */ val summary = appStore.taskSummary(stageId, attemptId, uiQuantiles).get - val values = Array(0.0, 2.0, 4.0) + val values = Array(1.0, 3.0, 5.0) val dist = new Distribution(values, 0, values.length).getQuantiles(uiQuantiles.sorted) dist.zip(summary.executorRunTime).foreach { case (expected, actual) => assert(expected === actual) } + appStore.close() } } @@ -133,9 +154,54 @@ class AppStatusStoreSuite extends SparkFunSuite { private def newTaskData(i: Int, status: String = "SUCCESS"): TaskDataWrapper = { new TaskDataWrapper( - i, i, i, i, i, i, i.toString, i.toString, status, i.toString, false, Nil, None, + i.toLong, i, i, i, i, i, i.toString, i.toString, status, i.toString, false, Nil, None, true, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, stageId, attemptId) } + + private def writeTaskDataToStore(i: Int, store: KVStore, status: String): Unit = { + val liveTask = new LiveTask(new TaskInfo( i.toLong, i, i, i.toLong, i.toString, + i.toString, TaskLocality.ANY, false), stageId, attemptId, None) + + if (status == "SUCCESS") { + liveTask.info.finishTime = 1L + } else if (status == "FAILED") { + liveTask.info.failed = true + liveTask.info.finishTime = 1L + } + + val taskMetrics = getTaskMetrics(i) + liveTask.updateMetrics(taskMetrics) + 
liveTask.write(store.asInstanceOf[ElementTrackingStore], 1L) + } + + private def getTaskMetrics(i: Int): TaskMetrics = { + val taskMetrics = new TaskMetrics() + taskMetrics.setExecutorDeserializeTime(i) + taskMetrics.setExecutorDeserializeCpuTime(i) + taskMetrics.setExecutorRunTime(i) + taskMetrics.setExecutorCpuTime(i) + taskMetrics.setResultSize(i) + taskMetrics.setJvmGCTime(i) + taskMetrics.setResultSerializationTime(i) + taskMetrics.incMemoryBytesSpilled(i) + taskMetrics.incDiskBytesSpilled(i) + taskMetrics.incPeakExecutionMemory(i) + taskMetrics.inputMetrics.incBytesRead(i) + taskMetrics.inputMetrics.incRecordsRead(i) + taskMetrics.outputMetrics.setBytesWritten(i) + taskMetrics.outputMetrics.setRecordsWritten(i) + taskMetrics.shuffleReadMetrics.incRemoteBlocksFetched(i) + taskMetrics.shuffleReadMetrics.incLocalBlocksFetched(i) + taskMetrics.shuffleReadMetrics.incFetchWaitTime(i) + taskMetrics.shuffleReadMetrics.incRemoteBytesRead(i) + taskMetrics.shuffleReadMetrics.incRemoteBytesReadToDisk(i) + taskMetrics.shuffleReadMetrics.incLocalBytesRead(i) + taskMetrics.shuffleReadMetrics.incRecordsRead(i) + taskMetrics.shuffleWriteMetrics.incBytesWritten(i) + taskMetrics.shuffleWriteMetrics.incWriteTime(i) + taskMetrics.shuffleWriteMetrics.incRecordsWritten(i) + taskMetrics + } } diff --git a/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala b/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala new file mode 100644 index 0000000000000..99c0d9593ccae --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/ListenerEventsTestHelper.scala @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import java.util.Properties + +import scala.collection.immutable.Map + +import org.apache.spark.{AccumulatorSuite, SparkContext, Success, TaskState} +import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded, SparkListenerExecutorMetricsUpdate, SparkListenerExecutorRemoved, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerStageSubmitted, SparkListenerTaskEnd, SparkListenerTaskStart, StageInfo, TaskInfo, TaskLocality} +import org.apache.spark.scheduler.cluster.ExecutorInfo +import org.apache.spark.storage.{RDDInfo, StorageLevel} + +object ListenerEventsTestHelper { + + private var taskIdTracker = -1L + private var rddIdTracker = -1 + private var stageIdTracker = -1 + + def reset(): Unit = { + taskIdTracker = -1L + rddIdTracker = -1 + stageIdTracker = -1 + } + + def createJobProps(): Properties = { + val jobProps = new Properties() + jobProps.setProperty(SparkContext.SPARK_JOB_DESCRIPTION, "jobDescription") + jobProps.setProperty(SparkContext.SPARK_JOB_GROUP_ID, "jobGroup") + jobProps.setProperty(SparkContext.SPARK_SCHEDULER_POOL, "schedPool") + jobProps + } + + def createRddsWithId(ids: Seq[Int]): Seq[RDDInfo] = { + ids.map { rddId => + new RDDInfo(rddId, s"rdd${rddId}", 2, StorageLevel.NONE, false, 
Nil) + } + } + + def createRdds(count: Int): Seq[RDDInfo] = { + (1 to count).map { _ => + val rddId = nextRddId() + new RDDInfo(rddId, s"rdd${rddId}", 2, StorageLevel.NONE, false, Nil) + } + } + + def createStage(id: Int, rdds: Seq[RDDInfo], parentIds: Seq[Int]): StageInfo = { + new StageInfo(id, 0, s"stage${id}", 4, rdds, parentIds, s"details${id}", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + } + + def createStage(rdds: Seq[RDDInfo], parentIds: Seq[Int]): StageInfo = { + createStage(nextStageId(), rdds, parentIds) + } + + def createTasks(ids: Seq[Long], execs: Array[String], time: Long): Seq[TaskInfo] = { + ids.zipWithIndex.map { case (id, idx) => + val exec = execs(idx % execs.length) + new TaskInfo(id, idx, 1, time, exec, s"$exec.example.com", + TaskLocality.PROCESS_LOCAL, idx % 2 == 0) + } + } + + def createTasks(count: Int, execs: Array[String], time: Long): Seq[TaskInfo] = { + createTasks((1 to count).map { _ => nextTaskId() }, execs, time) + } + + def createTaskWithNewAttempt(orig: TaskInfo, time: Long): TaskInfo = { + // Task reattempts have a different ID, but the same index as the original. + new TaskInfo(nextTaskId(), orig.index, orig.attemptNumber + 1, time, orig.executorId, + s"${orig.executorId}.example.com", TaskLocality.PROCESS_LOCAL, orig.speculative) + } + + def createTaskStartEvent( + taskInfo: TaskInfo, + stageId: Int, + attemptId: Int): SparkListenerTaskStart = { + SparkListenerTaskStart(stageId, attemptId, taskInfo) + } + + /** Create a stage submitted event for the specified stage Id. */ + def createStageSubmittedEvent(stageId: Int): SparkListenerStageSubmitted = { + SparkListenerStageSubmitted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) + } + + /** Create a stage completed event for the specified stage Id. 
*/ + def createStageCompletedEvent(stageId: Int): SparkListenerStageCompleted = { + SparkListenerStageCompleted(new StageInfo(stageId, 0, stageId.toString, 0, + Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) + } + + def createExecutorAddedEvent(executorId: Int): SparkListenerExecutorAdded = { + createExecutorAddedEvent(executorId.toString, 0) + } + + /** Create an executor added event for the specified executor Id. */ + def createExecutorAddedEvent(executorId: String, time: Long): SparkListenerExecutorAdded = { + SparkListenerExecutorAdded(time, executorId, + new ExecutorInfo("host1", 1, Map.empty, Map.empty)) + } + + def createExecutorRemovedEvent(executorId: Int): SparkListenerExecutorRemoved = { + createExecutorRemovedEvent(executorId.toString, 10L) + } + + /** Create an executor removed event for the specified executor Id. */ + def createExecutorRemovedEvent(executorId: String, time: Long): SparkListenerExecutorRemoved = { + SparkListenerExecutorRemoved(time, executorId, "test") + } + + /** Create an executor metrics update event, with the specified executor metrics values. 
*/ + def createExecutorMetricsUpdateEvent( + stageId: Int, + executorId: Int, + executorMetrics: Array[Long]): SparkListenerExecutorMetricsUpdate = { + val taskMetrics = TaskMetrics.empty + taskMetrics.incDiskBytesSpilled(111) + taskMetrics.incMemoryBytesSpilled(222) + val accum = Array((333L, 1, 1, taskMetrics.accumulators().map(AccumulatorSuite.makeInfo))) + val executorUpdates = Map((stageId, 0) -> new ExecutorMetrics(executorMetrics)) + SparkListenerExecutorMetricsUpdate(executorId.toString, accum, executorUpdates) + } + + case class JobInfo( + stageIds: Seq[Int], + stageToTaskIds: Map[Int, Seq[Long]], + stageToRddIds: Map[Int, Seq[Int]]) + + def pushJobEventsWithoutJobEnd( + listener: SparkListener, + jobId: Int, + jobProps: Properties, + execIds: Array[String], + time: Long): JobInfo = { + // Start a job with 1 stage / 4 tasks each + val rddsForStage = createRdds(2) + val stage = createStage(rddsForStage, Nil) + + listener.onJobStart(SparkListenerJobStart(jobId, time, Seq(stage), jobProps)) + + // Submit stage + stage.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(stage, jobProps)) + + // Start tasks from stage + val s1Tasks = createTasks(4, execIds, time) + s1Tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(stage.stageId, + stage.attemptNumber(), task)) + } + + // Succeed all tasks in stage. + val s1Metrics = TaskMetrics.empty + s1Metrics.setExecutorCpuTime(2L) + s1Metrics.setExecutorRunTime(4L) + + s1Tasks.foreach { task => + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(stage.stageId, stage.attemptNumber, + "taskType", Success, task, new ExecutorMetrics, s1Metrics)) + } + + // End stage. 
+ stage.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(stage)) + + JobInfo(Seq(stage.stageId), Map(stage.stageId -> s1Tasks.map(_.taskId)), + Map(stage.stageId -> rddsForStage.map(_.id))) + } + + private def nextTaskId(): Long = { + taskIdTracker += 1 + taskIdTracker + } + + private def nextRddId(): Int = { + rddIdTracker += 1 + rddIdTracker + } + + private def nextStageId(): Int = { + stageIdTracker += 1 + stageIdTracker + } +} diff --git a/core/src/test/scala/org/apache/spark/status/LiveEntitySuite.scala b/core/src/test/scala/org/apache/spark/status/LiveEntitySuite.scala index bb2d2633001f0..35e8a62c93c99 100644 --- a/core/src/test/scala/org/apache/spark/status/LiveEntitySuite.scala +++ b/core/src/test/scala/org/apache/spark/status/LiveEntitySuite.scala @@ -17,8 +17,11 @@ package org.apache.spark.status +import java.util.Arrays + import org.apache.spark.SparkFunSuite -import org.apache.spark.status.api.v1.RDDPartitionInfo +import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.{AccumulatorMetadata, CollectionAccumulator} class LiveEntitySuite extends SparkFunSuite { @@ -52,6 +55,17 @@ class LiveEntitySuite extends SparkFunSuite { assert(!seq.exists(_.blockName == items(5).blockName)) } + test("Only show few elements of CollectionAccumulator when converting to v1.AccumulableInfo") { + val acc = new CollectionAccumulator[Int]() + val value = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + acc.setValue(value) + acc.metadata = AccumulatorMetadata(0L, None, false) + val accuInfo = LiveEntityHelpers + .newAccumulatorInfos(Seq(acc.toInfo(Some(acc.value), Some(acc.value))))(0) + assert(accuInfo.update.get == "[1,2,3,4,5,... 5 more items]") + assert(accuInfo.value == "[1,2,3,4,5,... 
5 more items]") + } + private def checkSize(seq: Seq[_], expected: Int): Unit = { assert(seq.length === expected) var count = 0 @@ -60,8 +74,8 @@ class LiveEntitySuite extends SparkFunSuite { } private def newPartition(i: Int): LiveRDDPartition = { - val part = new LiveRDDPartition(i.toString) - part.update(Seq(i.toString), i.toString, i, i) + val part = new LiveRDDPartition(i.toString, StorageLevel.MEMORY_AND_DISK) + part.update(Seq(i.toString), i, i) part } diff --git a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala index ff4755833a916..ef7b13875540f 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockIdSuite.scala @@ -22,13 +22,13 @@ import java.util.UUID import org.apache.spark.SparkFunSuite class BlockIdSuite extends SparkFunSuite { - def assertSame(id1: BlockId, id2: BlockId) { + def assertSame(id1: BlockId, id2: BlockId): Unit = { assert(id1.name === id2.name) assert(id1.hashCode === id2.hashCode) assert(id1 === id2) } - def assertDifferent(id1: BlockId, id2: BlockId) { + def assertDifferent(id1: BlockId, id2: BlockId): Unit = { assert(id1.name != id2.name) assert(id1.hashCode != id2.hashCode) assert(id1 != id2) @@ -64,6 +64,20 @@ class BlockIdSuite extends SparkFunSuite { assertSame(id, BlockId(id.toString)) } + test("shuffle batch") { + val id = ShuffleBlockBatchId(1, 2, 3, 4) + assertSame(id, ShuffleBlockBatchId(1, 2, 3, 4)) + assertDifferent(id, ShuffleBlockBatchId(2, 2, 3, 4)) + assert(id.name === "shuffle_1_2_3_4") + assert(id.asRDDId === None) + assert(id.shuffleId === 1) + assert(id.mapId === 2) + assert(id.startReduceId === 3) + assert(id.endReduceId === 4) + assert(id.isShuffle) + assertSame(id, BlockId(id.toString)) + } + test("shuffle data") { val id = ShuffleDataBlockId(4, 5, 6) assertSame(id, ShuffleDataBlockId(4, 5, 6)) diff --git 
a/core/src/test/scala/org/apache/spark/storage/BlockManagerInfoSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerInfoSuite.scala index 49cbd66cccb86..01e3d6a46e709 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerInfoSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerInfoSuite.scala @@ -31,7 +31,6 @@ class BlockManagerInfoSuite extends SparkFunSuite { val bmInfo = new BlockManagerInfo( BlockManagerId("executor0", "host", 1234, None), timeMs = 300, - Array(), maxOnHeapMem = 10000, maxOffHeapMem = 20000, slaveEndpoint = null, diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala index 05a9ac685e5e7..59ace850d0bd2 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.storage import java.util.Locale +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.language.implicitConversions @@ -97,9 +98,12 @@ trait BlockManagerReplicationBehavior extends SparkFunSuite conf.set(STORAGE_CACHED_PEERS_TTL, 10) sc = new SparkContext("local", "test", conf) + val blockManagerInfo = new mutable.HashMap[BlockManagerId, BlockManagerInfo]() master = new BlockManagerMaster(rpcEnv.setupEndpoint("blockmanager", new BlockManagerMasterEndpoint(rpcEnv, true, conf, - new LiveListenerBus(conf), None)), conf, true) + new LiveListenerBus(conf), None, blockManagerInfo)), + rpcEnv.setupEndpoint("blockmanagerHeartbeat", + new BlockManagerMasterHeartbeatEndpoint(rpcEnv, true, blockManagerInfo)), conf, true) allStores.clear() } @@ -308,7 +312,7 @@ trait BlockManagerReplicationBehavior extends SparkFunSuite * is correct. 
Then it also drops the block from memory of each store (using LRU) and * again checks whether the master's knowledge gets updated. */ - protected def testReplication(maxReplication: Int, storageLevels: Seq[StorageLevel]) { + protected def testReplication(maxReplication: Int, storageLevels: Seq[StorageLevel]): Unit = { import org.apache.spark.storage.StorageLevel._ assert(maxReplication > 1, @@ -431,7 +435,7 @@ class BlockManagerProactiveReplicationSuite extends BlockManagerReplicationBehav } } - def testProactiveReplication(replicationFactor: Int) { + def testProactiveReplication(replicationFactor: Int): Unit = { val blockSize = 1000 val storeSize = 10000 val initialStores = (1 to 10).map { i => makeBlockManager(storeSize, s"store$i") } diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 509d4efcab67a..8d06768a2b284 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -21,6 +21,7 @@ import java.io.File import java.nio.ByteBuffer import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future import scala.concurrent.duration._ @@ -28,9 +29,8 @@ import scala.language.implicitConversions import scala.reflect.ClassTag import org.apache.commons.lang3.RandomUtils -import org.mockito.{ArgumentMatchers => mc} -import org.mockito.Mockito.{doAnswer, mock, spy, times, verify, when} -import org.mockito.invocation.InvocationOnMock +import org.mockito.{ArgumentCaptor, ArgumentMatchers => mc} +import org.mockito.Mockito.{doAnswer, mock, never, spy, times, verify, when} import org.scalatest._ import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.concurrent.Eventually._ @@ -50,7 +50,7 @@ import org.apache.spark.network.server.{NoOpRpcHandler, 
TransportServer, Transpo import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, ExecutorDiskUtils, ExternalBlockStoreClient} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, RegisterExecutor} import org.apache.spark.rpc.RpcEnv -import org.apache.spark.scheduler.LiveListenerBus +import org.apache.spark.scheduler.{LiveListenerBus, SparkListenerBlockUpdated} import org.apache.spark.security.{CryptoStreamUtils, EncryptionFunSuite} import org.apache.spark.serializer.{JavaSerializer, KryoSerializer, SerializerManager} import org.apache.spark.shuffle.sort.SortShuffleManager @@ -71,6 +71,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val allStores = ArrayBuffer[BlockManager]() var rpcEnv: RpcEnv = null var master: BlockManagerMaster = null + var liveListenerBus: LiveListenerBus = null val securityMgr = new SecurityManager(new SparkConf(false)) val bcastManager = new BroadcastManager(true, new SparkConf(false), securityMgr) val mapOutputTracker = new MapOutputTrackerMaster(new SparkConf(false), bcastManager, true) @@ -143,11 +144,16 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE // need to create a SparkContext is to initialize LiveListenerBus. 
sc = mock(classOf[SparkContext]) when(sc.conf).thenReturn(conf) - master = new BlockManagerMaster(rpcEnv.setupEndpoint("blockmanager", + + val blockManagerInfo = new mutable.HashMap[BlockManagerId, BlockManagerInfo]() + liveListenerBus = spy(new LiveListenerBus(conf)) + master = spy(new BlockManagerMaster(rpcEnv.setupEndpoint("blockmanager", new BlockManagerMasterEndpoint(rpcEnv, true, conf, - new LiveListenerBus(conf), None)), conf, true) + liveListenerBus, None, blockManagerInfo)), + rpcEnv.setupEndpoint("blockmanagerHeartbeat", + new BlockManagerMasterHeartbeatEndpoint(rpcEnv, true, blockManagerInfo)), conf, true)) - val initialize = PrivateMethod[Unit]('initialize) + val initialize = PrivateMethod[Unit](Symbol("initialize")) SizeEstimator invokePrivate initialize() } @@ -160,6 +166,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE rpcEnv.awaitTermination() rpcEnv = null master = null + liveListenerBus = null } finally { super.afterEach() } @@ -289,14 +296,19 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE eventually(timeout(1.second), interval(10.milliseconds)) { assert(!store.hasLocalBlock("a1-to-remove")) master.getLocations("a1-to-remove") should have size 0 + assertUpdateBlockInfoReportedForRemovingBlock(store, "a1-to-remove", + removedFromMemory = true, removedFromDisk = false) } eventually(timeout(1.second), interval(10.milliseconds)) { assert(!store.hasLocalBlock("a2-to-remove")) master.getLocations("a2-to-remove") should have size 0 + assertUpdateBlockInfoReportedForRemovingBlock(store, "a2-to-remove", + removedFromMemory = true, removedFromDisk = false) } eventually(timeout(1.second), interval(10.milliseconds)) { assert(store.hasLocalBlock("a3-to-remove")) master.getLocations("a3-to-remove") should have size 0 + assertUpdateBlockInfoNotReported(store, "a3-to-remove") } eventually(timeout(1.second), interval(10.milliseconds)) { val memStatus = master.getMemoryStatus.head._2 @@ 
-375,16 +387,21 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(!executorStore.hasLocalBlock(broadcast0BlockId)) assert(executorStore.hasLocalBlock(broadcast1BlockId)) assert(executorStore.hasLocalBlock(broadcast2BlockId)) + assertUpdateBlockInfoReportedForRemovingBlock(executorStore, broadcast0BlockId, + removedFromMemory = false, removedFromDisk = true) // nothing should be removed from the driver store assert(driverStore.hasLocalBlock(broadcast0BlockId)) assert(driverStore.hasLocalBlock(broadcast1BlockId)) assert(driverStore.hasLocalBlock(broadcast2BlockId)) + assertUpdateBlockInfoNotReported(driverStore, broadcast0BlockId) // remove broadcast 0 block from the driver as well master.removeBroadcast(0, removeFromMaster = true, blocking = true) assert(!driverStore.hasLocalBlock(broadcast0BlockId)) assert(driverStore.hasLocalBlock(broadcast1BlockId)) + assertUpdateBlockInfoReportedForRemovingBlock(driverStore, broadcast0BlockId, + removedFromMemory = false, removedFromDisk = true) // remove broadcast 1 block from both the stores asynchronously // and verify all broadcast 1 blocks have been removed @@ -392,6 +409,10 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE eventually(timeout(1.second), interval(10.milliseconds)) { assert(!driverStore.hasLocalBlock(broadcast1BlockId)) assert(!executorStore.hasLocalBlock(broadcast1BlockId)) + assertUpdateBlockInfoReportedForRemovingBlock(driverStore, broadcast1BlockId, + removedFromMemory = false, removedFromDisk = true) + assertUpdateBlockInfoReportedForRemovingBlock(executorStore, broadcast1BlockId, + removedFromMemory = false, removedFromDisk = true) } // remove broadcast 2 from both the stores asynchronously @@ -402,11 +423,46 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(!driverStore.hasLocalBlock(broadcast2BlockId2)) assert(!executorStore.hasLocalBlock(broadcast2BlockId)) 
assert(!executorStore.hasLocalBlock(broadcast2BlockId2)) + assertUpdateBlockInfoReportedForRemovingBlock(driverStore, broadcast2BlockId, + removedFromMemory = false, removedFromDisk = true) + assertUpdateBlockInfoReportedForRemovingBlock(driverStore, broadcast2BlockId2, + removedFromMemory = false, removedFromDisk = true) + assertUpdateBlockInfoReportedForRemovingBlock(executorStore, broadcast2BlockId, + removedFromMemory = false, removedFromDisk = true) + assertUpdateBlockInfoReportedForRemovingBlock(executorStore, broadcast2BlockId2, + removedFromMemory = false, removedFromDisk = true) } executorStore.stop() driverStore.stop() } + private def assertUpdateBlockInfoReportedForRemovingBlock( + store: BlockManager, + blockId: BlockId, + removedFromMemory: Boolean, + removedFromDisk: Boolean): Unit = { + def assertSizeReported(captor: ArgumentCaptor[Long], expectRemoved: Boolean): Unit = { + assert(captor.getAllValues().size() === 1) + if (expectRemoved) { + assert(captor.getValue() > 0) + } else { + assert(captor.getValue() === 0) + } + } + + val memSizeCaptor = ArgumentCaptor.forClass(classOf[Long]).asInstanceOf[ArgumentCaptor[Long]] + val diskSizeCaptor = ArgumentCaptor.forClass(classOf[Long]).asInstanceOf[ArgumentCaptor[Long]] + verify(master).updateBlockInfo(mc.eq(store.blockManagerId), mc.eq(blockId), + mc.eq(StorageLevel.NONE), memSizeCaptor.capture(), diskSizeCaptor.capture()) + assertSizeReported(memSizeCaptor, removedFromMemory) + assertSizeReported(diskSizeCaptor, removedFromDisk) + } + + private def assertUpdateBlockInfoNotReported(store: BlockManager, blockId: BlockId): Unit = { + verify(master, never()).updateBlockInfo(mc.eq(store.blockManagerId), mc.eq(blockId), + mc.eq(StorageLevel.NONE), mc.anyInt(), mc.anyInt()) + } + test("reregistration on heart beat") { val store = makeBlockManager(2000) val a1 = new Array[Byte](400) @@ -419,7 +475,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE 
master.removeExecutor(store.blockManagerId.executorId) assert(master.getLocations("a1").size == 0, "a1 was not removed from master") - val reregister = !master.driverEndpoint.askSync[Boolean]( + val reregister = !master.driverHeartbeatEndPoint.askSync[Boolean]( BlockManagerHeartbeat(store.blockManagerId)) assert(reregister) } @@ -451,18 +507,18 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE for (i <- 1 to 100) { master.removeExecutor(store.blockManagerId.executorId) val t1 = new Thread { - override def run() { + override def run(): Unit = { store.putIterator( "a2", a2.iterator, StorageLevel.MEMORY_ONLY, tellMaster = true) } } val t2 = new Thread { - override def run() { + override def run(): Unit = { store.putSingle("a1", a1, StorageLevel.MEMORY_ONLY) } } val t3 = new Thread { - override def run() { + override def run(): Unit = { store.reregister() } } @@ -520,7 +576,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE when(bmMaster.getLocations(mc.any[BlockId])).thenReturn(Seq(bmId1, bmId2, bmId3)) val blockManager = makeBlockManager(128, "exec", bmMaster) - val sortLocations = PrivateMethod[Seq[BlockManagerId]]('sortLocations) + val sortLocations = PrivateMethod[Seq[BlockManagerId]](Symbol("sortLocations")) val locations = blockManager invokePrivate sortLocations(bmMaster.getLocations("test")) assert(locations.map(_.host) === Seq(localHost, localHost, otherHost)) } @@ -543,7 +599,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val blockManager = makeBlockManager(128, "exec", bmMaster) blockManager.blockManagerId = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, localHost, 1, Some(localRack)) - val sortLocations = PrivateMethod[Seq[BlockManagerId]]('sortLocations) + val sortLocations = PrivateMethod[Seq[BlockManagerId]](Symbol("sortLocations")) val locations = blockManager invokePrivate sortLocations(bmMaster.getLocations("test")) assert(locations.map(_.host) 
=== Seq(localHost, localHost, otherHost, otherHost, otherHost)) assert(locations.flatMap(_.topologyInfo) @@ -601,7 +657,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE // check getRemoteBytes val bytesViaStore1 = cleanBm.getRemoteBytes(blockId) assert(bytesViaStore1.isDefined) - val expectedContent = sameHostBm.getBlockData(blockId).nioByteBuffer().array() + val expectedContent = sameHostBm.getLocalBlockData(blockId).nioByteBuffer().array() assert(bytesViaStore1.get.toArray === expectedContent) // check getRemoteValues @@ -1042,7 +1098,7 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE val blockStatus = blockStatusOption.get assert((blockStatus.diskSize > 0) === !storageLevel.useMemory) assert((blockStatus.memSize > 0) === storageLevel.useMemory) - assert(blockManager.getBlockData(blockId).nioByteBuffer().array() === ser) + assert(blockManager.getLocalBlockData(blockId).nioByteBuffer().array() === ser) } Seq( @@ -1640,6 +1696,16 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(locs(blockIds(0)) == expectedLocs) } + test("SPARK-30594: Do not post SparkListenerBlockUpdated when updateBlockInfo returns false") { + // update block info for non-existent block manager + val updateInfo = UpdateBlockInfo(BlockManagerId("1", "host1", 100), + BlockId("test_1"), StorageLevel.MEMORY_ONLY, 1, 1) + val result = master.driverEndpoint.askSync[Boolean](updateInfo) + + assert(!result) + verify(liveListenerBus, never()).post(SparkListenerBlockUpdated(BlockUpdatedInfo(updateInfo))) + } + class MockBlockTransferService(val maxFailures: Int) extends BlockTransferService { var numCalls = 0 var tempFileManager: DownloadFileManager = null diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala index 0c4f3c48ef802..ccc525e854838 100644 --- 
a/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockManagerSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.storage import java.io.{File, FileWriter} -import java.util.UUID import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} @@ -33,14 +32,14 @@ class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with B var diskBlockManager: DiskBlockManager = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() rootDir0 = Utils.createTempDir() rootDir1 = Utils.createTempDir() rootDirs = rootDir0.getAbsolutePath + "," + rootDir1.getAbsolutePath } - override def afterAll() { + override def afterAll(): Unit = { try { Utils.deleteRecursively(rootDir0) Utils.deleteRecursively(rootDir1) @@ -49,14 +48,14 @@ class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with B } } - override def beforeEach() { + override def beforeEach(): Unit = { super.beforeEach() val conf = testConf.clone - conf.set("spark.local.dir", rootDirs) + conf.set("spark.local.dir", rootDirs).set("spark.diskStore.subDirectories", "1") diskBlockManager = new DiskBlockManager(conf, deleteFilesOnStop = true) } - override def afterEach() { + override def afterEach(): Unit = { try { diskBlockManager.stop() } finally { @@ -86,9 +85,50 @@ class DiskBlockManagerSuite extends SparkFunSuite with BeforeAndAfterEach with B assert(diskBlockManager.getAllBlocks().isEmpty) } - def writeToFile(file: File, numBytes: Int) { + def writeToFile(file: File, numBytes: Int): Unit = { val writer = new FileWriter(file, true) for (i <- 0 until numBytes) writer.write(i) writer.close() } + + test("temporary shuffle/local file should be able to handle disk failures") { + try { + // the following two lines pre-create subdirectories under each root dir of block manager + diskBlockManager.getFile("1") + diskBlockManager.getFile("2") + + val tempShuffleFile1 = 
diskBlockManager.createTempShuffleBlock()._2 + val tempLocalFile1 = diskBlockManager.createTempLocalBlock()._2 + assert(tempShuffleFile1.exists(), "There are no bad disks, so temp shuffle file exists") + assert(tempLocalFile1.exists(), "There are no bad disks, so temp local file exists") + + // partial disks damaged + rootDir0.setExecutable(false) + val tempShuffleFile2 = diskBlockManager.createTempShuffleBlock()._2 + val tempLocalFile2 = diskBlockManager.createTempLocalBlock()._2 + // It's possible that after 10 retries we still not able to find the healthy disk. we need to + // remove the flakiness of these two asserts + if (tempShuffleFile2.getParentFile.getParentFile.getParent === rootDir1.getAbsolutePath) { + assert(tempShuffleFile2.exists(), + "There is only one bad disk, so temp shuffle file should be created") + } + if (tempLocalFile2.getParentFile.getParentFile.getParent === rootDir1.getAbsolutePath) { + assert(tempLocalFile2.exists(), + "There is only one bad disk, so temp local file should be created") + } + + // all disks damaged + rootDir1.setExecutable(false) + val tempShuffleFile3 = diskBlockManager.createTempShuffleBlock()._2 + val tempLocalFile3 = diskBlockManager.createTempLocalBlock()._2 + assert(!tempShuffleFile3.exists(), + "All disks are broken, so there should be no temp shuffle file created") + assert(!tempLocalFile3.exists(), + "All disks are broken, so there should be no temp local file created") + } finally { + rootDir0.setExecutable(true) + rootDir1.setExecutable(true) + } + + } } diff --git a/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala b/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala index a7231411e81de..ccd7e4b62ad9e 100644 --- a/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/MemoryStoreSuite.scala @@ -55,7 +55,7 @@ class MemoryStoreSuite super.beforeEach() // Set the arch to 64-bit and compressedOops to true to get a 
deterministic test-case System.setProperty("os.arch", "amd64") - val initialize = PrivateMethod[Unit]('initialize) + val initialize = PrivateMethod[Unit](Symbol("initialize")) SizeEstimator invokePrivate initialize() } diff --git a/core/src/test/scala/org/apache/spark/storage/PartiallySerializedBlockSuite.scala b/core/src/test/scala/org/apache/spark/storage/PartiallySerializedBlockSuite.scala index 3dbc1c4b457a8..8177ef6e140b2 100644 --- a/core/src/test/scala/org/apache/spark/storage/PartiallySerializedBlockSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/PartiallySerializedBlockSuite.scala @@ -43,9 +43,10 @@ class PartiallySerializedBlockSuite private val memoryStore = Mockito.mock(classOf[MemoryStore], Mockito.RETURNS_SMART_NULLS) private val serializerManager = new SerializerManager(new JavaSerializer(conf), conf) - private val getSerializationStream = PrivateMethod[SerializationStream]('serializationStream) + private val getSerializationStream = + PrivateMethod[SerializationStream](Symbol("serializationStream")) private val getRedirectableOutputStream = - PrivateMethod[RedirectableOutputStream]('redirectableOutputStream) + PrivateMethod[RedirectableOutputStream](Symbol("redirectableOutputStream")) override protected def beforeEach(): Unit = { super.beforeEach() diff --git a/core/src/test/scala/org/apache/spark/storage/PartiallyUnrolledIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/PartiallyUnrolledIteratorSuite.scala index 56860b2e55709..74442c2966a72 100644 --- a/core/src/test/scala/org/apache/spark/storage/PartiallyUnrolledIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/PartiallyUnrolledIteratorSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.storage import org.mockito.ArgumentMatchers.{eq => meq} import org.mockito.Mockito._ -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.SparkFunSuite import 
org.apache.spark.memory.MemoryMode.ON_HEAP diff --git a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala index ed402440e74f1..45f47c7c49bca 100644 --- a/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/ShuffleBlockFetcherIteratorSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.storage import java.io._ import java.nio.ByteBuffer import java.util.UUID -import java.util.concurrent.Semaphore +import java.util.concurrent.{CompletableFuture, Semaphore} import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future @@ -33,7 +33,7 @@ import org.scalatest.PrivateMethodTester import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.network._ import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} -import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager} +import org.apache.spark.network.shuffle.{BlockFetchingListener, DownloadFileManager, ExternalBlockStoreClient} import org.apache.spark.network.util.LimitedInputStream import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util.Utils @@ -65,6 +65,29 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT transfer } + private def initHostLocalDirManager( + blockManager: BlockManager, + hostLocalDirs: Map[String, Array[String]]): Unit = { + val mockExternalBlockStoreClient = mock(classOf[ExternalBlockStoreClient]) + val hostLocalDirManager = new HostLocalDirManager( + futureExecutionContext = global, + cacheSize = 1, + externalBlockStoreClient = mockExternalBlockStoreClient, + host = "localhost", + externalShuffleServicePort = 7337) + + when(blockManager.hostLocalDirManager).thenReturn(Some(hostLocalDirManager)) + when(mockExternalBlockStoreClient.getHostLocalDirs(any(), 
any(), any(), any())) + .thenAnswer { invocation => + val completableFuture = invocation.getArguments()(3) + .asInstanceOf[CompletableFuture[java.util.Map[String, Array[String]]]] + import scala.collection.JavaConverters._ + completableFuture.complete(hostLocalDirs.asJava) + } + + blockManager.hostLocalDirManager = Some(hostLocalDirManager) + } + // Create a mock managed buffer for testing def createMockManagedBuffer(size: Int = 1): ManagedBuffer = { val mockManagedBuffer = mock(classOf[ManagedBuffer]) @@ -76,9 +99,24 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT mockManagedBuffer } - test("successful 3 local reads + 2 remote reads") { + def verifyBufferRelease(buffer: ManagedBuffer, inputStream: InputStream): Unit = { + // Note: ShuffleBlockFetcherIterator wraps input streams in a BufferReleasingInputStream + val wrappedInputStream = inputStream.asInstanceOf[BufferReleasingInputStream] + verify(buffer, times(0)).release() + val delegateAccess = PrivateMethod[InputStream](Symbol("delegate")) + + verify(wrappedInputStream.invokePrivate(delegateAccess()), times(0)).close() + wrappedInputStream.close() + verify(buffer, times(1)).release() + verify(wrappedInputStream.invokePrivate(delegateAccess()), times(1)).close() + wrappedInputStream.close() // close should be idempotent + verify(buffer, times(1)).release() + verify(wrappedInputStream.invokePrivate(delegateAccess()), times(1)).close() + } + + test("successful 3 local + 4 host local + 2 remote reads") { val blockManager = mock(classOf[BlockManager]) - val localBmId = BlockManagerId("test-client", "test-client", 1) + val localBmId = BlockManagerId("test-local-client", "test-local-host", 1) doReturn(localBmId).when(blockManager).blockManagerId // Make sure blockManager.getBlockData would return the blocks @@ -87,20 +125,38 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT ShuffleBlockId(0, 1, 0) -> createMockManagedBuffer(), ShuffleBlockId(0, 2, 
0) -> createMockManagedBuffer()) localBlocks.foreach { case (blockId, buf) => - doReturn(buf).when(blockManager).getBlockData(meq(blockId)) + doReturn(buf).when(blockManager).getLocalBlockData(meq(blockId)) } // Make sure remote blocks would return - val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) + val remoteBmId = BlockManagerId("test-remote-client-1", "test-remote-host", 2) val remoteBlocks = Map[BlockId, ManagedBuffer]( ShuffleBlockId(0, 3, 0) -> createMockManagedBuffer(), ShuffleBlockId(0, 4, 0) -> createMockManagedBuffer()) val transfer = createMockTransfer(remoteBlocks) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (localBmId, localBlocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq), - (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq) + // Create a block manager running on the same host (host-local) + val hostLocalBmId = BlockManagerId("test-host-local-client-1", "test-local-host", 3) + val hostLocalBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockId(0, 5, 0) -> createMockManagedBuffer(), + ShuffleBlockId(0, 6, 0) -> createMockManagedBuffer(), + ShuffleBlockId(0, 7, 0) -> createMockManagedBuffer(), + ShuffleBlockId(0, 8, 0) -> createMockManagedBuffer()) + + hostLocalBlocks.foreach { case (blockId, buf) => + doReturn(buf) + .when(blockManager) + .getHostLocalShuffleData(meq(blockId.asInstanceOf[ShuffleBlockId]), any()) + } + val hostLocalDirs = Map("test-host-local-client-1" -> Array("local-dir")) + // returning local dir for hostLocalBmId + initHostLocalDirManager(blockManager, hostLocalDirs) + + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (localBmId, localBlocks.keys.map(blockId => (blockId, 1L, 0)).toSeq), + (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 1L, 1)).toSeq), + (hostLocalBmId, hostLocalBlocks.keys.map(blockId => (blockId, 1L, 1)).toSeq) ).toIterator val taskContext = TaskContext.empty() @@ -117,35 +173,229 @@ 
class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, false, - metrics) + metrics, + false) // 3 local blocks fetched in initialization - verify(blockManager, times(3)).getBlockData(any()) + verify(blockManager, times(3)).getLocalBlockData(any()) - for (i <- 0 until 5) { - assert(iterator.hasNext, s"iterator should have 5 elements but actually has $i elements") + val allBlocks = localBlocks ++ remoteBlocks ++ hostLocalBlocks + for (i <- 0 until allBlocks.size) { + assert(iterator.hasNext, + s"iterator should have ${allBlocks.size} elements but actually has $i elements") val (blockId, inputStream) = iterator.next() // Make sure we release buffers when a wrapped input stream is closed. - val mockBuf = localBlocks.getOrElse(blockId, remoteBlocks(blockId)) - // Note: ShuffleBlockFetcherIterator wraps input streams in a BufferReleasingInputStream - val wrappedInputStream = inputStream.asInstanceOf[BufferReleasingInputStream] - verify(mockBuf, times(0)).release() - val delegateAccess = PrivateMethod[InputStream]('delegate) - - verify(wrappedInputStream.invokePrivate(delegateAccess()), times(0)).close() - wrappedInputStream.close() - verify(mockBuf, times(1)).release() - verify(wrappedInputStream.invokePrivate(delegateAccess()), times(1)).close() - wrappedInputStream.close() // close should be idempotent - verify(mockBuf, times(1)).release() - verify(wrappedInputStream.invokePrivate(delegateAccess()), times(1)).close() + val mockBuf = allBlocks(blockId) + verifyBufferRelease(mockBuf, inputStream) } - // 3 local blocks, and 2 remote blocks - // (but from the same block manager so one call to fetchBlocks) - verify(blockManager, times(3)).getBlockData(any()) + // 4 host-local locks fetched + verify(blockManager, times(4)) + .getHostLocalShuffleData(any(), meq(Array("local-dir"))) + + // 2 remote blocks are read from the same block manager verify(transfer, times(1)).fetchBlocks(any(), any(), any(), any(), any(), any()) + 
assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) + } + + test("error during accessing host local dirs for executors") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-local-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + val hostLocalBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockId(0, 1, 0) -> createMockManagedBuffer()) + + hostLocalBlocks.foreach { case (blockId, buf) => + doReturn(buf) + .when(blockManager) + .getHostLocalShuffleData(meq(blockId.asInstanceOf[ShuffleBlockId]), any()) + } + val hostLocalBmId = BlockManagerId("test-host-local-client-1", "test-local-host", 3) + + val mockExternalBlockStoreClient = mock(classOf[ExternalBlockStoreClient]) + val hostLocalDirManager = new HostLocalDirManager( + futureExecutionContext = global, + cacheSize = 1, + externalBlockStoreClient = mockExternalBlockStoreClient, + host = "localhost", + externalShuffleServicePort = 7337) + + when(blockManager.hostLocalDirManager).thenReturn(Some(hostLocalDirManager)) + when(mockExternalBlockStoreClient.getHostLocalDirs(any(), any(), any(), any())) + .thenAnswer { invocation => + val completableFuture = invocation.getArguments()(3) + .asInstanceOf[CompletableFuture[java.util.Map[String, Array[String]]]] + completableFuture.completeExceptionally(new Throwable("failed fetch")) + } + + blockManager.hostLocalDirManager = Some(hostLocalDirManager) + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (hostLocalBmId, hostLocalBlocks.keys.map(blockId => (blockId, 1L, 1)).toSeq) + ).toIterator + + val transfer = createMockTransfer(Map()) + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 48 * 1024 * 1024, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + 
false, + metrics, + false) + intercept[FetchFailedException] { iterator.next() } + } + + test("fetch continuous blocks in batch successful 3 local + 4 host local + 2 remote reads") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + + // Make sure blockManager.getBlockData would return the merged block + val localBlocks = Seq[BlockId]( + ShuffleBlockId(0, 0, 0), + ShuffleBlockId(0, 0, 1), + ShuffleBlockId(0, 0, 2)) + val mergedLocalBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 0, 0, 3) -> createMockManagedBuffer()) + mergedLocalBlocks.foreach { case (blockId, buf) => + doReturn(buf).when(blockManager).getLocalBlockData(meq(blockId)) + } + + // Make sure remote blocks would return the merged block + val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) + val remoteBlocks = Seq[BlockId]( + ShuffleBlockId(0, 3, 0), + ShuffleBlockId(0, 3, 1)) + val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 2) -> createMockManagedBuffer()) + val transfer = createMockTransfer(mergedRemoteBlocks) + + // Create a block manager running on the same host (host-local) + val hostLocalBmId = BlockManagerId("test-host-local-client-1", "test-local-host", 3) + val hostLocalBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockId(0, 4, 0) -> createMockManagedBuffer(), + ShuffleBlockId(0, 4, 1) -> createMockManagedBuffer(), + ShuffleBlockId(0, 4, 2) -> createMockManagedBuffer(), + ShuffleBlockId(0, 4, 3) -> createMockManagedBuffer()) + val mergedHostLocalBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 4, 0, 4) -> createMockManagedBuffer()) + + mergedHostLocalBlocks.foreach { case (blockId, buf) => + doReturn(buf) + .when(blockManager) + .getHostLocalShuffleData(meq(blockId.asInstanceOf[ShuffleBlockBatchId]), any()) + } + val hostLocalDirs = Map("test-host-local-client-1" -> Array("local-dir")) 
+ // returning local dir for hostLocalBmId + initHostLocalDirManager(blockManager, hostLocalDirs) + + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (localBmId, localBlocks.map(blockId => (blockId, 1L, 0))), + (remoteBmId, remoteBlocks.map(blockId => (blockId, 1L, 1))), + (hostLocalBmId, hostLocalBlocks.keys.map(blockId => (blockId, 1L, 1)).toSeq) + ).toIterator + + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 48 * 1024 * 1024, + Int.MaxValue, + Int.MaxValue, + Int.MaxValue, + true, + false, + metrics, + true) + + // 3 local blocks batch fetched in initialization + verify(blockManager, times(1)).getLocalBlockData(any()) + + val allBlocks = mergedLocalBlocks ++ mergedRemoteBlocks ++ mergedHostLocalBlocks + for (i <- 0 until 3) { + assert(iterator.hasNext, s"iterator should have 3 elements but actually has $i elements") + val (blockId, inputStream) = iterator.next() + verify(transfer, times(1)).fetchBlocks(any(), any(), any(), any(), any(), any()) + // Make sure we release buffers when a wrapped input stream is closed. 
+ val mockBuf = allBlocks(blockId) + verifyBufferRelease(mockBuf, inputStream) + } + + // 4 host-local locks fetched + verify(blockManager, times(1)) + .getHostLocalShuffleData(any(), meq(Array("local-dir"))) + + assert(blockManager.hostLocalDirManager.get.getCachedHostLocalDirs().size === 1) + } + + test("fetch continuous blocks in batch respects maxSize and maxBlocks") { + val blockManager = mock(classOf[BlockManager]) + val localBmId = BlockManagerId("test-client", "test-local-host", 1) + doReturn(localBmId).when(blockManager).blockManagerId + + // Make sure remote blocks would return the merged block + val remoteBmId = BlockManagerId("test-client-1", "test-client-1", 2) + val remoteBlocks = Seq[BlockId]( + ShuffleBlockId(0, 3, 0), + ShuffleBlockId(0, 3, 1), + ShuffleBlockId(0, 3, 2), + ShuffleBlockId(0, 4, 0), + ShuffleBlockId(0, 4, 1), + ShuffleBlockId(0, 5, 0), + ShuffleBlockId(0, 5, 1), + ShuffleBlockId(0, 5, 2)) + val mergedRemoteBlocks = Map[BlockId, ManagedBuffer]( + ShuffleBlockBatchId(0, 3, 0, 3) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 4, 0, 2) -> createMockManagedBuffer(), + ShuffleBlockBatchId(0, 5, 0, 3) -> createMockManagedBuffer()) + val transfer = createMockTransfer(mergedRemoteBlocks) + + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, remoteBlocks.map(blockId => (blockId, 1L, 1))) + ).toIterator + + val taskContext = TaskContext.empty() + val metrics = taskContext.taskMetrics.createTempShuffleReadMetrics() + val iterator = new ShuffleBlockFetcherIterator( + taskContext, + transfer, + blockManager, + blocksByAddress, + (_, in) => in, + 35, + Int.MaxValue, + 2, + Int.MaxValue, + true, + false, + metrics, + true) + + var numResults = 0 + while (iterator.hasNext) { + val (blockId, inputStream) = iterator.next() + // Make sure we release buffers when a wrapped input stream is closed. 
+ val mockBuf = mergedRemoteBlocks(blockId) + verifyBufferRelease(mockBuf, inputStream) + numResults += 1 + } + // The first 2 batch block ids are in the same fetch request as they don't exceed the max size + // and max blocks, so 2 requests in total. + verify(transfer, times(2)).fetchBlocks(any(), any(), any(), any(), any(), any()) + assert(numResults == 3) } test("release current unexhausted buffer in case the task completes early") { @@ -179,8 +429,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT } }) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)).toIterator + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, blocks.keys.map(blockId => (blockId, 1L, 0)).toSeq)).toIterator val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( @@ -195,7 +445,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, false, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) verify(blocks(ShuffleBlockId(0, 0, 0)), times(0)).release() iterator.next()._2.close() // close() first block's input stream @@ -247,8 +498,9 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT } }) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)).toIterator + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, blocks.keys.map(blockId => (blockId, 1L, 0)).toSeq)) + .toIterator val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( @@ -263,7 +515,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, false, - 
taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) // Continue only after the mock calls onBlockFetchFailure sem.acquire() @@ -336,8 +589,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT } }) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)).toIterator + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, blocks.keys.map(blockId => (blockId, 1L, 0)).toSeq)).toIterator val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( @@ -352,7 +605,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, true, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) // Continue only after the mock calls onBlockFetchFailure sem.acquire() @@ -389,8 +643,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val corruptBuffer1 = mockCorruptBuffer(streamLength, 0) val blockManagerId1 = BlockManagerId("remote-client-1", "remote-client-1", 1) val shuffleBlockId1 = ShuffleBlockId(0, 1, 0) - val blockLengths1 = Seq[Tuple2[BlockId, Long]]( - shuffleBlockId1 -> corruptBuffer1.size() + val blockLengths1 = Seq[Tuple3[BlockId, Long, Int]]( + (shuffleBlockId1, corruptBuffer1.size(), 1) ) val streamNotCorruptTill = 8 * 1024 @@ -398,13 +652,13 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val corruptBuffer2 = mockCorruptBuffer(streamLength, streamNotCorruptTill) val blockManagerId2 = BlockManagerId("remote-client-2", "remote-client-2", 2) val shuffleBlockId2 = ShuffleBlockId(0, 2, 0) - val blockLengths2 = Seq[Tuple2[BlockId, Long]]( - shuffleBlockId2 -> corruptBuffer2.size() + val blockLengths2 = Seq[Tuple3[BlockId, Long, Int]]( + 
(shuffleBlockId2, corruptBuffer2.size(), 2) ) val transfer = createMockTransfer( Map(shuffleBlockId1 -> corruptBuffer1, shuffleBlockId2 -> corruptBuffer2)) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( (blockManagerId1, blockLengths1), (blockManagerId2, blockLengths2) ).toIterator @@ -422,7 +676,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, true, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) // We'll get back the block which has corruption after maxBytesInFlight/3 because the other // block will detect corruption on first fetch, and then get added to the queue again for @@ -464,12 +719,12 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val blockManager = mock(classOf[BlockManager]) val localBmId = BlockManagerId("test-client", "test-client", 1) doReturn(localBmId).when(blockManager).blockManagerId - doReturn(managedBuffer).when(blockManager).getBlockData(ShuffleBlockId(0, 0, 0)) - val localBlockLengths = Seq[Tuple2[BlockId, Long]]( - ShuffleBlockId(0, 0, 0) -> 10000 + doReturn(managedBuffer).when(blockManager).getLocalBlockData(meq(ShuffleBlockId(0, 0, 0))) + val localBlockLengths = Seq[Tuple3[BlockId, Long, Int]]( + (ShuffleBlockId(0, 0, 0), 10000, 0) ) val transfer = createMockTransfer(Map(ShuffleBlockId(0, 0, 0) -> managedBuffer)) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( (localBmId, localBlockLengths) ).toIterator @@ -486,7 +741,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, true, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) val (id, st) = iterator.next() // 
Check that the test setup is correct -- make sure we have a concatenated stream. assert (st.asInstanceOf[BufferReleasingInputStream].delegate.isInstanceOf[SequenceInputStream]) @@ -531,8 +787,9 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT } }) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)).toIterator + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, blocks.keys.map(blockId => (blockId, 1L, 0)).toSeq)) + .toIterator val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( @@ -547,7 +804,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, false, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) // Continue only after the mock calls onBlockFetchFailure sem.acquire() @@ -591,7 +849,7 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT }) def fetchShuffleBlock( - blocksByAddress: Iterator[(BlockManagerId, Seq[(BlockId, Long)])]): Unit = { + blocksByAddress: Iterator[(BlockManagerId, Seq[(BlockId, Long, Int)])]): Unit = { // Set `maxBytesInFlight` and `maxReqsInFlight` to `Int.MaxValue`, so that during the // construction of `ShuffleBlockFetcherIterator`, all requests to fetch remote shuffle blocks // are issued. The `maxReqSizeShuffleToMem` is hard-coded as 200 here. 
@@ -608,18 +866,19 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT maxReqSizeShuffleToMem = 200, detectCorrupt = true, false, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) } - val blocksByAddress1 = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 100L)).toSeq)).toIterator + val blocksByAddress1 = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 100L, 0)).toSeq)).toIterator fetchShuffleBlock(blocksByAddress1) // `maxReqSizeShuffleToMem` is 200, which is greater than the block size 100, so don't fetch // shuffle block to disk. assert(tempFileManager == null) - val blocksByAddress2 = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 300L)).toSeq)).toIterator + val blocksByAddress2 = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, remoteBlocks.keys.map(blockId => (blockId, 300L, 0)).toSeq)).toIterator fetchShuffleBlock(blocksByAddress2) // `maxReqSizeShuffleToMem` is 200, which is smaller than the block size 300, so fetch // shuffle block to disk. 
@@ -640,8 +899,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT val transfer = createMockTransfer(blocks.mapValues(_ => createMockManagedBuffer(0))) - val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long)])]( - (remoteBmId, blocks.keys.map(blockId => (blockId, 1.asInstanceOf[Long])).toSeq)) + val blocksByAddress = Seq[(BlockManagerId, Seq[(BlockId, Long, Int)])]( + (remoteBmId, blocks.keys.map(blockId => (blockId, 1L, 0)).toSeq)) val taskContext = TaskContext.empty() val iterator = new ShuffleBlockFetcherIterator( @@ -656,7 +915,8 @@ class ShuffleBlockFetcherIteratorSuite extends SparkFunSuite with PrivateMethodT Int.MaxValue, true, false, - taskContext.taskMetrics.createTempShuffleReadMetrics()) + taskContext.taskMetrics.createTempShuffleReadMetrics(), + false) // All blocks fetched return zero length and should trigger a receive-side error: val e = intercept[FetchFailedException] { iterator.next() } diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index bd18e9e628da8..7711934cbe8a6 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -27,6 +27,7 @@ import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} import org.apache.spark._ import org.apache.spark.executor.{ExecutorMetrics, TaskMetrics} import org.apache.spark.internal.config.Status._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore import org.apache.spark.status.api.v1.{AccumulableInfo => UIAccumulableInfo, StageData, StageStatus} @@ -131,7 +132,8 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { val page = new StagePage(tab, statusStore) // Simulate a stage in job progress listener - val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details") + val stageInfo 
= new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) // Simulate two tasks to test PEAK_EXECUTION_MEMORY correctness (1 to 2).foreach { taskId => diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 1913b8d425519..9f0cdeac9ca39 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.ui -import java.net.{HttpURLConnection, URL} +import java.net.URL import java.util.Locale import javax.servlet.http.{HttpServletRequest, HttpServletResponse} @@ -31,8 +31,8 @@ import org.openqa.selenium.{By, WebDriver} import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.scalatest._ import org.scalatest.concurrent.Eventually._ -import org.scalatest.selenium.WebBrowser import org.scalatest.time.SpanSugar._ +import org.scalatestplus.selenium.WebBrowser import org.w3c.css.sac.CSSParseException import org.apache.spark._ @@ -233,7 +233,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B test("spark.ui.killEnabled should properly control kill button display") { def hasKillLink: Boolean = find(className("kill-link")).isDefined - def runSlowJob(sc: SparkContext) { + def runSlowJob(sc: SparkContext): Unit = { sc.parallelize(1 to 10).map{x => Thread.sleep(10000); x}.countAsync() } @@ -316,10 +316,12 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B val env = SparkEnv.get val bmAddress = env.blockManager.blockManagerId val shuffleId = shuffleHandle.shuffleId - val mapId = 0 + val mapId = 0L + val mapIndex = 0 val reduceId = taskContext.partitionId() val message = "Simulated fetch failure" - throw new FetchFailedException(bmAddress, shuffleId, mapId, reduceId, message) + throw new FetchFailedException( + bmAddress, shuffleId, 
mapId, mapIndex, reduceId, message) } else { x } @@ -754,6 +756,22 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } } + test("description for empty jobs") { + withSpark(newSparkContext()) { sc => + sc.emptyRDD[Int].collect + val description = "This is my job" + sc.setJobDescription(description) + sc.emptyRDD[Int].collect + + eventually(timeout(10.seconds), interval(50.milliseconds)) { + goToUi(sc, "/jobs") + val descriptions = findAll(className("description-input")).toArray + descriptions(0).text should be (description) + descriptions(1).text should include ("collect") + } + } + } + def goToUi(sc: SparkContext, path: String): Unit = { goToUi(sc.ui.get, path) } diff --git a/core/src/test/scala/org/apache/spark/ui/UISuite.scala b/core/src/test/scala/org/apache/spark/ui/UISuite.scala index 34fd218437f87..2ad4a634cd9a7 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISuite.scala @@ -32,7 +32,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark._ import org.apache.spark.LocalSparkContext._ -import org.apache.spark.internal.config.UI.UI_ENABLED +import org.apache.spark.internal.config.UI import org.apache.spark.util.Utils class UISuite extends SparkFunSuite { @@ -45,7 +45,7 @@ class UISuite extends SparkFunSuite { val conf = new SparkConf() .setMaster("local") .setAppName("test") - .set(UI_ENABLED, true) + .set(UI.UI_ENABLED, true) val sc = new SparkContext(conf) assert(sc.ui.isDefined) sc @@ -273,7 +273,6 @@ class UISuite extends SparkFunSuite { val (_, testContext) = newContext("/test2") serverInfo.addHandler(testContext, securityMgr) - testContext.start() val httpPort = serverInfo.boundPort @@ -318,6 +317,54 @@ class UISuite extends SparkFunSuite { } } + test("redirect with proxy server support") { + val proxyRoot = "https://proxy.example.com:443/prefix" + val (conf, securityMgr, sslOptions) = sslDisabledConf() + conf.set(UI.PROXY_REDIRECT_URI, 
proxyRoot) + + val serverInfo = JettyUtils.startJettyServer("0.0.0.0", 0, sslOptions, conf) + try { + val serverAddr = s"http://localhost:${serverInfo.boundPort}" + + val (_, ctx) = newContext("/ctx1") + serverInfo.addHandler(ctx, securityMgr) + + val redirect = JettyUtils.createRedirectHandler("/src", "/dst") + serverInfo.addHandler(redirect, securityMgr) + + // Test Jetty's built-in redirect to add the trailing slash to the context path. + TestUtils.withHttpConnection(new URL(s"$serverAddr/ctx1")) { conn => + assert(conn.getResponseCode() === HttpServletResponse.SC_FOUND) + val location = Option(conn.getHeaderFields().get("Location")) + .map(_.get(0)).orNull + assert(location === s"$proxyRoot/ctx1/") + } + + // Test with a URL handled by the added redirect handler, and also including a path prefix. + val headers = Seq("X-Forwarded-Context" -> "/prefix") + TestUtils.withHttpConnection( + new URL(s"$serverAddr/src/"), + headers = headers) { conn => + assert(conn.getResponseCode() === HttpServletResponse.SC_FOUND) + val location = Option(conn.getHeaderFields().get("Location")) + .map(_.get(0)).orNull + assert(location === s"$proxyRoot/prefix/dst") + } + + // Not really used by Spark, but test with a relative redirect. + val relative = JettyUtils.createRedirectHandler("/rel", "root") + serverInfo.addHandler(relative, securityMgr) + TestUtils.withHttpConnection(new URL(s"$serverAddr/rel/")) { conn => + assert(conn.getResponseCode() === HttpServletResponse.SC_FOUND) + val location = Option(conn.getHeaderFields().get("Location")) + .map(_.get(0)).orNull + assert(location === s"$proxyRoot/rel/root") + } + } finally { + stopServer(serverInfo) + } + } + /** * Create a new context handler for the given path, with a single servlet that responds to * requests in `$path/root`. 
diff --git a/core/src/test/scala/org/apache/spark/ui/UIUtilsSuite.scala b/core/src/test/scala/org/apache/spark/ui/UIUtilsSuite.scala index de105b6f188f5..82773e3cc6860 100644 --- a/core/src/test/scala/org/apache/spark/ui/UIUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UIUtilsSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ui import scala.xml.{Node, Text} +import scala.xml.Utility.trim import org.apache.spark.SparkFunSuite @@ -129,6 +130,55 @@ class UIUtilsSuite extends SparkFunSuite { assert(decoded1 === decodeURLParameter(decoded1)) } + test("listingTable with tooltips") { + + def generateDataRowValue: String => Seq[Node] = row => {row} + val header = Seq("Header1", "Header2") + val data = Seq("Data1", "Data2") + val tooltip = Seq(None, Some("tooltip")) + + val generated = listingTable(header, generateDataRowValue, data, tooltipHeaders = tooltip) + + val expected: Node = +
    Pool NameMinimum SharePool Weight + Minimum Share + + Pool Weight + Active Stages Running TasksSchedulingModeScheduling Mode
    {errorSummary}{details}{failureReasonSummary}{details}
    + + + + + + {data.map(generateDataRowValue)} + +
    {header(0)} + + {header(1)} + +
    + + assert(trim(generated(0)) == trim(expected)) + } + + test("listingTable without tooltips") { + + def generateDataRowValue: String => Seq[Node] = row => {row} + val header = Seq("Header1", "Header2") + val data = Seq("Data1", "Data2") + + val generated = listingTable(header, generateDataRowValue, data) + + val expected = + + + + + + + {data.map(generateDataRowValue)} + +
    {header(0)}{header(1)}
    + + assert(trim(generated(0)) == trim(expected)) + } + private def verify( desc: String, expected: Node, diff --git a/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala index 06f01a60868f9..f93ecd3b006b2 100644 --- a/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/storage/StoragePageSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.ui.storage import javax.servlet.http.HttpServletRequest import org.mockito.Mockito._ +import scala.xml.{Node, Text} import org.apache.spark.SparkFunSuite import org.apache.spark.status.StreamBlockData @@ -74,7 +75,21 @@ class StoragePageSuite extends SparkFunSuite { "Fraction Cached", "Size in Memory", "Size on Disk") - assert((xmlNodes \\ "th").map(_.text) === headers) + + val headerRow: Seq[Node] = { + headers.view.zipWithIndex.map { x => + storagePage.tooltips(x._2) match { + case Some(tooltip) => + + + {Text(x._1)} + + + case None => {Text(x._1)} + } + }.toList + } + assert((xmlNodes \\ "th").map(_.text) === headerRow.map(_.text)) assert((xmlNodes \\ "tr").size === 3) assert(((xmlNodes \\ "tr")(0) \\ "td").map(_.text.trim) === diff --git a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala index 5e08a3dc1181d..b0520c7ab1b1f 100644 --- a/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/ClosureCleanerSuite.scala @@ -299,7 +299,7 @@ private object TestUserClosuresActuallyCleaned { rdd.aggregateByKey(0)({ case (_, _) => return; 1 }, { case (_, _) => return; 1 }).count() } def testFoldByKey(rdd: RDD[(Int, Int)]): Unit = { rdd.foldByKey(0) { case (_, _) => return; 1 } } - def testReduceByKey(rdd: RDD[(Int, Int)]): Unit = { rdd.reduceByKey { case (_, _) => return; 1 } } + def testReduceByKey(rdd: RDD[(Int, Int)]): Unit = 
{ rdd.reduceByKey { (_, _) => return; 1 } } def testReduceByKeyLocally(rdd: RDD[(Int, Int)]): Unit = { rdd.reduceByKeyLocally { case (_, _) => return; 1 } } diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index f5f93ece660b8..21e69550785a4 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -356,7 +356,7 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { } /** Delete all the generated rolled over files */ - def cleanup() { + def cleanup(): Unit = { testFile.getParentFile.listFiles.filter { file => file.getName.startsWith(testFile.getName) }.foreach { _.delete() } diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index e781c5f71faf4..edc0662a0f73e 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -32,8 +32,7 @@ import org.apache.spark._ import org.apache.spark.executor._ import org.apache.spark.metrics.ExecutorMetricType import org.apache.spark.rdd.RDDOperationScope -import org.apache.spark.resource.ResourceInformation -import org.apache.spark.resource.ResourceUtils +import org.apache.spark.resource.{ResourceInformation, ResourceProfile, ResourceUtils} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo import org.apache.spark.shuffle.MetadataFetchFailedException @@ -179,7 +178,7 @@ class JsonProtocolSuite extends SparkFunSuite { testJobResult(jobFailed) // TaskEndReason - val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 18, 19, + val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, "Some exception") val fetchMetadataFailed = new 
MetadataFetchFailedException(17, 19, "metadata Fetch failed exception").toTaskFailedReason @@ -296,12 +295,12 @@ class JsonProtocolSuite extends SparkFunSuite { test("FetchFailed backwards compatibility") { // FetchFailed in Spark 1.1.0 does not have a "Message" property. - val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 18, 19, + val fetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, 18, 19, "ignored") val oldEvent = JsonProtocol.taskEndReasonToJson(fetchFailed) .removeField({ _._1 == "Message" }) - val expectedFetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 18, 19, - "Unknown reason") + val expectedFetchFailed = FetchFailed(BlockManagerId("With or", "without you", 15), 17, 16L, + 18, 19, "Unknown reason") assert(expectedFetchFailed === JsonProtocol.taskEndReasonFromJson(oldEvent)) } @@ -341,7 +340,8 @@ class JsonProtocolSuite extends SparkFunSuite { val stageIds = Seq[Int](1, 2, 3, 4) val stageInfos = stageIds.map(x => makeStageInfo(x, x * 200, x * 300, x * 400L, x * 500L)) val dummyStageInfos = - stageIds.map(id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown")) + stageIds.map(id => new StageInfo(id, 0, "unknown", 0, Seq.empty, Seq.empty, "unknown", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID)) val jobStart = SparkListenerJobStart(10, jobSubmissionTime, stageInfos, properties) val oldEvent = JsonProtocol.jobStartToJson(jobStart).removeField({_._1 == "Stage Infos"}) val expectedJobStart = @@ -383,9 +383,11 @@ class JsonProtocolSuite extends SparkFunSuite { test("StageInfo backward compatibility (parent IDs)") { // Prior to Spark 1.4.0, StageInfo did not have the "Parent IDs" property - val stageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq(1, 2, 3), "details") + val stageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq(1, 2, 3), "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val 
oldStageInfo = JsonProtocol.stageInfoToJson(stageInfo).removeField({ _._1 == "Parent IDs"}) - val expectedStageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq.empty, "details") + val expectedStageInfo = new StageInfo(1, 1, "me-stage", 1, Seq.empty, Seq.empty, "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) assertEquals(expectedStageInfo, JsonProtocol.stageInfoFromJson(oldStageInfo)) } @@ -496,59 +498,59 @@ private[spark] object JsonProtocolSuite extends Assertions { private val nodeBlacklistedTime = 1421458952000L private val nodeUnblacklistedTime = 1421458962000L - private def testEvent(event: SparkListenerEvent, jsonString: String) { + private def testEvent(event: SparkListenerEvent, jsonString: String): Unit = { val actualJsonString = compact(render(JsonProtocol.sparkEventToJson(event))) val newEvent = JsonProtocol.sparkEventFromJson(parse(actualJsonString)) assertJsonStringEquals(jsonString, actualJsonString, event.getClass.getSimpleName) assertEquals(event, newEvent) } - private def testRDDInfo(info: RDDInfo) { + private def testRDDInfo(info: RDDInfo): Unit = { val newInfo = JsonProtocol.rddInfoFromJson(JsonProtocol.rddInfoToJson(info)) assertEquals(info, newInfo) } - private def testStageInfo(info: StageInfo) { + private def testStageInfo(info: StageInfo): Unit = { val newInfo = JsonProtocol.stageInfoFromJson(JsonProtocol.stageInfoToJson(info)) assertEquals(info, newInfo) } - private def testStorageLevel(level: StorageLevel) { + private def testStorageLevel(level: StorageLevel): Unit = { val newLevel = JsonProtocol.storageLevelFromJson(JsonProtocol.storageLevelToJson(level)) assertEquals(level, newLevel) } - private def testTaskMetrics(metrics: TaskMetrics) { + private def testTaskMetrics(metrics: TaskMetrics): Unit = { val newMetrics = JsonProtocol.taskMetricsFromJson(JsonProtocol.taskMetricsToJson(metrics)) assertEquals(metrics, newMetrics) } - private def testBlockManagerId(id: BlockManagerId) { + private def 
testBlockManagerId(id: BlockManagerId): Unit = { val newId = JsonProtocol.blockManagerIdFromJson(JsonProtocol.blockManagerIdToJson(id)) assert(id === newId) } - private def testTaskInfo(info: TaskInfo) { + private def testTaskInfo(info: TaskInfo): Unit = { val newInfo = JsonProtocol.taskInfoFromJson(JsonProtocol.taskInfoToJson(info)) assertEquals(info, newInfo) } - private def testJobResult(result: JobResult) { + private def testJobResult(result: JobResult): Unit = { val newResult = JsonProtocol.jobResultFromJson(JsonProtocol.jobResultToJson(result)) assertEquals(result, newResult) } - private def testTaskEndReason(reason: TaskEndReason) { + private def testTaskEndReason(reason: TaskEndReason): Unit = { val newReason = JsonProtocol.taskEndReasonFromJson(JsonProtocol.taskEndReasonToJson(reason)) assertEquals(reason, newReason) } - private def testBlockId(blockId: BlockId) { + private def testBlockId(blockId: BlockId): Unit = { val newBlockId = BlockId(blockId.toString) assert(blockId === newBlockId) } - private def testExecutorInfo(info: ExecutorInfo) { + private def testExecutorInfo(info: ExecutorInfo): Unit = { val newInfo = JsonProtocol.executorInfoFromJson(JsonProtocol.executorInfoToJson(info)) assertEquals(info, newInfo) } @@ -565,7 +567,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | Util methods for comparing events | * --------------------------------- */ - private[spark] def assertEquals(event1: SparkListenerEvent, event2: SparkListenerEvent) { + private[spark] def assertEquals(event1: SparkListenerEvent, event2: SparkListenerEvent): Unit = { (event1, event2) match { case (e1: SparkListenerStageSubmitted, e2: SparkListenerStageSubmitted) => assert(e1.properties === e2.properties) @@ -633,7 +635,7 @@ private[spark] object JsonProtocolSuite extends Assertions { } } - private def assertEquals(info1: StageInfo, info2: StageInfo) { + private def assertEquals(info1: StageInfo, info2: StageInfo): Unit = { assert(info1.stageId === info2.stageId) 
assert(info1.name === info2.name) assert(info1.numTasks === info2.numTasks) @@ -647,7 +649,7 @@ private[spark] object JsonProtocolSuite extends Assertions { assert(info1.details === info2.details) } - private def assertEquals(info1: RDDInfo, info2: RDDInfo) { + private def assertEquals(info1: RDDInfo, info2: RDDInfo): Unit = { assert(info1.id === info2.id) assert(info1.name === info2.name) assert(info1.numPartitions === info2.numPartitions) @@ -657,14 +659,14 @@ private[spark] object JsonProtocolSuite extends Assertions { assertEquals(info1.storageLevel, info2.storageLevel) } - private def assertEquals(level1: StorageLevel, level2: StorageLevel) { + private def assertEquals(level1: StorageLevel, level2: StorageLevel): Unit = { assert(level1.useDisk === level2.useDisk) assert(level1.useMemory === level2.useMemory) assert(level1.deserialized === level2.deserialized) assert(level1.replication === level2.replication) } - private def assertEquals(info1: TaskInfo, info2: TaskInfo) { + private def assertEquals(info1: TaskInfo, info2: TaskInfo): Unit = { assert(info1.taskId === info2.taskId) assert(info1.index === info2.index) assert(info1.attemptNumber === info2.attemptNumber) @@ -679,12 +681,12 @@ private[spark] object JsonProtocolSuite extends Assertions { assert(info1.accumulables === info2.accumulables) } - private def assertEquals(info1: ExecutorInfo, info2: ExecutorInfo) { + private def assertEquals(info1: ExecutorInfo, info2: ExecutorInfo): Unit = { assert(info1.executorHost == info2.executorHost) assert(info1.totalCores == info2.totalCores) } - private def assertEquals(metrics1: TaskMetrics, metrics2: TaskMetrics) { + private def assertEquals(metrics1: TaskMetrics, metrics2: TaskMetrics): Unit = { assert(metrics1.executorDeserializeTime === metrics2.executorDeserializeTime) assert(metrics1.executorDeserializeCpuTime === metrics2.executorDeserializeCpuTime) assert(metrics1.executorRunTime === metrics2.executorRunTime) @@ -700,23 +702,23 @@ private[spark] object 
JsonProtocolSuite extends Assertions { assertBlocksEquals(metrics1.updatedBlockStatuses, metrics2.updatedBlockStatuses) } - private def assertEquals(metrics1: ShuffleReadMetrics, metrics2: ShuffleReadMetrics) { + private def assertEquals(metrics1: ShuffleReadMetrics, metrics2: ShuffleReadMetrics): Unit = { assert(metrics1.remoteBlocksFetched === metrics2.remoteBlocksFetched) assert(metrics1.localBlocksFetched === metrics2.localBlocksFetched) assert(metrics1.fetchWaitTime === metrics2.fetchWaitTime) assert(metrics1.remoteBytesRead === metrics2.remoteBytesRead) } - private def assertEquals(metrics1: ShuffleWriteMetrics, metrics2: ShuffleWriteMetrics) { + private def assertEquals(metrics1: ShuffleWriteMetrics, metrics2: ShuffleWriteMetrics): Unit = { assert(metrics1.bytesWritten === metrics2.bytesWritten) assert(metrics1.writeTime === metrics2.writeTime) } - private def assertEquals(metrics1: InputMetrics, metrics2: InputMetrics) { + private def assertEquals(metrics1: InputMetrics, metrics2: InputMetrics): Unit = { assert(metrics1.bytesRead === metrics2.bytesRead) } - private def assertEquals(result1: JobResult, result2: JobResult) { + private def assertEquals(result1: JobResult, result2: JobResult): Unit = { (result1, result2) match { case (JobSucceeded, JobSucceeded) => case (r1: JobFailed, r2: JobFailed) => @@ -725,13 +727,14 @@ private[spark] object JsonProtocolSuite extends Assertions { } } - private def assertEquals(reason1: TaskEndReason, reason2: TaskEndReason) { + private def assertEquals(reason1: TaskEndReason, reason2: TaskEndReason): Unit = { (reason1, reason2) match { case (Success, Success) => case (Resubmitted, Resubmitted) => case (r1: FetchFailed, r2: FetchFailed) => assert(r1.shuffleId === r2.shuffleId) assert(r1.mapId === r2.mapId) + assert(r1.mapIndex === r2.mapIndex) assert(r1.reduceId === r2.reduceId) assert(r1.bmAddress === r2.bmAddress) assert(r1.message === r2.message) @@ -761,7 +764,7 @@ private[spark] object JsonProtocolSuite extends 
Assertions { private def assertEquals( details1: Map[String, Seq[(String, String)]], - details2: Map[String, Seq[(String, String)]]) { + details2: Map[String, Seq[(String, String)]]): Unit = { details1.zip(details2).foreach { case ((key1, values1: Seq[(String, String)]), (key2, values2: Seq[(String, String)])) => assert(key1 === key2) @@ -769,7 +772,7 @@ private[spark] object JsonProtocolSuite extends Assertions { } } - private def assertEquals(exception1: Exception, exception2: Exception) { + private def assertEquals(exception1: Exception, exception2: Exception): Unit = { assert(exception1.getMessage === exception2.getMessage) assertSeqEquals( exception1.getStackTrace, @@ -783,7 +786,7 @@ private[spark] object JsonProtocolSuite extends Assertions { } } - private def assertJsonStringEquals(expected: String, actual: String, metadata: String) { + private def assertJsonStringEquals(expected: String, actual: String, metadata: String): Unit = { val expectedJson = parse(expected) val actualJson = parse(actual) if (expectedJson != actualJson) { @@ -796,7 +799,7 @@ private[spark] object JsonProtocolSuite extends Assertions { } } - private def assertSeqEquals[T](seq1: Seq[T], seq2: Seq[T], assertEquals: (T, T) => Unit) { + private def assertSeqEquals[T](seq1: Seq[T], seq2: Seq[T], assertEquals: (T, T) => Unit): Unit = { assert(seq1.length === seq2.length) seq1.zip(seq2).foreach { case (t1, t2) => assertEquals(t1, t2) @@ -806,7 +809,7 @@ private[spark] object JsonProtocolSuite extends Assertions { private def assertOptionEquals[T]( opt1: Option[T], opt2: Option[T], - assertEquals: (T, T) => Unit) { + assertEquals: (T, T) => Unit): Unit = { if (opt1.isDefined) { assert(opt2.isDefined) assertEquals(opt1.get, opt2.get) @@ -825,11 +828,12 @@ private[spark] object JsonProtocolSuite extends Assertions { assertSeqEquals(blocks1, blocks2, assertBlockEquals) } - private def assertBlockEquals(b1: (BlockId, BlockStatus), b2: (BlockId, BlockStatus)) { + private def assertBlockEquals(b1: 
(BlockId, BlockStatus), b2: (BlockId, BlockStatus)): Unit = { assert(b1 === b2) } - private def assertStackTraceElementEquals(ste1: StackTraceElement, ste2: StackTraceElement) { + private def assertStackTraceElementEquals(ste1: StackTraceElement, + ste2: StackTraceElement): Unit = { // This mimics the equals() method from Java 8 and earlier. Java 9 adds checks for // class loader and module, which will cause them to be not equal, when we don't // care about those @@ -871,7 +875,8 @@ private[spark] object JsonProtocolSuite extends Assertions { private def makeStageInfo(a: Int, b: Int, c: Int, d: Long, e: Long) = { val rddInfos = (0 until a % 5).map { i => makeRddInfo(a + i, b + i, c + i, d + i, e + i) } - val stageInfo = new StageInfo(a, 0, "greetings", b, rddInfos, Seq(100, 200, 300), "details") + val stageInfo = new StageInfo(a, 0, "greetings", b, rddInfos, Seq(100, 200, 300), "details", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) val (acc1, acc2) = (makeAccumulableInfo(1), makeAccumulableInfo(2)) stageInfo.accumulables(acc1.id) = acc1 stageInfo.accumulables(acc2.id) = acc2 @@ -936,6 +941,7 @@ private[spark] object JsonProtocolSuite extends Assertions { t.setExecutorDeserializeCpuTime(a) t.setExecutorRunTime(b) t.setExecutorCpuTime(b) + t.setPeakExecutionMemory(c) t.setResultSize(c) t.setJvmGCTime(d) t.setResultSerializationTime(a + b) @@ -1241,6 +1247,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Executor Deserialize CPU Time": 300, | "Executor Run Time": 400, | "Executor CPU Time": 400, + | "Peak Execution Memory": 500, | "Result Size": 500, | "JVM GC Time": 600, | "Result Serialization Time": 700, @@ -1364,6 +1371,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | "Executor Deserialize CPU Time": 300, | "Executor Run Time": 400, | "Executor CPU Time": 400, + | "Peak Execution Memory": 500, | "Result Size": 500, | "JVM GC Time": 600, | "Result Serialization Time": 700, @@ -1487,6 +1495,7 @@ 
private[spark] object JsonProtocolSuite extends Assertions { | "Executor Deserialize CPU Time": 300, | "Executor Run Time": 400, | "Executor CPU Time": 400, + | "Peak Execution Memory": 500, | "Result Size": 500, | "JVM GC Time": 600, | "Result Serialization Time": 700, @@ -2050,7 +2059,7 @@ private[spark] object JsonProtocolSuite extends Assertions { | { | "ID": 9, | "Name": "$PEAK_EXECUTION_MEMORY", - | "Update": 0, + | "Update": 500, | "Internal": true, | "Count Failed Values": true | }, diff --git a/core/src/test/scala/org/apache/spark/util/KeyLockSuite.scala b/core/src/test/scala/org/apache/spark/util/KeyLockSuite.scala index 2169a0e4d442f..6888e492a8d33 100644 --- a/core/src/test/scala/org/apache/spark/util/KeyLockSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/KeyLockSuite.scala @@ -49,7 +49,7 @@ class KeyLockSuite extends SparkFunSuite with TimeLimits { @volatile var e: Throwable = null val threads = (0 until numThreads).map { i => new Thread() { - override def run(): Unit = try { + override def run(): Unit = { latch.await(foreverMs, TimeUnit.MILLISECONDS) keyLock.withLock(keys(i)) { var cur = numThreadsHoldingLock.get() diff --git a/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala b/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala index 4b7164d8acbce..1efd399b5db68 100644 --- a/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/NextIteratorSuite.scala @@ -81,7 +81,7 @@ class NextIteratorSuite extends SparkFunSuite with Matchers { } } - override def close() { + override def close(): Unit = { closeCalled += 1 } } diff --git a/core/src/test/scala/org/apache/spark/util/PeriodicRDDCheckpointerSuite.scala b/core/src/test/scala/org/apache/spark/util/PeriodicRDDCheckpointerSuite.scala index 06c2ceb68bd79..f14ec175232be 100644 --- a/core/src/test/scala/org/apache/spark/util/PeriodicRDDCheckpointerSuite.scala +++ 
b/core/src/test/scala/org/apache/spark/util/PeriodicRDDCheckpointerSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.util import org.apache.hadoop.fs.Path +import org.scalatest.Assertions._ import org.apache.spark.{SharedSparkContext, SparkContext, SparkFunSuite} import org.apache.spark.rdd.RDD diff --git a/core/src/test/scala/org/apache/spark/util/PropertiesCloneBenchmark.scala b/core/src/test/scala/org/apache/spark/util/PropertiesCloneBenchmark.scala new file mode 100644 index 0000000000000..baacc7527a806 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/PropertiesCloneBenchmark.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util + +import java.util.Properties + +import scala.util.Random + +import org.apache.commons.lang3.SerializationUtils + +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} + + +/** + * Benchmark for Kryo Unsafe vs safe Serialization. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "core/test:runMain " + * 3. 
generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "core/test:runMain " + * Results will be written to "benchmarks/PropertiesCloneBenchmark-results.txt". + * }}} + */ +object PropertiesCloneBenchmark extends BenchmarkBase { + /** + * Benchmark various cases of cloning properties objects + */ + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + runBenchmark("Properties Cloning") { + def compareSerialization(name: String, props: Properties): Unit = { + val benchmark = new Benchmark(name, 1, output = output) + benchmark.addCase("SerializationUtils.clone") { _ => + SerializationUtils.clone(props) + } + benchmark.addCase("Utils.cloneProperties") { _ => + Utils.cloneProperties(props) + } + benchmark.run() + } + compareSerialization("Empty Properties", new Properties) + compareSerialization("System Properties", System.getProperties) + compareSerialization("Small Properties", makeRandomProps(10, 40, 100)) + compareSerialization("Medium Properties", makeRandomProps(50, 40, 100)) + compareSerialization("Large Properties", makeRandomProps(100, 40, 100)) + } + } + + def makeRandomProps(numProperties: Int, keySize: Int, valueSize: Int): Properties = { + val props = new Properties + for (_ <- 1 to numProperties) { + props.put( + Random.alphanumeric.take(keySize), + Random.alphanumeric.take(valueSize) + ) + } + props + } +} diff --git a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala index 75e4504850679..0b1796540abbb 100644 --- a/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala +++ b/core/src/test/scala/org/apache/spark/util/ResetSystemProperties.scala @@ -19,7 +19,6 @@ package org.apache.spark.util import java.util.Properties -import org.apache.commons.lang3.SerializationUtils import org.scalatest.{BeforeAndAfterEach, Suite} /** @@ -43,11 +42,11 @@ private[spark] trait ResetSystemProperties extends BeforeAndAfterEach { this: Su var 
oldProperties: Properties = null override def beforeEach(): Unit = { - // we need SerializationUtils.clone instead of `new Properties(System.getProperties())` because + // we need Utils.cloneProperties instead of `new Properties(System.getProperties())` because // the later way of creating a copy does not copy the properties but it initializes a new // Properties object with the given properties as defaults. They are not recognized at all // by standard Scala wrapper over Java Properties then. - oldProperties = SerializationUtils.clone(System.getProperties) + oldProperties = Utils.cloneProperties(System.getProperties) super.beforeEach() } diff --git a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala index 8bc62db81e4f9..d4f2053e0b2f4 100644 --- a/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/SizeEstimatorSuite.scala @@ -73,7 +73,7 @@ class SizeEstimatorSuite with PrivateMethodTester with ResetSystemProperties { - override def beforeEach() { + override def beforeEach(): Unit = { // Set the arch to 64-bit and compressedOops to true to get a deterministic test-case super.beforeEach() System.setProperty("os.arch", "amd64") @@ -180,7 +180,7 @@ class SizeEstimatorSuite test("32-bit arch") { System.setProperty("os.arch", "x86") - val initialize = PrivateMethod[Unit]('initialize) + val initialize = PrivateMethod[Unit](Symbol("initialize")) SizeEstimator invokePrivate initialize() assertResult(40)(SizeEstimator.estimate(DummyString(""))) @@ -194,7 +194,7 @@ class SizeEstimatorSuite test("64-bit arch with no compressed oops") { System.setProperty("os.arch", "amd64") System.setProperty(TEST_USE_COMPRESSED_OOPS_KEY, "false") - val initialize = PrivateMethod[Unit]('initialize) + val initialize = PrivateMethod[Unit](Symbol("initialize")) SizeEstimator invokePrivate initialize() 
assertResult(56)(SizeEstimator.estimate(DummyString(""))) @@ -220,7 +220,7 @@ class SizeEstimatorSuite test("check 64-bit detection for s390x arch") { System.setProperty("os.arch", "s390x") - val initialize = PrivateMethod[Unit]('initialize) + val initialize = PrivateMethod[Unit](Symbol("initialize")) SizeEstimator invokePrivate initialize() // Class should be 32 bytes on s390x if recognised as 64 bit platform assertResult(32)(SizeEstimator.estimate(new DummyClass7)) diff --git a/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala b/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala new file mode 100644 index 0000000000000..90741a6bde7f0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/util/SparkUncaughtExceptionHandlerSuite.scala @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.util + +import java.io.File + +import scala.util.Try + +import org.apache.spark.SparkFunSuite + +class SparkUncaughtExceptionHandlerSuite extends SparkFunSuite { + + private val sparkHome = + sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) + + Seq( + (ThrowableTypes.RuntimeException, true, SparkExitCode.UNCAUGHT_EXCEPTION), + (ThrowableTypes.RuntimeException, false, 0), + (ThrowableTypes.OutOfMemoryError, true, SparkExitCode.OOM), + (ThrowableTypes.OutOfMemoryError, false, SparkExitCode.OOM), + (ThrowableTypes.SparkFatalRuntimeException, true, SparkExitCode.UNCAUGHT_EXCEPTION), + (ThrowableTypes.SparkFatalRuntimeException, false, 0), + (ThrowableTypes.SparkFatalOutOfMemoryError, true, SparkExitCode.OOM), + (ThrowableTypes.SparkFatalOutOfMemoryError, false, SparkExitCode.OOM) + ).foreach { + case (throwable: ThrowableTypes.ThrowableTypesVal, + exitOnUncaughtException: Boolean, expectedExitCode) => + test(s"SPARK-30310: Test uncaught $throwable, " + + s"exitOnUncaughtException = $exitOnUncaughtException") { + + // creates a ThrowableThrower process via spark-class and verify the exit code + val process = Utils.executeCommand( + Seq(s"$sparkHome/bin/spark-class", + ThrowableThrower.getClass.getCanonicalName.dropRight(1), // drops the "$" at the end + throwable.name, + exitOnUncaughtException.toString), + new File(sparkHome), + Map("SPARK_TESTING" -> "1", "SPARK_HOME" -> sparkHome) + ) + assert(process.waitFor == expectedExitCode) + } + } +} + +// enumeration object for the Throwable types that SparkUncaughtExceptionHandler handles +object ThrowableTypes extends Enumeration { + + sealed case class ThrowableTypesVal(name: String, t: Throwable) extends Val(name) + + val RuntimeException = ThrowableTypesVal("RuntimeException", new RuntimeException) + val OutOfMemoryError = ThrowableTypesVal("OutOfMemoryError", new OutOfMemoryError) + val SparkFatalRuntimeException = 
ThrowableTypesVal("SparkFatalException(RuntimeException)", + new SparkFatalException(new RuntimeException)) + val SparkFatalOutOfMemoryError = ThrowableTypesVal("SparkFatalException(OutOfMemoryError)", + new SparkFatalException(new OutOfMemoryError)) + + // returns the actual Throwable by its name + def getThrowableByName(name: String): Throwable = { + super.withName(name).asInstanceOf[ThrowableTypesVal].t + } +} + +// Invoked by spark-class for throwing a Throwable +object ThrowableThrower { + + // a thread that uses SparkUncaughtExceptionHandler and throws a Throwable by name + class ThrowerThread(name: String, exitOnUncaughtException: Boolean) extends Thread { + override def run() { + Thread.setDefaultUncaughtExceptionHandler( + new SparkUncaughtExceptionHandler(exitOnUncaughtException)) + throw ThrowableTypes.getThrowableByName(name) + } + } + + // main() requires 2 args: + // - args(0): name of the Throwable defined in ThrowableTypes + // - args(1): exitOnUncaughtException (true/false) + // + // it exits with the exit code dictated by either: + // - SparkUncaughtExceptionHandler (SparkExitCode) + // - main() (0, or -1 when number of args is wrong) + def main(args: Array[String]): Unit = { + if (args.length == 2) { + val t = new ThrowerThread(args(0), + Try(args(1).toBoolean).getOrElse(false)) + t.start() + t.join() + System.exit(0) + } else { + System.exit(-1) + } + } +} diff --git a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala index aa3f062e582c3..ac36e537c75bb 100644 --- a/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala @@ -132,7 +132,7 @@ class ThreadUtilsSuite extends SparkFunSuite { val t = new Thread() { setDaemon(true) - override def run() { + override def run(): Unit = { try { // "par" is uninterruptible. The following will keep running even if the thread is // interrupted. 
We should prefer to use "ThreadUtils.parmap". diff --git a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala index 77a92e7e1eb43..1644540946839 100644 --- a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala @@ -63,7 +63,7 @@ class TimeStampedHashMapSuite extends SparkFunSuite { } /** Test basic operations of a Scala mutable Map. */ - def testMap(hashMapConstructor: => mutable.Map[String, String]) { + def testMap(hashMapConstructor: => mutable.Map[String, String]): Unit = { def newMap() = hashMapConstructor val testMap1 = newMap() val testMap2 = newMap() @@ -134,7 +134,7 @@ class TimeStampedHashMapSuite extends SparkFunSuite { } /** Test thread safety of a Scala mutable map. */ - def testMapThreadSafety(hashMapConstructor: => mutable.Map[String, String]) { + def testMapThreadSafety(hashMapConstructor: => mutable.Map[String, String]): Unit = { def newMap() = hashMapConstructor val name = newMap().getClass.getSimpleName val testMap = newMap() @@ -150,7 +150,7 @@ class TimeStampedHashMapSuite extends SparkFunSuite { } val threads = (1 to 25).map(i => new Thread() { - override def run() { + override def run(): Unit = { try { for (j <- 1 to 1000) { Random.nextInt(3) match { diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 7e4a40b60aac5..8f8902e497d49 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -849,36 +849,6 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assert(buffer.toString === "st circular test circular") } - test("nanSafeCompareDoubles") { - def shouldMatchDefaultOrder(a: Double, b: Double): Unit = { - assert(Utils.nanSafeCompareDoubles(a, b) === 
JDouble.compare(a, b)) - assert(Utils.nanSafeCompareDoubles(b, a) === JDouble.compare(b, a)) - } - shouldMatchDefaultOrder(0d, 0d) - shouldMatchDefaultOrder(0d, 1d) - shouldMatchDefaultOrder(Double.MinValue, Double.MaxValue) - assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.NaN) === 0) - assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.PositiveInfinity) === 1) - assert(Utils.nanSafeCompareDoubles(Double.NaN, Double.NegativeInfinity) === 1) - assert(Utils.nanSafeCompareDoubles(Double.PositiveInfinity, Double.NaN) === -1) - assert(Utils.nanSafeCompareDoubles(Double.NegativeInfinity, Double.NaN) === -1) - } - - test("nanSafeCompareFloats") { - def shouldMatchDefaultOrder(a: Float, b: Float): Unit = { - assert(Utils.nanSafeCompareFloats(a, b) === JFloat.compare(a, b)) - assert(Utils.nanSafeCompareFloats(b, a) === JFloat.compare(b, a)) - } - shouldMatchDefaultOrder(0f, 0f) - shouldMatchDefaultOrder(1f, 1f) - shouldMatchDefaultOrder(Float.MinValue, Float.MaxValue) - assert(Utils.nanSafeCompareFloats(Float.NaN, Float.NaN) === 0) - assert(Utils.nanSafeCompareFloats(Float.NaN, Float.PositiveInfinity) === 1) - assert(Utils.nanSafeCompareFloats(Float.NaN, Float.NegativeInfinity) === 1) - assert(Utils.nanSafeCompareFloats(Float.PositiveInfinity, Float.NaN) === -1) - assert(Utils.nanSafeCompareFloats(Float.NegativeInfinity, Float.NaN) === -1) - } - test("isDynamicAllocationEnabled") { val conf = new SparkConf() conf.set("spark.master", "yarn") diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 2b5993a352cb0..0b4e1494bf300 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -436,7 +436,7 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite val it = map.iterator 
assert(it.isInstanceOf[CompletionIterator[_, _]]) // org.apache.spark.util.collection.AppendOnlyMap.destructiveSortedIterator returns - // an instance of an annonymous Iterator class. + // an instance of an anonymous Iterator class. val underlyingMapRef = WeakReference(map.currentMap) diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index 2bad56d7ff424..a6de64b6c68a0 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -294,7 +294,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { | Helper methods that contain the test body | * =========================================== */ - private def emptyDataStream(conf: SparkConf) { + private def emptyDataStream(conf: SparkConf): Unit = { conf.set(SHUFFLE_MANAGER, "sort") sc = new SparkContext("local", "test", conf) val context = MemoryTestingUtils.fakeTaskContext(sc.env) @@ -327,7 +327,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { sorter4.stop() } - private def fewElementsPerPartition(conf: SparkConf) { + private def fewElementsPerPartition(conf: SparkConf): Unit = { conf.set(SHUFFLE_MANAGER, "sort") sc = new SparkContext("local", "test", conf) val context = MemoryTestingUtils.fakeTaskContext(sc.env) @@ -368,7 +368,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { sorter4.stop() } - private def emptyPartitionsWithSpilling(conf: SparkConf) { + private def emptyPartitionsWithSpilling(conf: SparkConf): Unit = { val size = 1000 conf.set(SHUFFLE_MANAGER, "sort") conf.set(SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD, size / 2) @@ -393,7 +393,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { sorter.stop() } - private def testSpillingInLocalCluster(conf: SparkConf, 
numReduceTasks: Int) { + private def testSpillingInLocalCluster(conf: SparkConf, numReduceTasks: Int): Unit = { val size = 5000 conf.set(SHUFFLE_MANAGER, "sort") conf.set(SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD, size / 4) @@ -517,7 +517,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { conf: SparkConf, withPartialAgg: Boolean, withOrdering: Boolean, - withSpilling: Boolean) { + withSpilling: Boolean): Unit = { val size = 1000 if (withSpilling) { conf.set(SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD, size / 2) @@ -551,7 +551,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { assert(results === expected) } - private def sortWithoutBreakingSortingContracts(conf: SparkConf) { + private def sortWithoutBreakingSortingContracts(conf: SparkConf): Unit = { val size = 100000 val conf = createSparkConf(loadDefaults = true, kryo = false) conf.set(SHUFFLE_MANAGER, "sort") diff --git a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala index 4759a830da4ca..8aa4be6c2ff8d 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala @@ -71,7 +71,7 @@ class SizeTrackerSuite extends SparkFunSuite { testMap[String, Int](10000, i => (randString(0, 10000), i)) } - def testVector[T: ClassTag](numElements: Int, makeElement: Int => T) { + def testVector[T: ClassTag](numElements: Int, makeElement: Int => T): Unit = { val vector = new SizeTrackingVector[T] for (i <- 0 until numElements) { val item = makeElement(i) @@ -80,7 +80,7 @@ class SizeTrackerSuite extends SparkFunSuite { } } - def testMap[K, V](numElements: Int, makeElement: (Int) => (K, V)) { + def testMap[K, V](numElements: Int, makeElement: (Int) => (K, V)): Unit = { val map = new SizeTrackingAppendOnlyMap[K, V] for (i <- 0 until numElements) { val (k, v) 
= makeElement(i) @@ -89,7 +89,7 @@ class SizeTrackerSuite extends SparkFunSuite { } } - def expectWithinError(obj: AnyRef, estimatedSize: Long, error: Double) { + def expectWithinError(obj: AnyRef, estimatedSize: Long, error: Double): Unit = { val betterEstimatedSize = SizeEstimator.estimate(obj) assert(betterEstimatedSize * (1 - error) < estimatedSize, s"Estimated size $estimatedSize was less than expected size $betterEstimatedSize") diff --git a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala index e80bd96c982df..bb03f0d3cdc20 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala @@ -59,7 +59,7 @@ class SorterSuite extends SparkFunSuite with Logging { Arrays.sort(keys) new Sorter(new KVArraySortDataFormat[Double, Number]) - .sort(keyValueArray, 0, keys.length, Ordering.Double) + .sort(keyValueArray, 0, keys.length, (x, y) => java.lang.Double.compare(x, y)) keys.zipWithIndex.foreach { case (k, i) => assert(k === keyValueArray(2 * i)) @@ -311,12 +311,13 @@ abstract class AbstractIntArraySortDataFormat[K] extends SortDataFormat[K, Array data(pos1) = tmp } - override def copyElement(src: Array[Int], srcPos: Int, dst: Array[Int], dstPos: Int) { + override def copyElement(src: Array[Int], srcPos: Int, dst: Array[Int], dstPos: Int): Unit = { dst(dstPos) = src(srcPos) } /** Copy a range of elements starting at src(srcPos) to dest, starting at destPos. 
*/ - override def copyRange(src: Array[Int], srcPos: Int, dst: Array[Int], dstPos: Int, length: Int) { + override def copyRange(src: Array[Int], srcPos: Int, + dst: Array[Int], dstPos: Int, length: Int): Unit = { System.arraycopy(src, srcPos, dst, dstPos, length) } @@ -334,13 +335,13 @@ abstract class AbstractByteArraySortDataFormat[K] extends SortDataFormat[K, Arra data(pos1) = tmp } - override def copyElement(src: Array[Byte], srcPos: Int, dst: Array[Byte], dstPos: Int) { + override def copyElement(src: Array[Byte], srcPos: Int, dst: Array[Byte], dstPos: Int): Unit = { dst(dstPos) = src(srcPos) } /** Copy a range of elements starting at src(srcPos) to dest, starting at destPos. */ override def copyRange(src: Array[Byte], - srcPos: Int, dst: Array[Byte], dstPos: Int, length: Int) { + srcPos: Int, dst: Array[Byte], dstPos: Int, length: Int): Unit = { System.arraycopy(src, srcPos, dst, dstPos, length) } diff --git a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala index 38cb37c524594..a55004f664a54 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/PrefixComparatorsSuite.scala @@ -20,12 +20,12 @@ package org.apache.spark.util.collection.unsafe.sort import java.nio.charset.StandardCharsets import com.google.common.primitives.UnsignedBytes -import org.scalatest.prop.PropertyChecks +import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks import org.apache.spark.SparkFunSuite import org.apache.spark.unsafe.types.UTF8String -class PrefixComparatorsSuite extends SparkFunSuite with PropertyChecks { +class PrefixComparatorsSuite extends SparkFunSuite with ScalaCheckPropertyChecks { test("String prefix comparator") { diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala index a3c006b43d8e4..9ae6a8ef879f3 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/unsafe/sort/RadixSortSuite.scala @@ -108,7 +108,8 @@ class RadixSortSuite extends SparkFunSuite with Logging { } } - private def referenceKeyPrefixSort(buf: LongArray, lo: Long, hi: Long, refCmp: PrefixComparator) { + private def referenceKeyPrefixSort(buf: LongArray, lo: Long, hi: Long, + refCmp: PrefixComparator): Unit = { val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) new Sorter(new UnsafeSortDataFormat(sortBuffer)).sort( buf, Ints.checkedCast(lo), Ints.checkedCast(hi), diff --git a/core/src/test/scala/org/apache/spark/util/logging/DriverLoggerSuite.scala b/core/src/test/scala/org/apache/spark/util/logging/DriverLoggerSuite.scala index 973f71cdeb755..bd7ec242a9317 100644 --- a/core/src/test/scala/org/apache/spark/util/logging/DriverLoggerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/logging/DriverLoggerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.util.logging -import java.io.{BufferedInputStream, File, FileInputStream} +import java.io.File import org.apache.commons.io.FileUtils diff --git a/dev/.rat-excludes b/dev/.rat-excludes index e12dc994b0842..73f461255de43 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -118,3 +118,4 @@ announce.tmpl vote.tmpl SessionManager.java SessionHandler.java +GangliaReporter.java diff --git a/dev/appveyor-install-dependencies.ps1 b/dev/appveyor-install-dependencies.ps1 index d33a107cc86a5..1658cca6050bc 100644 --- a/dev/appveyor-install-dependencies.ps1 +++ b/dev/appveyor-install-dependencies.ps1 @@ -81,7 +81,7 @@ if (!(Test-Path $tools)) { # ========================== Maven Push-Location 
$tools -$mavenVer = "3.6.2" +$mavenVer = "3.6.3" Start-FileDownload "https://archive.apache.org/dist/maven/maven-3/$mavenVer/binaries/apache-maven-$mavenVer-bin.zip" "maven.zip" # extract @@ -90,7 +90,7 @@ Invoke-Expression "7z.exe x maven.zip" # add maven to environment variables $env:PATH = "$tools\apache-maven-$mavenVer\bin;" + $env:PATH $env:M2_HOME = "$tools\apache-maven-$mavenVer" -$env:MAVEN_OPTS = "-Xmx2g -XX:ReservedCodeCacheSize=512m" +$env:MAVEN_OPTS = "-Xmx2g -XX:ReservedCodeCacheSize=1g" Pop-Location @@ -115,7 +115,7 @@ $env:Path += ";$env:HADOOP_HOME\bin" Pop-Location # ========================== R -$rVer = "3.6.1" +$rVer = "3.6.2" $rToolsVer = "3.5.1" InstallR diff --git a/dev/change-scala-version.sh b/dev/change-scala-version.sh index 4054d530d065e..06411b9b12a0d 100755 --- a/dev/change-scala-version.sh +++ b/dev/change-scala-version.sh @@ -19,7 +19,7 @@ set -e -VALID_VERSIONS=( 2.12 ) +VALID_VERSIONS=( 2.12 2.13 ) usage() { echo "Usage: $(basename $0) [-h|--help] diff --git a/dev/checkstyle-suppressions.xml b/dev/checkstyle-suppressions.xml index 945686de49967..804a178a5fe28 100644 --- a/dev/checkstyle-suppressions.xml +++ b/dev/checkstyle-suppressions.xml @@ -30,6 +30,8 @@ + + + + + + @@ -91,10 +96,6 @@ - - - - diff --git a/dev/create-release/do-release-docker.sh b/dev/create-release/do-release-docker.sh index c1a122ebfb12e..694a87bf78084 100755 --- a/dev/create-release/do-release-docker.sh +++ b/dev/create-release/do-release-docker.sh @@ -127,6 +127,7 @@ GPG_KEY=$GPG_KEY ASF_PASSWORD=$ASF_PASSWORD GPG_PASSPHRASE=$GPG_PASSPHRASE RELEASE_STEP=$RELEASE_STEP +USER=$USER EOF JAVA_VOL= @@ -135,9 +136,6 @@ if [ -n "$JAVA" ]; then JAVA_VOL="--volume $JAVA:/opt/spark-java" fi -# SPARK-24530: Sphinx must work with python 3 to generate doc correctly. 
-echo "SPHINXPYTHON=/opt/p35/bin/python" >> $ENVFILE - echo "Building $RELEASE_TAG; output will be at $WORKDIR/output" docker run -ti \ --env-file "$ENVFILE" \ diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index f35bc4f48652b..022d3af95c05d 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -138,7 +138,8 @@ fi # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="$BASE_PROFILES $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +# We use Apache Hive 2.3 for publishing +PUBLISH_PROFILES="$BASE_PROFILES $HIVE_PROFILES -Phive-2.3 -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases BASE_RELEASE_PROFILES="$BASE_PROFILES -Psparkr" @@ -164,7 +165,6 @@ DEST_DIR_NAME="$SPARK_PACKAGE_VERSION" git clean -d -f -x rm .gitignore -rm -rf .git cd .. if [[ "$1" == "package" ]]; then @@ -179,7 +179,7 @@ if [[ "$1" == "package" ]]; then rm -r spark-$SPARK_VERSION/licenses-binary fi - tar cvzf spark-$SPARK_VERSION.tgz spark-$SPARK_VERSION + tar cvzf spark-$SPARK_VERSION.tgz --exclude spark-$SPARK_VERSION/.git spark-$SPARK_VERSION echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --armour --output spark-$SPARK_VERSION.tgz.asc \ --detach-sig spark-$SPARK_VERSION.tgz echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ @@ -220,7 +220,7 @@ if [[ "$1" == "package" ]]; then # Write out the VERSION to PySpark version info we rewrite the - into a . and SNAPSHOT # to dev0 to be closer to PEP440. 
- PYSPARK_VERSION=`echo "$SPARK_VERSION" | sed -r "s/-/./" | sed -r "s/SNAPSHOT/dev0/"` + PYSPARK_VERSION=`echo "$SPARK_VERSION" | sed -e "s/-/./" -e "s/SNAPSHOT/dev0/" -e "s/preview/dev/"` echo "__version__='$PYSPARK_VERSION'" > python/pyspark/version.py # Get maven home set by MVN @@ -281,6 +281,9 @@ if [[ "$1" == "package" ]]; then BINARY_PKGS_ARGS["without-hadoop"]="-Phadoop-provided" if [[ $SPARK_VERSION < "3.0." ]]; then BINARY_PKGS_ARGS["hadoop2.6"]="-Phadoop-2.6 $HIVE_PROFILES" + else + BINARY_PKGS_ARGS["hadoop2.7-hive1.2"]="-Phadoop-2.7 -Phive-1.2 $HIVE_PROFILES" + BINARY_PKGS_ARGS["hadoop3.2"]="-Phadoop-3.2 $HIVE_PROFILES" fi fi @@ -413,13 +416,13 @@ if [[ "$1" == "publish-release" ]]; then # TODO: revisit for Scala 2.13 support - if ! is_dry_run && [[ $PUBLISH_SCALA_2_11 = 1 ]]; then + if [[ $PUBLISH_SCALA_2_11 = 1 ]]; then ./dev/change-scala-version.sh 2.11 $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests \ $SCALA_2_11_PROFILES $PUBLISH_PROFILES clean install fi - if ! 
is_dry_run && [[ $PUBLISH_SCALA_2_12 = 1 ]]; then + if [[ $PUBLISH_SCALA_2_12 = 1 ]]; then ./dev/change-scala-version.sh 2.12 $MVN -DzincPort=$((ZINC_PORT + 2)) -Dmaven.repo.local=$tmp_repo -DskipTests \ $SCALA_2_11_PROFILES $PUBLISH_PROFILES clean install diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index 8024440759eb5..39856a9955955 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -73,8 +73,12 @@ git config user.email $GIT_EMAIL # Create release version $MVN versions:set -DnewVersion=$RELEASE_VERSION | grep -v "no value" # silence logs -# Set the release version in R/pkg/DESCRIPTION -sed -i".tmp1" 's/Version.*$/Version: '"$RELEASE_VERSION"'/g' R/pkg/DESCRIPTION +if [[ $RELEASE_VERSION != *"preview"* ]]; then + # Set the release version in R/pkg/DESCRIPTION + sed -i".tmp1" 's/Version.*$/Version: '"$RELEASE_VERSION"'/g' R/pkg/DESCRIPTION +else + sed -i".tmp1" 's/-SNAPSHOT/'"-$(cut -d "-" -f 2 <<< $RELEASE_VERSION)"'/g' R/pkg/R/sparkR.R +fi # Set the release version in docs sed -i".tmp1" 's/SPARK_VERSION:.*$/SPARK_VERSION: '"$RELEASE_VERSION"'/g' docs/_config.yml sed -i".tmp2" 's/SPARK_VERSION_SHORT:.*$/SPARK_VERSION_SHORT: '"$RELEASE_VERSION"'/g' docs/_config.yml @@ -104,7 +108,11 @@ git commit -a -m "Preparing development version $NEXT_VERSION" if ! is_dry_run; then # Push changes git push origin $RELEASE_TAG - git push origin HEAD:$GIT_BRANCH + if [[ $RELEASE_VERSION != *"preview"* ]]; then + git push origin HEAD:$GIT_BRANCH + else + echo "It's preview release. We only push $RELEASE_TAG to remote." + fi cd .. 
rm -rf spark diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index 4bfecedbf0406..63451687ee8c2 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -20,7 +20,7 @@ # Includes: # * Java 8 # * Ivy -# * Python/PyPandoc (2.7.15/3.6.7) +# * Python (2.7.15/3.6.7) # * R-base/R-base-dev (3.6.1) # * Ruby 2.3 build utilities @@ -33,8 +33,8 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -ARG BASE_PIP_PKGS="setuptools wheel virtualenv" -ARG PIP_PKGS="pyopenssl pypandoc numpy pygments sphinx" +ARG BASE_PIP_PKGS="setuptools wheel" +ARG PIP_PKGS="pyopenssl numpy sphinx" # Install extra needed repos and refresh. # - CRAN repo @@ -62,14 +62,13 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ curl -sL https://deb.nodesource.com/setup_11.x | bash && \ $APT_INSTALL nodejs && \ # Install needed python packages. Use pip for installing packages (for consistency). - $APT_INSTALL libpython2.7-dev libpython3-dev python-pip python3-pip && \ - pip install $BASE_PIP_PKGS && \ - pip install $PIP_PKGS && \ - cd && \ - virtualenv -p python3 /opt/p35 && \ - . /opt/p35/bin/activate && \ - pip install $BASE_PIP_PKGS && \ - pip install $PIP_PKGS && \ + $APT_INSTALL libpython3-dev python3-pip && \ + # Change default python version to python3. + update-alternatives --install /usr/bin/python python /usr/bin/python2.7 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3.6 2 && \ + update-alternatives --set python /usr/bin/python3.6 && \ + pip3 install $BASE_PIP_PKGS && \ + pip3 install $PIP_PKGS && \ # Install R packages and dependencies used when building. # R depends on pandoc*, libssl (which are installed above). 
$APT_INSTALL r-base r-base-dev && \ @@ -79,8 +78,8 @@ RUN apt-get clean && apt-get update && $APT_INSTALL gnupg ca-certificates && \ # Install tools needed to build the documentation. $APT_INSTALL ruby2.3 ruby2.3-dev mkdocs && \ gem install jekyll --no-rdoc --no-ri -v 3.8.6 && \ - gem install jekyll-redirect-from && \ - gem install pygments.rb + gem install jekyll-redirect-from -v 0.15.0 && \ + gem install rouge WORKDIR /opt/spark-rm/output diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 deleted file mode 100644 index 775fb3c0a22e8..0000000000000 --- a/dev/deps/spark-deps-hadoop-2.7 +++ /dev/null @@ -1,202 +0,0 @@ -JavaEWAH-0.3.2.jar -RoaringBitmap-0.7.45.jar -ST4-4.0.4.jar -activation-1.1.1.jar -aircompressor-0.10.jar -antlr-2.7.7.jar -antlr-runtime-3.4.jar -antlr4-runtime-4.7.1.jar -aopalliance-1.0.jar -aopalliance-repackaged-2.5.0.jar -apache-log4j-extras-1.2.17.jar -apacheds-i18n-2.0.0-M15.jar -apacheds-kerberos-codec-2.0.0-M15.jar -api-asn1-api-1.0.0-M20.jar -api-util-1.0.0-M20.jar -arpack_combined_all-0.1.jar -arrow-format-0.12.0.jar -arrow-memory-0.12.0.jar -arrow-vector-0.12.0.jar -automaton-1.11-8.jar -avro-1.8.2.jar -avro-ipc-1.8.2.jar -avro-mapred-1.8.2-hadoop2.jar -bonecp-0.8.0.RELEASE.jar -breeze-macros_2.12-0.13.2.jar -breeze_2.12-0.13.2.jar -chill-java-0.9.3.jar -chill_2.12-0.9.3.jar -commons-beanutils-1.9.3.jar -commons-cli-1.2.jar -commons-codec-1.10.jar -commons-collections-3.2.2.jar -commons-compiler-3.0.15.jar -commons-compress-1.8.1.jar -commons-configuration-1.6.jar -commons-crypto-1.0.0.jar -commons-dbcp-1.4.jar -commons-digester-1.8.jar -commons-httpclient-3.1.jar -commons-io-2.4.jar -commons-lang-2.6.jar -commons-lang3-3.8.1.jar -commons-logging-1.1.3.jar -commons-math3-3.4.1.jar -commons-net-3.1.jar -commons-pool-1.5.4.jar -commons-text-1.6.jar -compress-lzf-1.0.3.jar -core-1.1.2.jar -curator-client-2.7.1.jar -curator-framework-2.7.1.jar -curator-recipes-2.7.1.jar -datanucleus-api-jdo-3.2.6.jar 
-datanucleus-core-3.2.10.jar -datanucleus-rdbms-3.2.9.jar -derby-10.12.1.1.jar -flatbuffers-java-1.9.0.jar -generex-1.0.2.jar -gson-2.2.4.jar -guava-14.0.1.jar -guice-3.0.jar -guice-servlet-3.0.jar -hadoop-annotations-2.7.4.jar -hadoop-auth-2.7.4.jar -hadoop-client-2.7.4.jar -hadoop-common-2.7.4.jar -hadoop-hdfs-2.7.4.jar -hadoop-mapreduce-client-app-2.7.4.jar -hadoop-mapreduce-client-common-2.7.4.jar -hadoop-mapreduce-client-core-2.7.4.jar -hadoop-mapreduce-client-jobclient-2.7.4.jar -hadoop-mapreduce-client-shuffle-2.7.4.jar -hadoop-yarn-api-2.7.4.jar -hadoop-yarn-client-2.7.4.jar -hadoop-yarn-common-2.7.4.jar -hadoop-yarn-server-common-2.7.4.jar -hadoop-yarn-server-web-proxy-2.7.4.jar -hk2-api-2.5.0.jar -hk2-locator-2.5.0.jar -hk2-utils-2.5.0.jar -hppc-0.7.2.jar -htrace-core-3.1.0-incubating.jar -httpclient-4.5.6.jar -httpcore-4.4.10.jar -istack-commons-runtime-3.0.8.jar -ivy-2.4.0.jar -jackson-annotations-2.9.9.jar -jackson-core-2.9.9.jar -jackson-core-asl-1.9.13.jar -jackson-databind-2.9.9.3.jar -jackson-dataformat-yaml-2.9.9.jar -jackson-jaxrs-1.9.13.jar -jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations-2.9.9.jar -jackson-module-paranamer-2.9.9.jar -jackson-module-scala_2.12-2.9.9.jar -jackson-xc-1.9.13.jar -jakarta.annotation-api-1.3.4.jar -jakarta.inject-2.5.0.jar -jakarta.ws.rs-api-2.1.5.jar -jakarta.xml.bind-api-2.3.2.jar -janino-3.0.15.jar -javassist-3.22.0-CR2.jar -javax.el-3.0.1-b11.jar -javax.inject-1.jar -javax.servlet-api-3.1.0.jar -javolution-5.5.1.jar -jaxb-api-2.2.2.jar -jaxb-runtime-2.3.2.jar -jcl-over-slf4j-1.7.16.jar -jdo-api-3.0.1.jar -jersey-client-2.29.jar -jersey-common-2.29.jar -jersey-container-servlet-2.29.jar -jersey-container-servlet-core-2.29.jar -jersey-hk2-2.29.jar -jersey-media-jaxb-2.29.jar -jersey-server-2.29.jar -jetty-6.1.26.jar -jetty-sslengine-6.1.26.jar -jetty-util-6.1.26.jar -jline-2.14.6.jar -joda-time-2.9.3.jar -jodd-core-3.5.2.jar -jpam-1.1.jar -json4s-ast_2.12-3.6.6.jar -json4s-core_2.12-3.6.6.jar 
-json4s-jackson_2.12-3.6.6.jar -json4s-scalap_2.12-3.6.6.jar -jsp-api-2.1.jar -jsr305-3.0.0.jar -jta-1.1.jar -jtransforms-2.4.0.jar -jul-to-slf4j-1.7.16.jar -kryo-shaded-4.0.2.jar -kubernetes-client-4.4.2.jar -kubernetes-model-4.4.2.jar -kubernetes-model-common-4.4.2.jar -leveldbjni-all-1.8.jar -libfb303-0.9.3.jar -libthrift-0.12.0.jar -log4j-1.2.17.jar -logging-interceptor-3.12.0.jar -lz4-java-1.6.0.jar -machinist_2.12-0.6.1.jar -macro-compat_2.12-1.1.1.jar -mesos-1.4.0-shaded-protobuf.jar -metrics-core-3.1.5.jar -metrics-graphite-3.1.5.jar -metrics-json-3.1.5.jar -metrics-jvm-3.1.5.jar -minlog-1.3.0.jar -netty-all-4.1.30.Final.jar -objenesis-2.5.1.jar -okapi-shade-0.4.2.jar -okhttp-3.8.1.jar -okio-1.13.0.jar -opencsv-2.3.jar -orc-core-1.5.5-nohive.jar -orc-mapreduce-1.5.5-nohive.jar -orc-shims-1.5.5.jar -oro-2.0.8.jar -osgi-resource-locator-1.0.3.jar -paranamer-2.8.jar -parquet-column-1.10.1.jar -parquet-common-1.10.1.jar -parquet-encoding-1.10.1.jar -parquet-format-2.4.0.jar -parquet-hadoop-1.10.1.jar -parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.10.1.jar -protobuf-java-2.5.0.jar -py4j-0.10.8.1.jar -pyrolite-4.30.jar -scala-compiler-2.12.8.jar -scala-library-2.12.8.jar -scala-parser-combinators_2.12-1.1.0.jar -scala-reflect-2.12.8.jar -scala-xml_2.12-1.2.0.jar -shapeless_2.12-2.3.2.jar -shims-0.7.45.jar -slf4j-api-1.7.16.jar -slf4j-log4j12-1.7.16.jar -snakeyaml-1.23.jar -snappy-0.2.jar -snappy-java-1.1.7.3.jar -spire-macros_2.12-0.13.0.jar -spire_2.12-0.13.0.jar -stax-api-1.0-2.jar -stax-api-1.0.1.jar -stream-2.9.6.jar -stringtemplate-3.2.1.jar -super-csv-2.2.0.jar -univocity-parsers-2.7.3.jar -validation-api-2.0.1.Final.jar -xbean-asm7-shaded-4.14.jar -xercesImpl-2.9.1.jar -xmlenc-0.52.jar -xz-1.5.jar -zjsonpatch-0.3.0.jar -zookeeper-3.4.6.jar -zstd-jni-1.4.2-1.jar diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-1.2 b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 new file mode 100644 index 0000000000000..534ac39e0c46e --- /dev/null +++ 
b/dev/deps/spark-deps-hadoop-2.7-hive-1.2 @@ -0,0 +1,209 @@ +JLargeArrays/1.5//JLargeArrays-1.5.jar +JTransforms/3.1//JTransforms-3.1.jar +JavaEWAH/0.3.2//JavaEWAH-0.3.2.jar +RoaringBitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/4.0.4//ST4-4.0.4.jar +activation/1.1.1//activation-1.1.1.jar +aircompressor/0.10//aircompressor-0.10.jar +algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar +antlr-runtime/3.4//antlr-runtime-3.4.jar +antlr/2.7.7//antlr-2.7.7.jar +antlr4-runtime/4.7.1//antlr4-runtime-4.7.1.jar +aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar +aopalliance/1.0//aopalliance-1.0.jar +apache-log4j-extras/1.2.17//apache-log4j-extras-1.2.17.jar +apacheds-i18n/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/1.0.0-M20//api-util-1.0.0-M20.jar +arpack_combined_all/0.1//arpack_combined_all-0.1.jar +arrow-format/0.15.1//arrow-format-0.15.1.jar +arrow-memory/0.15.1//arrow-memory-0.15.1.jar +arrow-vector/0.15.1//arrow-vector-0.15.1.jar +audience-annotations/0.5.0//audience-annotations-0.5.0.jar +automaton/1.11-8//automaton-1.11-8.jar +avro-ipc/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/1.8.2//avro-1.8.2.jar +bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +breeze-macros_2.12/1.0//breeze-macros_2.12-1.0.jar +breeze_2.12/1.0//breeze_2.12-1.0.jar +cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar +chill-java/0.9.5//chill-java-0.9.5.jar +chill_2.12/0.9.5//chill_2.12-0.9.5.jar +commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar +commons-cli/1.2//commons-cli-1.2.jar +commons-codec/1.10//commons-codec-1.10.jar +commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/3.0.15//commons-compiler-3.0.15.jar +commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/1.0.0//commons-crypto-1.0.0.jar 
+commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/2.4//commons-io-2.4.jar +commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/3.9//commons-lang3-3.9.jar +commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math3/3.4.1//commons-math3-3.4.1.jar +commons-net/3.1//commons-net-3.1.jar +commons-pool/1.5.4//commons-pool-1.5.4.jar +commons-text/1.6//commons-text-1.6.jar +compress-lzf/1.0.3//compress-lzf-1.0.3.jar +core/1.1.2//core-1.1.2.jar +curator-client/2.7.1//curator-client-2.7.1.jar +curator-framework/2.7.1//curator-framework-2.7.1.jar +curator-recipes/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/3.2.6//datanucleus-api-jdo-3.2.6.jar +datanucleus-core/3.2.10//datanucleus-core-3.2.10.jar +datanucleus-rdbms/3.2.9//datanucleus-rdbms-3.2.9.jar +derby/10.12.1.1//derby-10.12.1.1.jar +flatbuffers-java/1.9.0//flatbuffers-java-1.9.0.jar +generex/1.0.2//generex-1.0.2.jar +gson/2.2.4//gson-2.2.4.jar +guava/14.0.1//guava-14.0.1.jar +guice-servlet/3.0//guice-servlet-3.0.jar +guice/3.0//guice-3.0.jar +hadoop-annotations/2.7.4//hadoop-annotations-2.7.4.jar +hadoop-auth/2.7.4//hadoop-auth-2.7.4.jar +hadoop-client/2.7.4//hadoop-client-2.7.4.jar +hadoop-common/2.7.4//hadoop-common-2.7.4.jar +hadoop-hdfs/2.7.4//hadoop-hdfs-2.7.4.jar +hadoop-mapreduce-client-app/2.7.4//hadoop-mapreduce-client-app-2.7.4.jar +hadoop-mapreduce-client-common/2.7.4//hadoop-mapreduce-client-common-2.7.4.jar +hadoop-mapreduce-client-core/2.7.4//hadoop-mapreduce-client-core-2.7.4.jar +hadoop-mapreduce-client-jobclient/2.7.4//hadoop-mapreduce-client-jobclient-2.7.4.jar +hadoop-mapreduce-client-shuffle/2.7.4//hadoop-mapreduce-client-shuffle-2.7.4.jar +hadoop-yarn-api/2.7.4//hadoop-yarn-api-2.7.4.jar +hadoop-yarn-client/2.7.4//hadoop-yarn-client-2.7.4.jar +hadoop-yarn-common/2.7.4//hadoop-yarn-common-2.7.4.jar +hadoop-yarn-server-common/2.7.4//hadoop-yarn-server-common-2.7.4.jar 
+hadoop-yarn-server-web-proxy/2.7.4//hadoop-yarn-server-web-proxy-2.7.4.jar +hk2-api/2.6.1//hk2-api-2.6.1.jar +hk2-locator/2.6.1//hk2-locator-2.6.1.jar +hk2-utils/2.6.1//hk2-utils-2.6.1.jar +htrace-core/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/4.5.6//httpclient-4.5.6.jar +httpcore/4.4.12//httpcore-4.4.12.jar +istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar +ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/2.10.0//jackson-databind-2.10.0.jar +jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar +jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar +jackson-module-scala_2.12/2.10.0//jackson-module-scala_2.12-2.10.0.jar +jackson-xc/1.9.13//jackson-xc-1.9.13.jar +jakarta.activation-api/1.2.1//jakarta.activation-api-1.2.1.jar +jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar +jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar +jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar +jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar +jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar +janino/3.0.15//janino-3.0.15.jar +javassist/3.25.0-GA//javassist-3.25.0-GA.jar +javax.inject/1//javax.inject-1.jar +javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar +javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/2.2.2//jaxb-api-2.2.2.jar +jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar +jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jdo-api/3.0.1//jdo-api-3.0.1.jar +jersey-client/2.30//jersey-client-2.30.jar +jersey-common/2.30//jersey-common-2.30.jar +jersey-container-servlet-core/2.30//jersey-container-servlet-core-2.30.jar 
+jersey-container-servlet/2.30//jersey-container-servlet-2.30.jar +jersey-hk2/2.30//jersey-hk2-2.30.jar +jersey-media-jaxb/2.30//jersey-media-jaxb-2.30.jar +jersey-server/2.30//jersey-server-2.30.jar +jetty-sslengine/6.1.26//jetty-sslengine-6.1.26.jar +jetty-util/6.1.26//jetty-util-6.1.26.jar +jetty/6.1.26//jetty-6.1.26.jar +jline/2.14.6//jline-2.14.6.jar +joda-time/2.10.5//joda-time-2.10.5.jar +jodd-core/3.5.2//jodd-core-3.5.2.jar +jpam/1.1//jpam-1.1.jar +json4s-ast_2.12/3.6.6//json4s-ast_2.12-3.6.6.jar +json4s-core_2.12/3.6.6//json4s-core_2.12-3.6.6.jar +json4s-jackson_2.12/3.6.6//json4s-jackson_2.12-3.6.6.jar +json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar +jsp-api/2.1//jsp-api-2.1.jar +jsr305/3.0.0//jsr305-3.0.0.jar +jta/1.1//jta-1.1.jar +jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar +kubernetes-client/4.7.1//kubernetes-client-4.7.1.jar +kubernetes-model-common/4.7.1//kubernetes-model-common-4.7.1.jar +kubernetes-model/4.7.1//kubernetes-model-4.7.1.jar +leveldbjni-all/1.8//leveldbjni-all-1.8.jar +libfb303/0.9.3//libfb303-0.9.3.jar +libthrift/0.12.0//libthrift-0.12.0.jar +log4j/1.2.17//log4j-1.2.17.jar +logging-interceptor/3.12.6//logging-interceptor-3.12.6.jar +lz4-java/1.7.1//lz4-java-1.7.1.jar +machinist_2.12/0.6.8//machinist_2.12-0.6.8.jar +macro-compat_2.12/1.1.1//macro-compat_2.12-1.1.1.jar +mesos/1.4.0/shaded-protobuf/mesos-1.4.0-shaded-protobuf.jar +metrics-core/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/4.1.1//metrics-json-4.1.1.jar +metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar +minlog/1.3.0//minlog-1.3.0.jar +netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +objenesis/2.5.1//objenesis-2.5.1.jar +okhttp/3.12.6//okhttp-3.12.6.jar +okio/1.15.0//okio-1.15.0.jar +opencsv/2.3//opencsv-2.3.jar +orc-core/1.5.9/nohive/orc-core-1.5.9-nohive.jar +orc-mapreduce/1.5.9/nohive/orc-mapreduce-1.5.9-nohive.jar 
+orc-shims/1.5.9//orc-shims-1.5.9.jar +oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar +paranamer/2.8//paranamer-2.8.jar +parquet-column/1.10.1//parquet-column-1.10.1.jar +parquet-common/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop-bundle/1.6.0//parquet-hadoop-bundle-1.6.0.jar +parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/2.5.0//protobuf-java-2.5.0.jar +py4j/0.10.8.1//py4j-0.10.8.1.jar +pyrolite/4.30//pyrolite-4.30.jar +scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar +scala-compiler/2.12.10//scala-compiler-2.12.10.jar +scala-library/2.12.10//scala-library-2.12.10.jar +scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar +scala-reflect/2.12.10//scala-reflect-2.12.10.jar +scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar +shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar +shims/0.7.45//shims-0.7.45.jar +slf4j-api/1.7.16//slf4j-api-1.7.16.jar +slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +snakeyaml/1.24//snakeyaml-1.24.jar +snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +snappy/0.2//snappy-0.2.jar +spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar +spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar +spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar +spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar +stax-api/1.0-2//stax-api-1.0-2.jar +stax-api/1.0.1//stax-api-1.0.1.jar +stream/2.9.6//stream-2.9.6.jar +stringtemplate/3.2.1//stringtemplate-3.2.1.jar +super-csv/2.2.0//super-csv-2.2.0.jar +threeten-extra/1.5.0//threeten-extra-1.5.0.jar +univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar +xercesImpl/2.9.1//xercesImpl-2.9.1.jar +xmlenc/0.52//xmlenc-0.52.jar +xz/1.5//xz-1.5.jar +zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar 
+zookeeper/3.4.14//zookeeper-3.4.14.jar +zstd-jni/1.4.4-3//zstd-jni-1.4.4-3.jar diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 new file mode 100644 index 0000000000000..c50cf96dc9065 --- /dev/null +++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3 @@ -0,0 +1,223 @@ +HikariCP/2.5.1//HikariCP-2.5.1.jar +JLargeArrays/1.5//JLargeArrays-1.5.jar +JTransforms/3.1//JTransforms-3.1.jar +RoaringBitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/4.0.4//ST4-4.0.4.jar +activation/1.1.1//activation-1.1.1.jar +aircompressor/0.10//aircompressor-0.10.jar +algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar +antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar +antlr4-runtime/4.7.1//antlr4-runtime-4.7.1.jar +aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar +aopalliance/1.0//aopalliance-1.0.jar +apacheds-i18n/2.0.0-M15//apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec/2.0.0-M15//apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api/1.0.0-M20//api-asn1-api-1.0.0-M20.jar +api-util/1.0.0-M20//api-util-1.0.0-M20.jar +arpack_combined_all/0.1//arpack_combined_all-0.1.jar +arrow-format/0.15.1//arrow-format-0.15.1.jar +arrow-memory/0.15.1//arrow-memory-0.15.1.jar +arrow-vector/0.15.1//arrow-vector-0.15.1.jar +audience-annotations/0.5.0//audience-annotations-0.5.0.jar +automaton/1.11-8//automaton-1.11-8.jar +avro-ipc/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/1.8.2//avro-1.8.2.jar +bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +breeze-macros_2.12/1.0//breeze-macros_2.12-1.0.jar +breeze_2.12/1.0//breeze_2.12-1.0.jar +cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar +chill-java/0.9.5//chill-java-0.9.5.jar +chill_2.12/0.9.5//chill_2.12-0.9.5.jar +commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar +commons-cli/1.2//commons-cli-1.2.jar +commons-codec/1.10//commons-codec-1.10.jar +commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/3.0.15//commons-compiler-3.0.15.jar 
+commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-configuration/1.6//commons-configuration-1.6.jar +commons-crypto/1.0.0//commons-crypto-1.0.0.jar +commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-digester/1.8//commons-digester-1.8.jar +commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/2.4//commons-io-2.4.jar +commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/3.9//commons-lang3-3.9.jar +commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math3/3.4.1//commons-math3-3.4.1.jar +commons-net/3.1//commons-net-3.1.jar +commons-pool/1.5.4//commons-pool-1.5.4.jar +commons-text/1.6//commons-text-1.6.jar +compress-lzf/1.0.3//compress-lzf-1.0.3.jar +core/1.1.2//core-1.1.2.jar +curator-client/2.7.1//curator-client-2.7.1.jar +curator-framework/2.7.1//curator-framework-2.7.1.jar +curator-recipes/2.7.1//curator-recipes-2.7.1.jar +datanucleus-api-jdo/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/10.12.1.1//derby-10.12.1.1.jar +dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +flatbuffers-java/1.9.0//flatbuffers-java-1.9.0.jar +generex/1.0.2//generex-1.0.2.jar +gson/2.2.4//gson-2.2.4.jar +guava/14.0.1//guava-14.0.1.jar +guice-servlet/3.0//guice-servlet-3.0.jar +guice/3.0//guice-3.0.jar +hadoop-annotations/2.7.4//hadoop-annotations-2.7.4.jar +hadoop-auth/2.7.4//hadoop-auth-2.7.4.jar +hadoop-client/2.7.4//hadoop-client-2.7.4.jar +hadoop-common/2.7.4//hadoop-common-2.7.4.jar +hadoop-hdfs/2.7.4//hadoop-hdfs-2.7.4.jar +hadoop-mapreduce-client-app/2.7.4//hadoop-mapreduce-client-app-2.7.4.jar +hadoop-mapreduce-client-common/2.7.4//hadoop-mapreduce-client-common-2.7.4.jar +hadoop-mapreduce-client-core/2.7.4//hadoop-mapreduce-client-core-2.7.4.jar +hadoop-mapreduce-client-jobclient/2.7.4//hadoop-mapreduce-client-jobclient-2.7.4.jar 
+hadoop-mapreduce-client-shuffle/2.7.4//hadoop-mapreduce-client-shuffle-2.7.4.jar +hadoop-yarn-api/2.7.4//hadoop-yarn-api-2.7.4.jar +hadoop-yarn-client/2.7.4//hadoop-yarn-client-2.7.4.jar +hadoop-yarn-common/2.7.4//hadoop-yarn-common-2.7.4.jar +hadoop-yarn-server-common/2.7.4//hadoop-yarn-server-common-2.7.4.jar +hadoop-yarn-server-web-proxy/2.7.4//hadoop-yarn-server-web-proxy-2.7.4.jar +hive-beeline/2.3.6//hive-beeline-2.3.6.jar +hive-cli/2.3.6//hive-cli-2.3.6.jar +hive-common/2.3.6//hive-common-2.3.6.jar +hive-exec/2.3.6/core/hive-exec-2.3.6-core.jar +hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar +hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar +hive-metastore/2.3.6//hive-metastore-2.3.6.jar +hive-serde/2.3.6//hive-serde-2.3.6.jar +hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar +hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar +hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar +hive-shims/2.3.6//hive-shims-2.3.6.jar +hive-storage-api/2.7.1//hive-storage-api-2.7.1.jar +hive-vector-code-gen/2.3.6//hive-vector-code-gen-2.3.6.jar +hk2-api/2.6.1//hk2-api-2.6.1.jar +hk2-locator/2.6.1//hk2-locator-2.6.1.jar +hk2-utils/2.6.1//hk2-utils-2.6.1.jar +htrace-core/3.1.0-incubating//htrace-core-3.1.0-incubating.jar +httpclient/4.5.6//httpclient-4.5.6.jar +httpcore/4.4.12//httpcore-4.4.12.jar +istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar +ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/2.10.0//jackson-databind-2.10.0.jar +jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-jaxrs/1.9.13//jackson-jaxrs-1.9.13.jar +jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar +jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar 
+jackson-module-scala_2.12/2.10.0//jackson-module-scala_2.12-2.10.0.jar +jackson-xc/1.9.13//jackson-xc-1.9.13.jar +jakarta.activation-api/1.2.1//jakarta.activation-api-1.2.1.jar +jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar +jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar +jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar +jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar +jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar +janino/3.0.15//janino-3.0.15.jar +javassist/3.25.0-GA//javassist-3.25.0-GA.jar +javax.inject/1//javax.inject-1.jar +javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar +javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/2.2.2//jaxb-api-2.2.2.jar +jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar +jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jdo-api/3.0.1//jdo-api-3.0.1.jar +jersey-client/2.30//jersey-client-2.30.jar +jersey-common/2.30//jersey-common-2.30.jar +jersey-container-servlet-core/2.30//jersey-container-servlet-core-2.30.jar +jersey-container-servlet/2.30//jersey-container-servlet-2.30.jar +jersey-hk2/2.30//jersey-hk2-2.30.jar +jersey-media-jaxb/2.30//jersey-media-jaxb-2.30.jar +jersey-server/2.30//jersey-server-2.30.jar +jetty-sslengine/6.1.26//jetty-sslengine-6.1.26.jar +jetty-util/6.1.26//jetty-util-6.1.26.jar +jetty/6.1.26//jetty-6.1.26.jar +jline/2.14.6//jline-2.14.6.jar +joda-time/2.10.5//joda-time-2.10.5.jar +jodd-core/3.5.2//jodd-core-3.5.2.jar +jpam/1.1//jpam-1.1.jar +json/1.8//json-1.8.jar +json4s-ast_2.12/3.6.6//json4s-ast_2.12-3.6.6.jar +json4s-core_2.12/3.6.6//json4s-core_2.12-3.6.6.jar +json4s-jackson_2.12/3.6.6//json4s-jackson_2.12-3.6.6.jar +json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar +jsp-api/2.1//jsp-api-2.1.jar +jsr305/3.0.0//jsr305-3.0.0.jar +jta/1.1//jta-1.1.jar +jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar +kubernetes-client/4.7.1//kubernetes-client-4.7.1.jar 
+kubernetes-model-common/4.7.1//kubernetes-model-common-4.7.1.jar +kubernetes-model/4.7.1//kubernetes-model-4.7.1.jar +leveldbjni-all/1.8//leveldbjni-all-1.8.jar +libfb303/0.9.3//libfb303-0.9.3.jar +libthrift/0.12.0//libthrift-0.12.0.jar +log4j/1.2.17//log4j-1.2.17.jar +logging-interceptor/3.12.6//logging-interceptor-3.12.6.jar +lz4-java/1.7.1//lz4-java-1.7.1.jar +machinist_2.12/0.6.8//machinist_2.12-0.6.8.jar +macro-compat_2.12/1.1.1//macro-compat_2.12-1.1.1.jar +mesos/1.4.0/shaded-protobuf/mesos-1.4.0-shaded-protobuf.jar +metrics-core/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/4.1.1//metrics-json-4.1.1.jar +metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar +minlog/1.3.0//minlog-1.3.0.jar +netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +objenesis/2.5.1//objenesis-2.5.1.jar +okhttp/3.12.6//okhttp-3.12.6.jar +okio/1.15.0//okio-1.15.0.jar +opencsv/2.3//opencsv-2.3.jar +orc-core/1.5.9//orc-core-1.5.9.jar +orc-mapreduce/1.5.9//orc-mapreduce-1.5.9.jar +orc-shims/1.5.9//orc-shims-1.5.9.jar +oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar +paranamer/2.8//paranamer-2.8.jar +parquet-column/1.10.1//parquet-column-1.10.1.jar +parquet-common/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/2.5.0//protobuf-java-2.5.0.jar +py4j/0.10.8.1//py4j-0.10.8.1.jar +pyrolite/4.30//pyrolite-4.30.jar +scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar +scala-compiler/2.12.10//scala-compiler-2.12.10.jar +scala-library/2.12.10//scala-library-2.12.10.jar +scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar +scala-reflect/2.12.10//scala-reflect-2.12.10.jar +scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar 
+shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar +shims/0.7.45//shims-0.7.45.jar +slf4j-api/1.7.16//slf4j-api-1.7.16.jar +slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +snakeyaml/1.24//snakeyaml-1.24.jar +snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar +spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar +spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar +spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar +stax-api/1.0-2//stax-api-1.0-2.jar +stax-api/1.0.1//stax-api-1.0.1.jar +stream/2.9.6//stream-2.9.6.jar +super-csv/2.2.0//super-csv-2.2.0.jar +threeten-extra/1.5.0//threeten-extra-1.5.0.jar +transaction-api/1.1//transaction-api-1.1.jar +univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +velocity/1.5//velocity-1.5.jar +xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar +xercesImpl/2.9.1//xercesImpl-2.9.1.jar +xmlenc/0.52//xmlenc-0.52.jar +xz/1.5//xz-1.5.jar +zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar +zookeeper/3.4.14//zookeeper-3.4.14.jar +zstd-jni/1.4.4-3//zstd-jni-1.4.4-3.jar diff --git a/dev/deps/spark-deps-hadoop-3.2 b/dev/deps/spark-deps-hadoop-3.2 deleted file mode 100644 index de046634eefbb..0000000000000 --- a/dev/deps/spark-deps-hadoop-3.2 +++ /dev/null @@ -1,221 +0,0 @@ -JavaEWAH-0.3.2.jar -RoaringBitmap-0.7.45.jar -ST4-4.0.4.jar -accessors-smart-1.2.jar -activation-1.1.1.jar -aircompressor-0.10.jar -antlr-2.7.7.jar -antlr-runtime-3.4.jar -antlr4-runtime-4.7.1.jar -aopalliance-1.0.jar -aopalliance-repackaged-2.5.0.jar -apache-log4j-extras-1.2.17.jar -arpack_combined_all-0.1.jar -arrow-format-0.12.0.jar -arrow-memory-0.12.0.jar -arrow-vector-0.12.0.jar -audience-annotations-0.5.0.jar -automaton-1.11-8.jar -avro-1.8.2.jar -avro-ipc-1.8.2.jar -avro-mapred-1.8.2-hadoop2.jar -bonecp-0.8.0.RELEASE.jar -breeze-macros_2.12-0.13.2.jar -breeze_2.12-0.13.2.jar -chill-java-0.9.3.jar -chill_2.12-0.9.3.jar -commons-beanutils-1.9.3.jar -commons-cli-1.2.jar -commons-codec-1.10.jar 
-commons-collections-3.2.2.jar -commons-compiler-3.0.15.jar -commons-compress-1.8.1.jar -commons-configuration2-2.1.1.jar -commons-crypto-1.0.0.jar -commons-daemon-1.0.13.jar -commons-dbcp-1.4.jar -commons-httpclient-3.1.jar -commons-io-2.4.jar -commons-lang-2.6.jar -commons-lang3-3.8.1.jar -commons-logging-1.1.3.jar -commons-math3-3.4.1.jar -commons-net-3.1.jar -commons-pool-1.5.4.jar -commons-text-1.6.jar -compress-lzf-1.0.3.jar -core-1.1.2.jar -curator-client-2.13.0.jar -curator-framework-2.13.0.jar -curator-recipes-2.13.0.jar -datanucleus-api-jdo-3.2.6.jar -datanucleus-core-4.1.17.jar -datanucleus-rdbms-3.2.9.jar -derby-10.12.1.1.jar -dnsjava-2.1.7.jar -ehcache-3.3.1.jar -flatbuffers-java-1.9.0.jar -generex-1.0.2.jar -geronimo-jcache_1.0_spec-1.0-alpha-1.jar -gson-2.2.4.jar -guava-14.0.1.jar -guice-4.0.jar -guice-servlet-4.0.jar -hadoop-annotations-3.2.0.jar -hadoop-auth-3.2.0.jar -hadoop-client-3.2.0.jar -hadoop-common-3.2.0.jar -hadoop-hdfs-client-3.2.0.jar -hadoop-mapreduce-client-common-3.2.0.jar -hadoop-mapreduce-client-core-3.2.0.jar -hadoop-mapreduce-client-jobclient-3.2.0.jar -hadoop-yarn-api-3.2.0.jar -hadoop-yarn-client-3.2.0.jar -hadoop-yarn-common-3.2.0.jar -hadoop-yarn-registry-3.2.0.jar -hadoop-yarn-server-common-3.2.0.jar -hadoop-yarn-server-web-proxy-3.2.0.jar -hive-storage-api-2.6.0.jar -hk2-api-2.5.0.jar -hk2-locator-2.5.0.jar -hk2-utils-2.5.0.jar -hppc-0.7.2.jar -htrace-core4-4.1.0-incubating.jar -httpclient-4.5.6.jar -httpcore-4.4.10.jar -istack-commons-runtime-3.0.8.jar -ivy-2.4.0.jar -jackson-annotations-2.9.9.jar -jackson-core-2.9.9.jar -jackson-core-asl-1.9.13.jar -jackson-databind-2.9.9.3.jar -jackson-dataformat-yaml-2.9.9.jar -jackson-jaxrs-base-2.9.5.jar -jackson-jaxrs-json-provider-2.9.5.jar -jackson-mapper-asl-1.9.13.jar -jackson-module-jaxb-annotations-2.9.9.jar -jackson-module-paranamer-2.9.9.jar -jackson-module-scala_2.12-2.9.9.jar -jakarta.annotation-api-1.3.4.jar -jakarta.inject-2.5.0.jar -jakarta.ws.rs-api-2.1.5.jar 
-jakarta.xml.bind-api-2.3.2.jar -janino-3.0.15.jar -javassist-3.22.0-CR2.jar -javax.el-3.0.1-b11.jar -javax.inject-1.jar -javax.servlet-api-3.1.0.jar -javolution-5.5.1.jar -jaxb-api-2.2.11.jar -jaxb-runtime-2.3.2.jar -jcip-annotations-1.0-1.jar -jcl-over-slf4j-1.7.16.jar -jdo-api-3.0.1.jar -jersey-client-2.29.jar -jersey-common-2.29.jar -jersey-container-servlet-2.29.jar -jersey-container-servlet-core-2.29.jar -jersey-hk2-2.29.jar -jersey-media-jaxb-2.29.jar -jersey-server-2.29.jar -jetty-webapp-9.4.18.v20190429.jar -jetty-xml-9.4.18.v20190429.jar -jline-2.14.6.jar -joda-time-2.9.3.jar -jodd-core-3.5.2.jar -jpam-1.1.jar -json-smart-2.3.jar -json4s-ast_2.12-3.6.6.jar -json4s-core_2.12-3.6.6.jar -json4s-jackson_2.12-3.6.6.jar -json4s-scalap_2.12-3.6.6.jar -jsp-api-2.1.jar -jsr305-3.0.0.jar -jta-1.1.jar -jtransforms-2.4.0.jar -jul-to-slf4j-1.7.16.jar -kerb-admin-1.0.1.jar -kerb-client-1.0.1.jar -kerb-common-1.0.1.jar -kerb-core-1.0.1.jar -kerb-crypto-1.0.1.jar -kerb-identity-1.0.1.jar -kerb-server-1.0.1.jar -kerb-simplekdc-1.0.1.jar -kerb-util-1.0.1.jar -kerby-asn1-1.0.1.jar -kerby-config-1.0.1.jar -kerby-pkix-1.0.1.jar -kerby-util-1.0.1.jar -kerby-xdr-1.0.1.jar -kryo-shaded-4.0.2.jar -kubernetes-client-4.4.2.jar -kubernetes-model-4.4.2.jar -kubernetes-model-common-4.4.2.jar -leveldbjni-all-1.8.jar -libfb303-0.9.3.jar -libthrift-0.12.0.jar -log4j-1.2.17.jar -logging-interceptor-3.12.0.jar -lz4-java-1.6.0.jar -machinist_2.12-0.6.1.jar -macro-compat_2.12-1.1.1.jar -mesos-1.4.0-shaded-protobuf.jar -metrics-core-3.1.5.jar -metrics-graphite-3.1.5.jar -metrics-json-3.1.5.jar -metrics-jvm-3.1.5.jar -minlog-1.3.0.jar -mssql-jdbc-6.2.1.jre7.jar -netty-all-4.1.30.Final.jar -nimbus-jose-jwt-4.41.1.jar -objenesis-2.5.1.jar -okapi-shade-0.4.2.jar -okhttp-2.7.5.jar -okhttp-3.8.1.jar -okio-1.13.0.jar -opencsv-2.3.jar -orc-core-1.5.5-nohive.jar -orc-mapreduce-1.5.5-nohive.jar -orc-shims-1.5.5.jar -oro-2.0.8.jar -osgi-resource-locator-1.0.3.jar -paranamer-2.8.jar 
-parquet-column-1.10.1.jar -parquet-common-1.10.1.jar -parquet-encoding-1.10.1.jar -parquet-format-2.4.0.jar -parquet-hadoop-1.10.1.jar -parquet-jackson-1.10.1.jar -protobuf-java-2.5.0.jar -py4j-0.10.8.1.jar -pyrolite-4.30.jar -re2j-1.1.jar -scala-compiler-2.12.8.jar -scala-library-2.12.8.jar -scala-parser-combinators_2.12-1.1.0.jar -scala-reflect-2.12.8.jar -scala-xml_2.12-1.2.0.jar -shapeless_2.12-2.3.2.jar -shims-0.7.45.jar -slf4j-api-1.7.16.jar -slf4j-log4j12-1.7.16.jar -snakeyaml-1.23.jar -snappy-0.2.jar -snappy-java-1.1.7.3.jar -spire-macros_2.12-0.13.0.jar -spire_2.12-0.13.0.jar -stax-api-1.0.1.jar -stax2-api-3.1.4.jar -stream-2.9.6.jar -stringtemplate-3.2.1.jar -super-csv-2.2.0.jar -token-provider-1.0.1.jar -univocity-parsers-2.7.3.jar -validation-api-2.0.1.Final.jar -woodstox-core-5.0.3.jar -xbean-asm7-shaded-4.14.jar -xz-1.5.jar -zjsonpatch-0.3.0.jar -zookeeper-3.4.13.jar -zstd-jni-1.4.2-1.jar diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 new file mode 100644 index 0000000000000..c37ce7fab36f6 --- /dev/null +++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3 @@ -0,0 +1,239 @@ +HikariCP/2.5.1//HikariCP-2.5.1.jar +JLargeArrays/1.5//JLargeArrays-1.5.jar +JTransforms/3.1//JTransforms-3.1.jar +RoaringBitmap/0.7.45//RoaringBitmap-0.7.45.jar +ST4/4.0.4//ST4-4.0.4.jar +accessors-smart/1.2//accessors-smart-1.2.jar +activation/1.1.1//activation-1.1.1.jar +aircompressor/0.10//aircompressor-0.10.jar +algebra_2.12/2.0.0-M2//algebra_2.12-2.0.0-M2.jar +antlr-runtime/3.5.2//antlr-runtime-3.5.2.jar +antlr4-runtime/4.7.1//antlr4-runtime-4.7.1.jar +aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar +aopalliance/1.0//aopalliance-1.0.jar +arpack_combined_all/0.1//arpack_combined_all-0.1.jar +arrow-format/0.15.1//arrow-format-0.15.1.jar +arrow-memory/0.15.1//arrow-memory-0.15.1.jar +arrow-vector/0.15.1//arrow-vector-0.15.1.jar +audience-annotations/0.5.0//audience-annotations-0.5.0.jar 
+automaton/1.11-8//automaton-1.11-8.jar +avro-ipc/1.8.2//avro-ipc-1.8.2.jar +avro-mapred/1.8.2/hadoop2/avro-mapred-1.8.2-hadoop2.jar +avro/1.8.2//avro-1.8.2.jar +bonecp/0.8.0.RELEASE//bonecp-0.8.0.RELEASE.jar +breeze-macros_2.12/1.0//breeze-macros_2.12-1.0.jar +breeze_2.12/1.0//breeze_2.12-1.0.jar +cats-kernel_2.12/2.0.0-M4//cats-kernel_2.12-2.0.0-M4.jar +chill-java/0.9.5//chill-java-0.9.5.jar +chill_2.12/0.9.5//chill_2.12-0.9.5.jar +commons-beanutils/1.9.4//commons-beanutils-1.9.4.jar +commons-cli/1.2//commons-cli-1.2.jar +commons-codec/1.10//commons-codec-1.10.jar +commons-collections/3.2.2//commons-collections-3.2.2.jar +commons-compiler/3.0.15//commons-compiler-3.0.15.jar +commons-compress/1.8.1//commons-compress-1.8.1.jar +commons-configuration2/2.1.1//commons-configuration2-2.1.1.jar +commons-crypto/1.0.0//commons-crypto-1.0.0.jar +commons-daemon/1.0.13//commons-daemon-1.0.13.jar +commons-dbcp/1.4//commons-dbcp-1.4.jar +commons-httpclient/3.1//commons-httpclient-3.1.jar +commons-io/2.4//commons-io-2.4.jar +commons-lang/2.6//commons-lang-2.6.jar +commons-lang3/3.9//commons-lang3-3.9.jar +commons-logging/1.1.3//commons-logging-1.1.3.jar +commons-math3/3.4.1//commons-math3-3.4.1.jar +commons-net/3.1//commons-net-3.1.jar +commons-pool/1.5.4//commons-pool-1.5.4.jar +commons-text/1.6//commons-text-1.6.jar +compress-lzf/1.0.3//compress-lzf-1.0.3.jar +core/1.1.2//core-1.1.2.jar +curator-client/2.13.0//curator-client-2.13.0.jar +curator-framework/2.13.0//curator-framework-2.13.0.jar +curator-recipes/2.13.0//curator-recipes-2.13.0.jar +datanucleus-api-jdo/4.2.4//datanucleus-api-jdo-4.2.4.jar +datanucleus-core/4.1.17//datanucleus-core-4.1.17.jar +datanucleus-rdbms/4.1.19//datanucleus-rdbms-4.1.19.jar +derby/10.12.1.1//derby-10.12.1.1.jar +dnsjava/2.1.7//dnsjava-2.1.7.jar +dropwizard-metrics-hadoop-metrics2-reporter/0.1.2//dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar +ehcache/3.3.1//ehcache-3.3.1.jar +flatbuffers-java/1.9.0//flatbuffers-java-1.9.0.jar 
+generex/1.0.2//generex-1.0.2.jar +geronimo-jcache_1.0_spec/1.0-alpha-1//geronimo-jcache_1.0_spec-1.0-alpha-1.jar +gson/2.2.4//gson-2.2.4.jar +guava/14.0.1//guava-14.0.1.jar +guice-servlet/4.0//guice-servlet-4.0.jar +guice/4.0//guice-4.0.jar +hadoop-annotations/3.2.0//hadoop-annotations-3.2.0.jar +hadoop-auth/3.2.0//hadoop-auth-3.2.0.jar +hadoop-client/3.2.0//hadoop-client-3.2.0.jar +hadoop-common/3.2.0//hadoop-common-3.2.0.jar +hadoop-hdfs-client/3.2.0//hadoop-hdfs-client-3.2.0.jar +hadoop-mapreduce-client-common/3.2.0//hadoop-mapreduce-client-common-3.2.0.jar +hadoop-mapreduce-client-core/3.2.0//hadoop-mapreduce-client-core-3.2.0.jar +hadoop-mapreduce-client-jobclient/3.2.0//hadoop-mapreduce-client-jobclient-3.2.0.jar +hadoop-yarn-api/3.2.0//hadoop-yarn-api-3.2.0.jar +hadoop-yarn-client/3.2.0//hadoop-yarn-client-3.2.0.jar +hadoop-yarn-common/3.2.0//hadoop-yarn-common-3.2.0.jar +hadoop-yarn-registry/3.2.0//hadoop-yarn-registry-3.2.0.jar +hadoop-yarn-server-common/3.2.0//hadoop-yarn-server-common-3.2.0.jar +hadoop-yarn-server-web-proxy/3.2.0//hadoop-yarn-server-web-proxy-3.2.0.jar +hive-beeline/2.3.6//hive-beeline-2.3.6.jar +hive-cli/2.3.6//hive-cli-2.3.6.jar +hive-common/2.3.6//hive-common-2.3.6.jar +hive-exec/2.3.6/core/hive-exec-2.3.6-core.jar +hive-jdbc/2.3.6//hive-jdbc-2.3.6.jar +hive-llap-common/2.3.6//hive-llap-common-2.3.6.jar +hive-metastore/2.3.6//hive-metastore-2.3.6.jar +hive-serde/2.3.6//hive-serde-2.3.6.jar +hive-shims-0.23/2.3.6//hive-shims-0.23-2.3.6.jar +hive-shims-common/2.3.6//hive-shims-common-2.3.6.jar +hive-shims-scheduler/2.3.6//hive-shims-scheduler-2.3.6.jar +hive-shims/2.3.6//hive-shims-2.3.6.jar +hive-storage-api/2.7.1//hive-storage-api-2.7.1.jar +hive-vector-code-gen/2.3.6//hive-vector-code-gen-2.3.6.jar +hk2-api/2.6.1//hk2-api-2.6.1.jar +hk2-locator/2.6.1//hk2-locator-2.6.1.jar +hk2-utils/2.6.1//hk2-utils-2.6.1.jar +htrace-core4/4.1.0-incubating//htrace-core4-4.1.0-incubating.jar +httpclient/4.5.6//httpclient-4.5.6.jar 
+httpcore/4.4.12//httpcore-4.4.12.jar +istack-commons-runtime/3.0.8//istack-commons-runtime-3.0.8.jar +ivy/2.4.0//ivy-2.4.0.jar +jackson-annotations/2.10.0//jackson-annotations-2.10.0.jar +jackson-core-asl/1.9.13//jackson-core-asl-1.9.13.jar +jackson-core/2.10.0//jackson-core-2.10.0.jar +jackson-databind/2.10.0//jackson-databind-2.10.0.jar +jackson-dataformat-yaml/2.10.0//jackson-dataformat-yaml-2.10.0.jar +jackson-jaxrs-base/2.9.5//jackson-jaxrs-base-2.9.5.jar +jackson-jaxrs-json-provider/2.9.5//jackson-jaxrs-json-provider-2.9.5.jar +jackson-mapper-asl/1.9.13//jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations/2.10.0//jackson-module-jaxb-annotations-2.10.0.jar +jackson-module-paranamer/2.10.0//jackson-module-paranamer-2.10.0.jar +jackson-module-scala_2.12/2.10.0//jackson-module-scala_2.12-2.10.0.jar +jakarta.activation-api/1.2.1//jakarta.activation-api-1.2.1.jar +jakarta.annotation-api/1.3.5//jakarta.annotation-api-1.3.5.jar +jakarta.inject/2.6.1//jakarta.inject-2.6.1.jar +jakarta.validation-api/2.0.2//jakarta.validation-api-2.0.2.jar +jakarta.ws.rs-api/2.1.6//jakarta.ws.rs-api-2.1.6.jar +jakarta.xml.bind-api/2.3.2//jakarta.xml.bind-api-2.3.2.jar +janino/3.0.15//janino-3.0.15.jar +javassist/3.25.0-GA//javassist-3.25.0-GA.jar +javax.inject/1//javax.inject-1.jar +javax.jdo/3.2.0-m3//javax.jdo-3.2.0-m3.jar +javax.servlet-api/3.1.0//javax.servlet-api-3.1.0.jar +javolution/5.5.1//javolution-5.5.1.jar +jaxb-api/2.2.11//jaxb-api-2.2.11.jar +jaxb-runtime/2.3.2//jaxb-runtime-2.3.2.jar +jcip-annotations/1.0-1//jcip-annotations-1.0-1.jar +jcl-over-slf4j/1.7.16//jcl-over-slf4j-1.7.16.jar +jdo-api/3.0.1//jdo-api-3.0.1.jar +jersey-client/2.30//jersey-client-2.30.jar +jersey-common/2.30//jersey-common-2.30.jar +jersey-container-servlet-core/2.30//jersey-container-servlet-core-2.30.jar +jersey-container-servlet/2.30//jersey-container-servlet-2.30.jar +jersey-hk2/2.30//jersey-hk2-2.30.jar +jersey-media-jaxb/2.30//jersey-media-jaxb-2.30.jar 
+jersey-server/2.30//jersey-server-2.30.jar +jline/2.14.6//jline-2.14.6.jar +joda-time/2.10.5//joda-time-2.10.5.jar +jodd-core/3.5.2//jodd-core-3.5.2.jar +jpam/1.1//jpam-1.1.jar +json-smart/2.3//json-smart-2.3.jar +json/1.8//json-1.8.jar +json4s-ast_2.12/3.6.6//json4s-ast_2.12-3.6.6.jar +json4s-core_2.12/3.6.6//json4s-core_2.12-3.6.6.jar +json4s-jackson_2.12/3.6.6//json4s-jackson_2.12-3.6.6.jar +json4s-scalap_2.12/3.6.6//json4s-scalap_2.12-3.6.6.jar +jsp-api/2.1//jsp-api-2.1.jar +jsr305/3.0.0//jsr305-3.0.0.jar +jta/1.1//jta-1.1.jar +jul-to-slf4j/1.7.16//jul-to-slf4j-1.7.16.jar +kerb-admin/1.0.1//kerb-admin-1.0.1.jar +kerb-client/1.0.1//kerb-client-1.0.1.jar +kerb-common/1.0.1//kerb-common-1.0.1.jar +kerb-core/1.0.1//kerb-core-1.0.1.jar +kerb-crypto/1.0.1//kerb-crypto-1.0.1.jar +kerb-identity/1.0.1//kerb-identity-1.0.1.jar +kerb-server/1.0.1//kerb-server-1.0.1.jar +kerb-simplekdc/1.0.1//kerb-simplekdc-1.0.1.jar +kerb-util/1.0.1//kerb-util-1.0.1.jar +kerby-asn1/1.0.1//kerby-asn1-1.0.1.jar +kerby-config/1.0.1//kerby-config-1.0.1.jar +kerby-pkix/1.0.1//kerby-pkix-1.0.1.jar +kerby-util/1.0.1//kerby-util-1.0.1.jar +kerby-xdr/1.0.1//kerby-xdr-1.0.1.jar +kryo-shaded/4.0.2//kryo-shaded-4.0.2.jar +kubernetes-client/4.7.1//kubernetes-client-4.7.1.jar +kubernetes-model-common/4.7.1//kubernetes-model-common-4.7.1.jar +kubernetes-model/4.7.1//kubernetes-model-4.7.1.jar +leveldbjni-all/1.8//leveldbjni-all-1.8.jar +libfb303/0.9.3//libfb303-0.9.3.jar +libthrift/0.12.0//libthrift-0.12.0.jar +log4j/1.2.17//log4j-1.2.17.jar +logging-interceptor/3.12.6//logging-interceptor-3.12.6.jar +lz4-java/1.7.1//lz4-java-1.7.1.jar +machinist_2.12/0.6.8//machinist_2.12-0.6.8.jar +macro-compat_2.12/1.1.1//macro-compat_2.12-1.1.1.jar +mesos/1.4.0/shaded-protobuf/mesos-1.4.0-shaded-protobuf.jar +metrics-core/4.1.1//metrics-core-4.1.1.jar +metrics-graphite/4.1.1//metrics-graphite-4.1.1.jar +metrics-jmx/4.1.1//metrics-jmx-4.1.1.jar +metrics-json/4.1.1//metrics-json-4.1.1.jar 
+metrics-jvm/4.1.1//metrics-jvm-4.1.1.jar +minlog/1.3.0//minlog-1.3.0.jar +mssql-jdbc/6.2.1.jre7//mssql-jdbc-6.2.1.jre7.jar +netty-all/4.1.42.Final//netty-all-4.1.42.Final.jar +nimbus-jose-jwt/4.41.1//nimbus-jose-jwt-4.41.1.jar +objenesis/2.5.1//objenesis-2.5.1.jar +okhttp/2.7.5//okhttp-2.7.5.jar +okhttp/3.12.6//okhttp-3.12.6.jar +okio/1.15.0//okio-1.15.0.jar +opencsv/2.3//opencsv-2.3.jar +orc-core/1.5.9//orc-core-1.5.9.jar +orc-mapreduce/1.5.9//orc-mapreduce-1.5.9.jar +orc-shims/1.5.9//orc-shims-1.5.9.jar +oro/2.0.8//oro-2.0.8.jar +osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar +paranamer/2.8//paranamer-2.8.jar +parquet-column/1.10.1//parquet-column-1.10.1.jar +parquet-common/1.10.1//parquet-common-1.10.1.jar +parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar +parquet-format/2.4.0//parquet-format-2.4.0.jar +parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar +parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar +protobuf-java/2.5.0//protobuf-java-2.5.0.jar +py4j/0.10.8.1//py4j-0.10.8.1.jar +pyrolite/4.30//pyrolite-4.30.jar +re2j/1.1//re2j-1.1.jar +scala-collection-compat_2.12/2.1.1//scala-collection-compat_2.12-2.1.1.jar +scala-compiler/2.12.10//scala-compiler-2.12.10.jar +scala-library/2.12.10//scala-library-2.12.10.jar +scala-parser-combinators_2.12/1.1.2//scala-parser-combinators_2.12-1.1.2.jar +scala-reflect/2.12.10//scala-reflect-2.12.10.jar +scala-xml_2.12/1.2.0//scala-xml_2.12-1.2.0.jar +shapeless_2.12/2.3.3//shapeless_2.12-2.3.3.jar +shims/0.7.45//shims-0.7.45.jar +slf4j-api/1.7.16//slf4j-api-1.7.16.jar +slf4j-log4j12/1.7.16//slf4j-log4j12-1.7.16.jar +snakeyaml/1.24//snakeyaml-1.24.jar +snappy-java/1.1.7.3//snappy-java-1.1.7.3.jar +spire-macros_2.12/0.17.0-M1//spire-macros_2.12-0.17.0-M1.jar +spire-platform_2.12/0.17.0-M1//spire-platform_2.12-0.17.0-M1.jar +spire-util_2.12/0.17.0-M1//spire-util_2.12-0.17.0-M1.jar +spire_2.12/0.17.0-M1//spire_2.12-0.17.0-M1.jar +stax-api/1.0.1//stax-api-1.0.1.jar +stax2-api/3.1.4//stax2-api-3.1.4.jar 
+stream/2.9.6//stream-2.9.6.jar +super-csv/2.2.0//super-csv-2.2.0.jar +threeten-extra/1.5.0//threeten-extra-1.5.0.jar +token-provider/1.0.1//token-provider-1.0.1.jar +transaction-api/1.1//transaction-api-1.1.jar +univocity-parsers/2.8.3//univocity-parsers-2.8.3.jar +velocity/1.5//velocity-1.5.jar +woodstox-core/5.0.3//woodstox-core-5.0.3.jar +xbean-asm7-shaded/4.15//xbean-asm7-shaded-4.15.jar +xz/1.5//xz-1.5.jar +zjsonpatch/0.3.0//zjsonpatch-0.3.0.jar +zookeeper/3.4.14//zookeeper-3.4.14.jar +zstd-jni/1.4.4-3//zstd-jni-1.4.4-3.jar diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index fa1736163d4c6..b444b74d4027c 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -116,7 +116,8 @@ def build_pr_component_dic(jira_prs): dic = {} for issue, pr in jira_prs: print(issue) - jira_components = [c.name.upper() for c in jira_client.issue(issue).fields.components] + page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue)) + jira_components = [c['name'].upper() for c in page['fields']['components']] if pr['number'] in dic: dic[pr['number']][1].update(jira_components) else: @@ -163,7 +164,8 @@ def reset_pr_labels(pr_num, jira_components): url = pr['html_url'] title = "[Github] Pull Request #%s (%s)" % (pr['number'], pr['user']['login']) try: - existing_links = map(lambda l: l.raw['object']['url'], jira_client.remote_links(issue)) + page = get_json(get_url(JIRA_API_BASE + "/rest/api/2/issue/" + issue + "/remotelink")) + existing_links = map(lambda l: l['object']['url'], page) except: print("Failure reading JIRA %s (does it exist?)" % issue) print(sys.exc_info()[0]) diff --git a/dev/lint-python b/dev/lint-python index 06816932e754a..24f0d8fb6ea36 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -27,6 +27,8 @@ MINIMUM_PYCODESTYLE="2.4.0" SPHINX_BUILD="sphinx-build" +PYTHON_EXECUTABLE="python3" + function compile_python_test { local COMPILE_STATUS= local COMPILE_REPORT= @@ -36,9 +38,9 @@ function compile_python_test { exit 1; fi - # 
compileall: https://docs.python.org/2/library/compileall.html + # compileall: https://docs.python.org/3/library/compileall.html echo "starting python compilation test..." - COMPILE_REPORT=$( (python -B -mcompileall -q -l $1) 2>&1) + COMPILE_REPORT=$( ("$PYTHON_EXECUTABLE" -B -mcompileall -q -l -x "[/\\\\][.]git" $1) 2>&1) COMPILE_STATUS=$? if [ $COMPILE_STATUS -ne 0 ]; then @@ -70,7 +72,7 @@ function pycodestyle_test { RUN_LOCAL_PYCODESTYLE="False" if hash "$PYCODESTYLE_BUILD" 2> /dev/null; then VERSION=$( $PYCODESTYLE_BUILD --version 2> /dev/null) - EXPECTED_PYCODESTYLE=$( (python -c 'from distutils.version import LooseVersion; + EXPECTED_PYCODESTYLE=$( ("$PYTHON_EXECUTABLE" -c 'from distutils.version import LooseVersion; print(LooseVersion("""'${VERSION[0]}'""") >= LooseVersion("""'$MINIMUM_PYCODESTYLE'"""))')\ 2> /dev/null) @@ -96,7 +98,7 @@ function pycodestyle_test { fi echo "starting pycodestyle test..." - PYCODESTYLE_REPORT=$( (python "$PYCODESTYLE_SCRIPT_PATH" --config=dev/tox.ini $1) 2>&1) + PYCODESTYLE_REPORT=$( ("$PYTHON_EXECUTABLE" "$PYCODESTYLE_SCRIPT_PATH" --config=dev/tox.ini $1) 2>&1) PYCODESTYLE_STATUS=$? 
else # we have the right version installed, so run locally @@ -130,7 +132,7 @@ function flake8_test { FLAKE8_VERSION="$($FLAKE8_BUILD --version 2> /dev/null)" VERSION=($FLAKE8_VERSION) - EXPECTED_FLAKE8=$( (python -c 'from distutils.version import LooseVersion; + EXPECTED_FLAKE8=$( ("$PYTHON_EXECUTABLE" -c 'from distutils.version import LooseVersion; print(LooseVersion("""'${VERSION[0]}'""") >= LooseVersion("""'$MINIMUM_FLAKE8'"""))') \ 2> /dev/null) @@ -175,7 +177,7 @@ function pydocstyle_test { fi PYDOCSTYLE_VERSION="$($PYDOCSTYLEBUILD --version 2> /dev/null)" - EXPECTED_PYDOCSTYLE=$(python -c 'from distutils.version import LooseVersion; \ + EXPECTED_PYDOCSTYLE=$("$PYTHON_EXECUTABLE" -c 'from distutils.version import LooseVersion; \ print(LooseVersion("""'$PYDOCSTYLE_VERSION'""") >= LooseVersion("""'$MINIMUM_PYDOCSTYLE'"""))' \ 2> /dev/null) diff --git a/dev/lint-r b/dev/lint-r index bfda0bca15eb7..b08f5efecd5d3 100755 --- a/dev/lint-r +++ b/dev/lint-r @@ -17,6 +17,9 @@ # limitations under the License. # +set -o pipefail +set -e + SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" @@ -24,7 +27,7 @@ LINT_R_REPORT_FILE_NAME="$SPARK_ROOT_DIR/dev/lint-r-report.log" if ! type "Rscript" > /dev/null; then echo "ERROR: You should install R" - exit + exit 1 fi `which Rscript` --vanilla "$SPARK_ROOT_DIR/dev/lint-r.R" "$SPARK_ROOT_DIR" | tee "$LINT_R_REPORT_FILE_NAME" diff --git a/dev/lint-r.R b/dev/lint-r.R index a4261d266bbc0..7e165319e316a 100644 --- a/dev/lint-r.R +++ b/dev/lint-r.R @@ -27,7 +27,7 @@ if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { # Installs lintr from Github in a local directory. # NOTE: The CRAN's version is too old to adapt to our rules. 
if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr@5431140") + devtools::install_github("jimhester/lintr@v2.0.0") } library(lintr) diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index a550af93feecd..0b30eec76bb53 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -160,7 +160,7 @@ fi # Build uber fat JAR cd "$SPARK_HOME" -export MAVEN_OPTS="${MAVEN_OPTS:--Xmx2g -XX:ReservedCodeCacheSize=512m}" +export MAVEN_OPTS="${MAVEN_OPTS:--Xmx2g -XX:ReservedCodeCacheSize=1g}" # Store the command as an array because $MVN variable might have spaces in it. # Normal quoting tricks don't work. @@ -233,7 +233,7 @@ if [ "$MAKE_PIP" == "true" ]; then pushd "$SPARK_HOME/python" > /dev/null # Delete the egg info file if it exists, this can cache older setup files. rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" - python setup.py sdist + python3 setup.py sdist popd > /dev/null else echo "Skipping building python distribution package" diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index fa3d50b8989f1..967cdace60dc9 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -97,9 +97,9 @@ def fail(msg): def run_cmd(cmd): print(cmd) if isinstance(cmd, list): - return subprocess.check_output(cmd).decode(sys.getdefaultencoding()) + return subprocess.check_output(cmd).decode('utf-8') else: - return subprocess.check_output(cmd.split(" ")).decode(sys.getdefaultencoding()) + return subprocess.check_output(cmd.split(" ")).decode('utf-8') def continue_maybe(prompt): diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 4171f28684d59..e9f10233b12b7 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -from __future__ import print_function - from pyspark.sql import SparkSession from pyspark.mllib.linalg import * import sys diff --git a/dev/requirements.txt b/dev/requirements.txt index 3fdd3425ffcc2..baea9213dbc97 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -2,5 +2,4 @@ flake8==3.5.0 jira==1.0.3 PyGithub==1.26.0 Unidecode==0.04.19 -pypandoc==1.3.3 sphinx diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 60cf4d8209416..470f21e69d46a 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -39,21 +39,16 @@ PYTHON_EXECS=() # Some systems don't have pip or virtualenv - in those cases our tests won't work. if hash virtualenv 2>/dev/null && [ ! -n "$USE_CONDA" ]; then echo "virtualenv installed - using. Note if this is a conda virtual env you may wish to set USE_CONDA" - # Figure out which Python execs we should test pip installation with - if hash python2 2>/dev/null; then - # We do this since we are testing with virtualenv and the default virtual env python - # is in /usr/bin/python - PYTHON_EXECS+=('python2') - elif hash python 2>/dev/null; then - # If python2 isn't installed fallback to python if available - PYTHON_EXECS+=('python') - fi + # test only against python3 if hash python3 2>/dev/null; then - PYTHON_EXECS+=('python3') + PYTHON_EXECS=('python3') + else + echo "Python3 not installed on system, skipping pip installability tests" + exit 0 fi elif hash conda 2>/dev/null; then echo "Using conda virtual environments" - PYTHON_EXECS=('3.5') + PYTHON_EXECS=('3.6') USE_CONDA=1 else echo "Missing virtualenv & conda, skipping pip installability tests" @@ -90,14 +85,14 @@ for python in "${PYTHON_EXECS[@]}"; do fi # Upgrade pip & friends if using virtual env if [ ! -n "$USE_CONDA" ]; then - pip install --upgrade pip pypandoc wheel numpy + pip install --upgrade pip wheel numpy fi echo "Creating pip installable source dist" cd "$FWDIR"/python # Delete the egg info file if it exists, this can cache the setup file. 
rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" - python setup.py sdist + python3 setup.py sdist echo "Installing dist into virtual env" @@ -117,9 +112,9 @@ for python in "${PYTHON_EXECS[@]}"; do echo "Run basic sanity check on pip installed version with spark-submit" spark-submit "$FWDIR"/dev/pip-sanity-check.py echo "Run basic sanity check with import based" - python "$FWDIR"/dev/pip-sanity-check.py + python3 "$FWDIR"/dev/pip-sanity-check.py echo "Run the tests for context.py" - python "$FWDIR"/python/pyspark/context.py + python3 "$FWDIR"/python/pyspark/context.py cd "$FWDIR" diff --git a/dev/run-tests b/dev/run-tests index 9cf93d000d0ea..143d78ec63731 100755 --- a/dev/run-tests +++ b/dev/run-tests @@ -20,10 +20,10 @@ FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then - echo "Python versions prior to 2.7 are not supported." + echo "Python versions prior to 3.6 are not supported." exit -1 fi -exec python -u ./dev/run-tests.py "$@" +exec python3 -u ./dev/run-tests.py "$@" diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index 5bc03e41d1f2d..c3adc696a5122 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -25,10 +25,12 @@ FWDIR="$( cd "$( dirname "$0" )/.." && pwd )" cd "$FWDIR" -PYTHON_VERSION_CHECK=$(python -c 'import sys; print(sys.version_info < (2, 7, 0))') +export PATH=/home/anaconda/envs/py36/bin:$PATH + +PYTHON_VERSION_CHECK=$(python3 -c 'import sys; print(sys.version_info < (3, 6, 0))') if [[ "$PYTHON_VERSION_CHECK" == "True" ]]; then - echo "Python versions prior to 2.7 are not supported." + echo "Python versions prior to 3.6 are not supported." 
exit -1 fi -exec python -u ./dev/run-tests-jenkins.py "$@" +exec python3 -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index e9b0b327603be..72e32d4e16e14 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,7 +17,6 @@ # limitations under the License. # -from __future__ import print_function import os import sys import json @@ -177,12 +176,15 @@ def main(): if "test-maven" in ghprb_pull_title: os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" # Switch the Hadoop profile based on the PR title: - if "test-hadoop2.6" in ghprb_pull_title: - os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.6" if "test-hadoop2.7" in ghprb_pull_title: os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.7" if "test-hadoop3.2" in ghprb_pull_title: os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop3.2" + # Switch the Hive profile based on the PR title: + if "test-hive1.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive1.2" + if "test-hive2.3" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive2.3" build_display_name = os.environ["BUILD_DISPLAY_NAME"] build_url = os.environ["BUILD_URL"] diff --git a/dev/run-tests.py b/dev/run-tests.py index ea515708124db..5255a77ec2081 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,7 +17,6 @@ # limitations under the License. # -from __future__ import print_function import itertools from argparse import ArgumentParser import os @@ -44,15 +43,20 @@ def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. 
If a file is not associated with a more specific submodule, then this method will consider that - file to belong to the 'root' module. + file to belong to the 'root' module. GitHub Action and Appveyor files are ignored. >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/core/foo"])) ['pyspark-core', 'sql'] >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] + >>> [x.name for x in determine_modules_for_files( \ + [".github/workflows/master.yml", "appveyor.yml"])] + [] """ changed_modules = set() for filename in filenames: + if filename in (".github/workflows/master.yml", "appveyor.yml"): + continue matched_at_least_one_module = False for module in modules.all_modules: if module.contains_file(filename): @@ -175,7 +179,8 @@ def run_apache_rat_checks(): run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")]) -def run_scala_style_checks(build_profiles): +def run_scala_style_checks(extra_profiles): + build_profiles = extra_profiles + modules.root.build_profile_flags set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE") profiles = " ".join(build_profiles) print("[info] Checking Scala style using SBT with these profiles: ", profiles) @@ -265,7 +270,7 @@ def exec_sbt(sbt_args=()): echo_proc.wait() for line in iter(sbt_proc.stdout.readline, b''): if not sbt_output_filter.match(line): - print(line, end='') + print(line.decode('utf-8'), end='') retcode = sbt_proc.wait() if retcode != 0: @@ -291,9 +296,28 @@ def get_hadoop_profiles(hadoop_version): sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) -def build_spark_maven(hadoop_version): +def get_hive_profiles(hive_version): + """ + For the given Hive version tag, return a list of Maven/SBT profile flags for + building and testing against that Hive version. 
+ """ + + sbt_maven_hive_profiles = { + "hive1.2": ["-Phive-1.2"], + "hive2.3": ["-Phive-2.3"], + } + + if hive_version in sbt_maven_hive_profiles: + return sbt_maven_hive_profiles[hive_version] + else: + print("[error] Could not find", hive_version, "in the list. Valid options", + " are", sbt_maven_hive_profiles.keys()) + sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) + + +def build_spark_maven(extra_profiles): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + build_profiles = extra_profiles + modules.root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -302,9 +326,9 @@ def build_spark_maven(hadoop_version): exec_maven(profiles_and_goals) -def build_spark_sbt(hadoop_version): +def build_spark_sbt(extra_profiles): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + build_profiles = extra_profiles + modules.root.build_profile_flags sbt_goals = ["test:package", # Build test jars as some tests depend on them "streaming-kinesis-asl-assembly/assembly"] profiles_and_goals = build_profiles + sbt_goals @@ -314,10 +338,10 @@ def build_spark_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_spark_unidoc_sbt(hadoop_version): +def build_spark_unidoc_sbt(extra_profiles): set_title_and_block("Building Unidoc API Documentation", "BLOCK_DOCUMENTATION") # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + build_profiles = extra_profiles + modules.root.build_profile_flags sbt_goals = ["unidoc"] profiles_and_goals = build_profiles + sbt_goals @@ -327,9 +351,9 @@ def build_spark_unidoc_sbt(hadoop_version): exec_sbt(profiles_and_goals) -def build_spark_assembly_sbt(hadoop_version, checkstyle=False): +def build_spark_assembly_sbt(extra_profiles, 
checkstyle=False): # Enable all of the profiles for the build: - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags + build_profiles = extra_profiles + modules.root.build_profile_flags sbt_goals = ["assembly/package"] profiles_and_goals = build_profiles + sbt_goals print("[info] Building Spark assembly using SBT with these arguments: ", @@ -339,25 +363,25 @@ def build_spark_assembly_sbt(hadoop_version, checkstyle=False): if checkstyle: run_java_style_checks(build_profiles) - build_spark_unidoc_sbt(hadoop_version) + build_spark_unidoc_sbt(extra_profiles) -def build_apache_spark(build_tool, hadoop_version): - """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or - `maven`). Defaults to using `sbt`.""" +def build_apache_spark(build_tool, extra_profiles): + """Will build Spark with the extra profiles and the passed in build tool + (either `sbt` or `maven`). Defaults to using `sbt`.""" set_title_and_block("Building Spark", "BLOCK_BUILD") rm_r("lib_managed") if build_tool == "maven": - build_spark_maven(hadoop_version) + build_spark_maven(extra_profiles) else: - build_spark_sbt(hadoop_version) + build_spark_sbt(extra_profiles) -def detect_binary_inop_with_mima(hadoop_version): - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags +def detect_binary_inop_with_mima(extra_profiles): + build_profiles = extra_profiles + modules.root.build_profile_flags set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA") profiles = " ".join(build_profiles) print("[info] Detecting binary incompatibilities with MiMa using SBT with these profiles: ", @@ -391,14 +415,14 @@ def run_scala_tests_sbt(test_modules, test_profiles): exec_sbt(profiles_and_goals) -def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags): +def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags): """Function to properly execute all tests passed in as 
a set from the `determine_test_suites` function""" set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS") test_modules = set(test_modules) - test_profiles = get_hadoop_profiles(hadoop_version) + \ + test_profiles = extra_profiles + \ list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) if excluded_tags: @@ -551,6 +575,7 @@ def main(): # to reflect the environment settings build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt") hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7") + hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3") test_env = "amplab_jenkins" # add path for Python3 in Jenkins if we're calling from a Jenkins machine # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be: @@ -560,10 +585,12 @@ def main(): # else we're running locally and can use local settings build_tool = "sbt" hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7") + hive_version = os.environ.get("HIVE_PROFILE", "hive2.3") test_env = "local" print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, - "under environment", test_env) + "and Hive profile", hive_version, "under environment", test_env) + extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version) changed_modules = None changed_files = None @@ -597,8 +624,7 @@ def main(): if not changed_files or any(f.endswith(".scala") or f.endswith("scalastyle-config.xml") for f in changed_files): - build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags - run_scala_style_checks(build_profiles) + run_scala_style_checks(extra_profiles) should_run_java_style_checks = False if not changed_files or any(f.endswith(".java") or f.endswith("checkstyle.xml") @@ -626,18 +652,18 @@ def main(): run_build_tests() # spark build - build_apache_spark(build_tool, hadoop_version) + build_apache_spark(build_tool, 
extra_profiles) # backwards compatibility checks if build_tool == "sbt": # Note: compatibility tests only supported in sbt for now - detect_binary_inop_with_mima(hadoop_version) + detect_binary_inop_with_mima(extra_profiles) # Since we did not build assembly/package before running dev/mima, we need to # do it here because the tests still rely on it; see SPARK-13294 for details. - build_spark_assembly_sbt(hadoop_version, should_run_java_style_checks) + build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks) # run the test suites - run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags) + run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags) modules_with_python_tests = [m for m in test_modules if m.python_test_goals] if modules_with_python_tests: diff --git a/dev/scalafmt b/dev/scalafmt index 76f688a2f5b88..3f69bc98f51c7 100755 --- a/dev/scalafmt +++ b/dev/scalafmt @@ -17,7 +17,6 @@ # limitations under the License. # -# by default, format only files that differ from git master -params="${@:---diff}" +VERSION="${@:-2.12}" +./build/mvn -Pscala-$VERSION mvn-scalafmt_$VERSION:format -Dscalafmt.skip=false -./build/mvn mvn-scalafmt_2.12:format -Dscalafmt.skip=false -Dscalafmt.parameters="$params" diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 0f6dbf2f99a97..391e4bbe1b1f0 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -329,6 +329,7 @@ def __hash__(self): "pyspark.tests.test_join", "pyspark.tests.test_profiler", "pyspark.tests.test_rdd", + "pyspark.tests.test_rddbarrier", "pyspark.tests.test_readwrite", "pyspark.tests.test_serializers", "pyspark.tests.test_shuffle", @@ -361,8 +362,14 @@ def __hash__(self): "pyspark.sql.udf", "pyspark.sql.window", "pyspark.sql.avro.functions", + "pyspark.sql.pandas.conversion", + "pyspark.sql.pandas.map_ops", + "pyspark.sql.pandas.group_ops", + "pyspark.sql.pandas.types", + "pyspark.sql.pandas.serializers", + 
"pyspark.sql.pandas.typehints", + "pyspark.sql.pandas.utils", # unittests - "pyspark.sql.tests.test_appsubmit", "pyspark.sql.tests.test_arrow", "pyspark.sql.tests.test_catalog", "pyspark.sql.tests.test_column", @@ -372,10 +379,13 @@ def __hash__(self): "pyspark.sql.tests.test_datasources", "pyspark.sql.tests.test_functions", "pyspark.sql.tests.test_group", + "pyspark.sql.tests.test_pandas_cogrouped_map", + "pyspark.sql.tests.test_pandas_grouped_map", + "pyspark.sql.tests.test_pandas_map", "pyspark.sql.tests.test_pandas_udf", "pyspark.sql.tests.test_pandas_udf_grouped_agg", - "pyspark.sql.tests.test_pandas_udf_grouped_map", "pyspark.sql.tests.test_pandas_udf_scalar", + "pyspark.sql.tests.test_pandas_udf_typehints", "pyspark.sql.tests.test_pandas_udf_window", "pyspark.sql.tests.test_readwriter", "pyspark.sql.tests.test_serde", @@ -459,6 +469,7 @@ def __hash__(self): "pyspark.ml.evaluation", "pyspark.ml.feature", "pyspark.ml.fpm", + "pyspark.ml.functions", "pyspark.ml.image", "pyspark.ml.linalg.__init__", "pyspark.ml.recommendation", diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py index ec6ea86269f5e..d9cb8aa45c8d2 100644 --- a/dev/sparktestsupport/shellutils.py +++ b/dev/sparktestsupport/shellutils.py @@ -15,14 +15,12 @@ # limitations under the License. 
# -from __future__ import print_function import os import shutil import subprocess import sys subprocess_check_output = subprocess.check_output -subprocess_check_call = subprocess.check_call def exit_from_command_with_retcode(cmd, retcode): @@ -55,9 +53,9 @@ def run_cmd(cmd, return_output=False): cmd = cmd.split() try: if return_output: - return subprocess_check_output(cmd).decode(sys.getdefaultencoding()) + return subprocess_check_output(cmd).decode('utf-8') else: - return subprocess_check_call(cmd) + return subprocess.run(cmd, universal_newlines=True, check=True) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 54574f6097e26..936ac00f6b9e7 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -31,9 +31,10 @@ export LC_ALL=C # NOTE: These should match those in the release publishing script HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkubernetes -Pyarn -Phive" MVN="build/mvn" -HADOOP_PROFILES=( - hadoop-2.7 - hadoop-3.2 +HADOOP_HIVE_PROFILES=( + hadoop-2.7-hive-1.2 + hadoop-2.7-hive-2.3 + hadoop-3.2-hive-2.3 ) # We'll switch the version to a temp. 
one, publish POMs using that new version, then switch back to @@ -66,19 +67,45 @@ trap reset_version EXIT $MVN -q versions:set -DnewVersion=$TEMP_VERSION -DgenerateBackupPoms=false > /dev/null # Generate manifests for each Hadoop profile: -for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do - echo "Performing Maven install for $HADOOP_PROFILE" - $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install clean -q +for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do + if [[ $HADOOP_HIVE_PROFILE == **hadoop-3.2-hive-2.3** ]]; then + HADOOP_PROFILE=hadoop-3.2 + HIVE_PROFILE=hive-2.3 + elif [[ $HADOOP_HIVE_PROFILE == **hadoop-2.7-hive-2.3** ]]; then + HADOOP_PROFILE=hadoop-2.7 + HIVE_PROFILE=hive-2.3 + else + HADOOP_PROFILE=hadoop-2.7 + HIVE_PROFILE=hive-1.2 + fi + echo "Performing Maven install for $HADOOP_HIVE_PROFILE" + $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE jar:jar jar:test-jar install:install clean -q - echo "Performing Maven validate for $HADOOP_PROFILE" - $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE validate -q + echo "Performing Maven validate for $HADOOP_HIVE_PROFILE" + $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE validate -q - echo "Generating dependency manifest for $HADOOP_PROFILE" + echo "Generating dependency manifest for $HADOOP_HIVE_PROFILE" mkdir -p dev/pr-deps - $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE dependency:build-classpath -pl assembly \ + $MVN $HADOOP2_MODULE_PROFILES -P$HADOOP_PROFILE -P$HIVE_PROFILE dependency:build-classpath -pl assembly -am \ | grep "Dependencies classpath:" -A 1 \ - | tail -n 1 | tr ":" "\n" | rev | cut -d "/" -f 1 | rev | sort \ - | grep -v spark > dev/pr-deps/spark-deps-$HADOOP_PROFILE + | tail -n 1 | tr ":" "\n" | awk -F '/' '{ + # For each dependency classpath, we fetch the last three parts split by "/": artifact id, version, and jar name. 
+ # Since classifier, if exists, always sits between "artifact_id-version-" and ".jar" suffix in the jar name, + # we extract classifier and put it right before the jar name explicitly. + # For example, `orc-core/1.5.5/nohive/orc-core-1.5.5-nohive.jar` + # ^^^^^^ + # extracted classifier + # `okio/1.15.0//okio-1.15.0.jar` + # ^ + # empty for dependencies without classifier + artifact_id=$(NF-2); + version=$(NF-1); + jar_name=$NF; + classifier_start_index=length(artifact_id"-"version"-") + 1; + classifier_end_index=index(jar_name, ".jar") - 1; + classifier=substr(jar_name, classifier_start_index, classifier_end_index - classifier_start_index + 1); + print artifact_id"/"version"/"classifier"/"jar_name + }' | sort | grep -v spark > dev/pr-deps/spark-deps-$HADOOP_HIVE_PROFILE done if [[ $@ == **replace-manifest** ]]; then @@ -88,13 +115,13 @@ if [[ $@ == **replace-manifest** ]]; then exit 0 fi -for HADOOP_PROFILE in "${HADOOP_PROFILES[@]}"; do +for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do set +e dep_diff="$( git diff \ --no-index \ - dev/deps/spark-deps-$HADOOP_PROFILE \ - dev/pr-deps/spark-deps-$HADOOP_PROFILE \ + dev/deps/spark-deps-$HADOOP_HIVE_PROFILE \ + dev/pr-deps/spark-deps-$HADOOP_HIVE_PROFILE \ )" set -e if [ "$dep_diff" != "" ]; then diff --git a/dev/tox.ini b/dev/tox.ini index 11b1b040035b0..54f65692c8303 100644 --- a/dev/tox.ini +++ b/dev/tox.ini @@ -16,6 +16,6 @@ [pycodestyle] ignore=E226,E241,E305,E402,E722,E731,E741,W503,W504 max-line-length=100 -exclude=cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/* +exclude=cloudpickle.py,heapq3.py,shared.py,python/docs/conf.py,work/*/*.py,python/.eggs/*,dist/*,.git/* [pydocstyle] ignore=D100,D101,D102,D103,D104,D105,D106,D107,D200,D201,D202,D203,D204,D205,D206,D207,D208,D209,D210,D211,D212,D213,D214,D215,D300,D301,D302,D400,D401,D402,D403,D404,D405,D406,D407,D408,D409,D410,D411,D412,D413,D414 diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 
index 0000000000000..2260493b46ab3 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +sql-configs.html diff --git a/docs/README.md b/docs/README.md index da531321aa5da..22039871cf63d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -6,9 +6,9 @@ license: | The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -36,19 +36,31 @@ You need to have [Ruby](https://www.ruby-lang.org/en/documentation/installation/ installed. Also install the following libraries: ```sh -$ sudo gem install jekyll jekyll-redirect-from pygments.rb -$ sudo pip install Pygments -# Following is needed only for generating API docs -$ sudo pip install sphinx pypandoc mkdocs -$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "rmarkdown"), repos="https://cloud.r-project.org/")' -$ sudo Rscript -e 'devtools::install_version("roxygen2", version = "5.0.1", repos="https://cloud.r-project.org/")' -$ sudo Rscript -e 'devtools::install_version("testthat", version = "1.0.2", repos="https://cloud.r-project.org/")' +$ sudo gem install jekyll jekyll-redirect-from rouge ``` Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to replace gem with gem2.0. 
+### R Documentation + +If you'd like to generate R documentation, you'll need to [install Pandoc](https://pandoc.org/installing.html) +and install these libraries: + +```sh +$ sudo Rscript -e 'install.packages(c("knitr", "devtools", "testthat", "rmarkdown"), repos="https://cloud.r-project.org/")' +$ sudo Rscript -e 'devtools::install_version("roxygen2", version = "5.0.1", repos="https://cloud.r-project.org/")' +``` + Note: Other versions of roxygen2 might work in SparkR documentation generation but `RoxygenNote` field in `$SPARK_HOME/R/pkg/DESCRIPTION` is 5.0.1, which is updated if the version is mismatched. +### API Documentation + +To generate API docs for any language, you'll need to install these libraries: + +```sh +$ sudo pip install sphinx mkdocs numpy +``` + ## Generating the Documentation HTML We include the Spark documentation as part of the source (as opposed to using a hosted wiki, such as @@ -103,3 +115,17 @@ using [MkDocs](https://www.mkdocs.org/). NOTE: To skip the step of building and copying over the Scala, Java, Python, R and SQL API docs, run `SKIP_API=1 jekyll build`. In addition, `SKIP_SCALADOC=1`, `SKIP_PYTHONDOC=1`, `SKIP_RDOC=1` and `SKIP_SQLDOC=1` can be used to skip a single step of the corresponding language. `SKIP_SCALADOC` indicates skipping both the Scala and Java docs. + +### Automatically Rebuilding API Docs + +`jekyll serve --watch` will only watch what's in `docs/`, and it won't follow symlinks. That means it won't monitor your API docs under `python/docs` or elsewhere. + +To work around this limitation for Python, install [`entr`](http://eradman.com/entrproject/) and run the following in a separate shell: + +```sh +cd "$SPARK_HOME/python/docs" +find .. -type f -name '*.py' \ +| entr -s 'make html && cp -r _build/html/. ../../docs/api/python' +``` + +Whenever there is a change to your Python code, `entr` will automatically rebuild the Python API docs and copy them to `docs/`, thus triggering a Jekyll update. 
diff --git a/docs/_config.yml b/docs/_config.yml index 146c90fcff6e5..a888620139207 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -1,4 +1,4 @@ -highlighter: pygments +highlighter: rouge markdown: kramdown gems: - jekyll-redirect-from @@ -17,7 +17,7 @@ include: SPARK_VERSION: 3.0.0-SNAPSHOT SPARK_VERSION_SHORT: 3.0.0 SCALA_BINARY_VERSION: "2.12" -SCALA_VERSION: "2.12.8" +SCALA_VERSION: "2.12.10" MESOS_VERSION: 1.0.0 SPARK_ISSUE_TRACKER_URL: https://issues.apache.org/jira/browse/SPARK SPARK_GITHUB_URL: https://github.com/apache/spark diff --git a/docs/_data/menu-migration.yaml b/docs/_data/menu-migration.yaml new file mode 100644 index 0000000000000..1d8b311dd64fb --- /dev/null +++ b/docs/_data/menu-migration.yaml @@ -0,0 +1,12 @@ +- text: Spark Core + url: core-migration-guide.html +- text: SQL, Datasets and DataFrame + url: sql-migration-guide.html +- text: Structured Streaming + url: ss-migration-guide.html +- text: MLlib (Machine Learning) + url: ml-migration-guide.html +- text: PySpark (Python on Spark) + url: pyspark-migration-guide.html +- text: SparkR (R on Spark) + url: sparkr-migration-guide.html diff --git a/docs/_data/menu-sql.yaml b/docs/_data/menu-sql.yaml index 717911b5a4645..38a5cf61245a6 100644 --- a/docs/_data/menu-sql.yaml +++ b/docs/_data/menu-sql.yaml @@ -15,6 +15,8 @@ url: sql-getting-started.html#creating-datasets - text: Interoperating with RDDs url: sql-getting-started.html#interoperating-with-rdds + - text: Scalar Functions + url: sql-getting-started.html#scalar-functions - text: Aggregations url: sql-getting-started.html#aggregations - text: Data Sources @@ -22,6 +24,8 @@ subitems: - text: "Generic Load/Save Functions" url: sql-data-sources-load-save-functions.html + - text: "Generic File Source Options" + url: sql-data-sources-generic-options.html - text: Parquet Files url: sql-data-sources-parquet.html - text: ORC Files @@ -34,6 +38,8 @@ url: sql-data-sources-jdbc.html - text: Avro Files url: sql-data-sources-avro.html + - text: 
Whole Binary Files + url: sql-data-sources-binaryFile.html - text: Troubleshooting url: sql-data-sources-troubleshooting.html - text: Performance Tuning @@ -43,8 +49,8 @@ url: sql-performance-tuning.html#caching-data-in-memory - text: Other Configuration Options url: sql-performance-tuning.html#other-configuration-options - - text: Broadcast Hint for SQL Queries - url: sql-performance-tuning.html#broadcast-hint-for-sql-queries + - text: Join Strategy Hints for SQL Queries + url: sql-performance-tuning.html#join-strategy-hints-for-sql-queries - text: Distributed SQL Engine url: sql-distributed-sql-engine.html subitems: @@ -64,22 +70,25 @@ - text: Usage Notes url: sql-pyspark-pandas-with-arrow.html#usage-notes - text: Migration Guide - url: sql-migration-guide.html - subitems: - - text: Spark SQL Upgrading Guide - url: sql-migration-guide-upgrade.html - - text: Compatibility with Apache Hive - url: sql-migration-guide-hive-compatibility.html - - text: SQL Reserved/Non-Reserved Keywords - url: sql-reserved-and-non-reserved-keywords.html - + url: sql-migration-old.html - text: SQL Reference url: sql-ref.html subitems: - text: Data Types url: sql-ref-datatypes.html + - text: Null Semantics + url: sql-ref-null-semantics.html - text: NaN Semantics url: sql-ref-nan-semantics.html + - text: ANSI Compliance + url: sql-ref-ansi-compliance.html + subitems: + - text: Arithmetic Operations + url: sql-ref-ansi-compliance.html#arithmetic-operations + - text: Type Conversion + url: sql-ref-ansi-compliance.html#type-conversion + - text: SQL Keywords + url: sql-ref-ansi-compliance.html#sql-keywords - text: SQL Syntax url: sql-ref-syntax.html subitems: @@ -125,43 +134,35 @@ - text: SELECT url: sql-ref-syntax-qry-select.html subitems: - - text: DISTINCT Clause - url: sql-ref-syntax-qry-select-distinct.html - - text: Joins - url: sql-ref-syntax-qry-select-join.html - - text: ORDER BY Clause - url: sql-ref-syntax-qry-select-orderby.html + - text: WHERE Clause + url: 
sql-ref-syntax-qry-select-where.html - text: GROUP BY Clause url: sql-ref-syntax-qry-select-groupby.html - text: HAVING Clause url: sql-ref-syntax-qry-select-having.html + - text: ORDER BY Clause + url: sql-ref-syntax-qry-select-orderby.html + - text: SORT BY Clause + url: sql-ref-syntax-qry-select-sortby.html + - text: CLUSTER BY Clause + url: sql-ref-syntax-qry-select-clusterby.html + - text: DISTRIBUTE BY Clause + url: sql-ref-syntax-qry-select-distribute-by.html - text: LIMIT Clause url: sql-ref-syntax-qry-select-limit.html - - text: Set operations - url: sql-ref-syntax-qry-select-setops.html - - text: Common Table Expression(CTE) - url: sql-ref-syntax-qry-select-cte.html - - text: Subqueries - url: sql-ref-syntax-qry-select-subqueries.html - - text: Query hints - url: sql-ref-syntax-qry-select-hints.html - - text: SAMPLING - url: sql-ref-syntax-qry-sampling.html - - text: WINDOWING ANALYTIC FUNCTIONS - url: sql-ref-syntax-qry-window.html - - text: AGGREGATION (CUBE/ROLLUP/GROUPING) - url: sql-ref-syntax-qry-aggregation.html + - text: USE database + url: sql-ref-syntax-qry-select-usedb.html - text: EXPLAIN url: sql-ref-syntax-qry-explain.html - - text: Auxilarry Statements + - text: Auxiliary Statements url: sql-ref-syntax-aux.html subitems: - - text: Analyze statement + - text: ANALYZE url: sql-ref-syntax-aux-analyze.html subitems: - text: ANALYZE TABLE url: sql-ref-syntax-aux-analyze-table.html - - text: Caching statements + - text: CACHE url: sql-ref-syntax-aux-cache.html subitems: - text: CACHE TABLE @@ -170,7 +171,11 @@ url: sql-ref-syntax-aux-cache-uncache-table.html - text: CLEAR CACHE url: sql-ref-syntax-aux-cache-clear-cache.html - - text: Describe Commands + - text: REFRESH TABLE + url: sql-ref-syntax-aux-refresh-table.html + - text: REFRESH + url: sql-ref-syntax-aux-cache-refresh.md + - text: DESCRIBE url: sql-ref-syntax-aux-describe.html subitems: - text: DESCRIBE DATABASE @@ -181,7 +186,7 @@ url: sql-ref-syntax-aux-describe-function.html - text: 
DESCRIBE QUERY url: sql-ref-syntax-aux-describe-query.html - - text: Show commands + - text: SHOW url: sql-ref-syntax-aux-show.html subitems: - text: SHOW COLUMNS @@ -200,36 +205,21 @@ url: sql-ref-syntax-aux-show-partitions.html - text: SHOW CREATE TABLE url: sql-ref-syntax-aux-show-create-table.html - - text: Configuration Management Commands + - text: CONFIGURATION MANAGEMENT url: sql-ref-syntax-aux-conf-mgmt.html subitems: - text: SET url: sql-ref-syntax-aux-conf-mgmt-set.html - text: RESET url: sql-ref-syntax-aux-conf-mgmt-reset.html - - text: Resource Management Commands + - text: RESOURCE MANAGEMENT url: sql-ref-syntax-aux-resource-mgmt.html subitems: - text: ADD FILE url: sql-ref-syntax-aux-resource-mgmt-add-file.html - text: ADD JAR url: sql-ref-syntax-aux-resource-mgmt-add-jar.html - - text: Functions - url: sql-ref-functions.html - subitems: - - text: Builtin Functions - url: sql-ref-functions-builtin.html - subitems: - - text: Scalar functions - url: sql-ref-functions-builtin-scalar.html - - text: Aggregate functions - url: sql-ref-functions-builtin-aggregate.html - - text: User defined Functions - url: sql-ref-functions-udf.html - subitems: - - text: Scalar functions - url: sql-ref-functions-udf-scalar.html - - text: Aggregate functions - url: sql-ref-functions-udf-aggregate.html - - text: Arthmetic operations - url: sql-ref-arithmetic-ops.html + - text: LIST FILE + url: sql-ref-syntax-aux-resource-mgmt-list-file.html + - text: LIST JAR + url: sql-ref-syntax-aux-resource-mgmt-list-jar.html diff --git a/docs/_includes/nav-left-wrapper-migration.html b/docs/_includes/nav-left-wrapper-migration.html new file mode 100644 index 0000000000000..4318a324a9475 --- /dev/null +++ b/docs/_includes/nav-left-wrapper-migration.html @@ -0,0 +1,6 @@ +
    +
    +

    Migration Guide

    + {% include nav-left.html nav=include.nav-migration %} +
    +
    diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 8ea15dc71d541..d05ac6bbe129d 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -82,7 +82,7 @@
  • `mergeSchema` (default is the value specified in `spark.sql.parquet.mergeSchema`): sets * whether we should merge schemas collected from all Parquet part-files. This will override * `spark.sql.parquet.mergeSchema`.
  • + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * + * * @since 1.4.0 */ @scala.annotation.varargs @@ -688,6 +710,18 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { /** * Loads ORC files and returns the result as a `DataFrame`. * + * You can set the following ORC-specific option(s) for reading ORC files: + *
      + *
    • `mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether + * we should merge schemas collected from all ORC part-files. This will override + * `spark.sql.orc.mergeSchema`.
    • + *
    • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
    • + *
    • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
    • + *
    + * * @param paths input paths * @since 2.0.0 */ @@ -736,6 +770,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * *
  • `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator * that should be used for parsing.
  • + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * * * @param paths input paths @@ -771,13 +810,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * spark.read().textFile("/path/to/spark/README.md") * }}} * - * You can set the following textFile-specific option(s) for reading text files: - *
      - *
    • `wholetext` (default `false`): If true, read a file as a single row and not split by "\n". - *
    • - *
    • `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator - * that should be used for parsing.
    • - *
    + * You can set the text-specific options as specified in `DataFrameReader.text`. * * @param paths input path * @since 2.0.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index c782e5012d8d7..fff1f4b636dea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -17,28 +17,26 @@ package org.apache.spark.sql -import java.util.{Locale, Properties, UUID} +import java.util.{Locale, Properties} import scala.collection.JavaConverters._ import org.apache.spark.annotation.Stable -import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier, TableCatalog} -import org.apache.spark.sql.catalog.v2.expressions._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, NoSuchTableException, UnresolvedRelation} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions.Literal -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, InsertIntoTable, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, InsertIntoStatement, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.{CatalogPlugin, CatalogV2Implicits, CatalogV2Util, Identifier, SupportsCatalogOptions, SupportsWrite, Table, TableCatalog, TableProvider, V1Table} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, LiteralValue, Transform} import org.apache.spark.sql.execution.SQLExecution import 
org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, DataSourceUtils, LogicalRelation} import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.sources.BaseRelation -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.TableCapability._ -import org.apache.spark.sql.sources.v2.internal.V1Table import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -68,7 +66,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * @since 1.4.0 */ def mode(saveMode: SaveMode): DataFrameWriter[T] = { - this.mode = Some(saveMode) + this.mode = saveMode this } @@ -88,10 +86,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { case "overwrite" => mode(SaveMode.Overwrite) case "append" => mode(SaveMode.Append) case "ignore" => mode(SaveMode.Ignore) - case "error" | "errorifexists" => mode(SaveMode.ErrorIfExists) - case "default" => this - case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. " + - "Accepted save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists'.") + case "error" | "errorifexists" | "default" => mode(SaveMode.ErrorIfExists) + case _ => throw new IllegalArgumentException(s"Unknown save mode: $saveMode. 
Accepted " + + "save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists', 'default'.") } } @@ -254,44 +251,98 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val maybeV2Provider = lookupV2Provider() if (maybeV2Provider.isDefined) { - if (partitioningColumns.nonEmpty) { - throw new AnalysisException( - "Cannot write data to TableProvider implementation if partition columns are specified.") - } - val provider = maybeV2Provider.get val sessionOptions = DataSourceV2Utils.extractSessionConfigs( provider, df.sparkSession.sessionState.conf) val options = sessionOptions ++ extraOptions val dsOptions = new CaseInsensitiveStringMap(options.asJava) + def getTable: Table = { + // For file source, it's expensive to infer schema/partition at each write. Here we pass + // the schema of input query and the user-specified partitioning to `getTable`. If the + // query schema is not compatible with the existing data, the write can still success but + // following reads would fail. 
+ if (provider.isInstanceOf[FileDataSourceV2]) { + provider.getTable( + df.schema.asNullable, + partitioningAsV2.toArray, + dsOptions.asCaseSensitiveMap()) + } else { + DataSourceV2Utils.getTableFromProvider(provider, dsOptions, userSpecifiedSchema = None) + } + } + import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ - provider.getTable(dsOptions) match { - case table: SupportsWrite if table.supports(BATCH_WRITE) => - lazy val relation = DataSourceV2Relation.create(table, dsOptions) - modeForDSV2 match { - case SaveMode.Append => - runCommand(df.sparkSession, "save") { - AppendData.byName(relation, df.logicalPlan) + val catalogManager = df.sparkSession.sessionState.catalogManager + mode match { + case SaveMode.Append | SaveMode.Overwrite => + val (table, catalog, ident) = provider match { + case supportsExtract: SupportsCatalogOptions => + val ident = supportsExtract.extractIdentifier(dsOptions) + val catalog = CatalogV2Util.getTableProviderCatalog( + supportsExtract, catalogManager, dsOptions) + + (catalog.loadTable(ident), Some(catalog), Some(ident)) + case _: TableProvider => + val t = getTable + if (t.supports(BATCH_WRITE)) { + (t, None, None) + } else { + // Streaming also uses the data source V2 API. So it may be that the data source + // implements v2, but has no v2 implementation for batch writes. In that case, we + // fall back to saving as though it's a V1 source. + return saveToV1Source() } + } + + val relation = DataSourceV2Relation.create(table, catalog, ident, dsOptions) + checkPartitioningMatchesV2Table(table) + if (mode == SaveMode.Append) { + runCommand(df.sparkSession, "save") { + AppendData.byName(relation, df.logicalPlan, extraOptions.toMap) + } + } else { + // Truncate the table. 
TableCapabilityCheck will throw a nice exception if this + // isn't supported + runCommand(df.sparkSession, "save") { + OverwriteByExpression.byName( + relation, df.logicalPlan, Literal(true), extraOptions.toMap) + } + } + + case createMode => + provider match { + case supportsExtract: SupportsCatalogOptions => + val ident = supportsExtract.extractIdentifier(dsOptions) + val catalog = CatalogV2Util.getTableProviderCatalog( + supportsExtract, catalogManager, dsOptions) + + val location = Option(dsOptions.get("path")).map(TableCatalog.PROP_LOCATION -> _) - case SaveMode.Overwrite if table.supportsAny(TRUNCATE, OVERWRITE_BY_FILTER) => - // truncate the table runCommand(df.sparkSession, "save") { - OverwriteByExpression.byName(relation, df.logicalPlan, Literal(true)) + CreateTableAsSelect( + catalog, + ident, + partitioningAsV2, + df.queryExecution.analyzed, + Map(TableCatalog.PROP_PROVIDER -> source) ++ location, + extraOptions.toMap, + ignoreIfExists = createMode == SaveMode.Ignore) + } + case _: TableProvider => + if (getTable.supports(BATCH_WRITE)) { + throw new AnalysisException(s"TableProvider implementation $source cannot be " + + s"written with $createMode mode, please use Append or Overwrite " + + "modes instead.") + } else { + // Streaming also uses the data source V2 API. So it may be that the data source + // implements v2, but has no v2 implementation for batch writes. In that case, we + // fallback to saving as though it's a V1 source. + saveToV1Source() } - - case other => - throw new AnalysisException(s"TableProvider implementation $source cannot be " + - s"written with $other mode, please use Append or Overwrite " + - "modes instead.") } - - // Streaming also uses the data source V2 API. So it may be that the data source implements - // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving - // as though it's a V1 source. 
- case _ => saveToV1Source() } + } else { saveToV1Source() } @@ -309,7 +360,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { sparkSession = df.sparkSession, className = source, partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(modeForDSV1, df.logicalPlan) + options = extraOptions.toMap).planForWriting(mode, df.logicalPlan) } } @@ -320,6 +371,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * @note Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based * resolution. For example: * + * @note SaveMode.ErrorIfExists and SaveMode.Ignore behave as SaveMode.Append in `insertInto` as + * `insertInto` is not a table creating operation. + * * {{{ * scala> Seq((1, 2)).toDF("i", "j").write.mode("overwrite").saveAsTable("t1") * scala> Seq((3, 4)).toDF("j", "i").write.insertInto("t1") @@ -339,8 +393,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * @since 1.4.0 */ def insertInto(tableName: String): Unit = { - import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, CatalogObjectIdentifier} - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, NonSessionCatalogAndIdentifier, SessionCatalogAndIdentifier} + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Util._ assertNotBucketed("insertInto") @@ -354,15 +409,14 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val session = df.sparkSession val canUseV2 = lookupV2Provider().isDefined - val sessionCatalogOpt = session.sessionState.analyzer.sessionCatalog session.sessionState.sqlParser.parseMultipartIdentifier(tableName) match { - case CatalogObjectIdentifier(Some(catalog), ident) => + case NonSessionCatalogAndIdentifier(catalog, ident) => insertInto(catalog, ident) - case CatalogObjectIdentifier(None, ident) - if canUseV2 && 
sessionCatalogOpt.isDefined && ident.namespace().length <= 1 => - insertInto(sessionCatalogOpt.get, ident) + case SessionCatalogAndIdentifier(catalog, ident) + if canUseV2 && ident.namespace().length <= 1 => + insertInto(catalog, ident) case AsTableIdentifier(tableIdentifier) => insertInto(tableIdentifier) @@ -373,18 +427,18 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } private def insertInto(catalog: CatalogPlugin, ident: Identifier): Unit = { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ val table = catalog.asTableCatalog.loadTable(ident) match { case _: V1Table => return insertInto(TableIdentifier(ident.name(), ident.namespace().headOption)) case t => - DataSourceV2Relation.create(t) + DataSourceV2Relation.create(t, Some(catalog), Some(ident)) } - val command = modeForDSV2 match { - case SaveMode.Append => - AppendData.byPosition(table, df.logicalPlan) + val command = mode match { + case SaveMode.Append | SaveMode.ErrorIfExists | SaveMode.Ignore => + AppendData.byPosition(table, df.logicalPlan, extraOptions.toMap) case SaveMode.Overwrite => val conf = df.sparkSession.sessionState.conf @@ -392,14 +446,10 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { conf.partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC if (dynamicPartitionOverwrite) { - OverwritePartitionsDynamic.byPosition(table, df.logicalPlan) + OverwritePartitionsDynamic.byPosition(table, df.logicalPlan, extraOptions.toMap) } else { - OverwriteByExpression.byPosition(table, df.logicalPlan, Literal(true)) + OverwriteByExpression.byPosition(table, df.logicalPlan, Literal(true), extraOptions.toMap) } - - case other => - throw new AnalysisException(s"insertInto does not support $other mode, " + - s"please use Append or Overwrite mode instead.") } runCommand(df.sparkSession, "insertInto") { @@ -409,11 +459,11 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { private 
def insertInto(tableIdent: TableIdentifier): Unit = { runCommand(df.sparkSession, "insertInto") { - InsertIntoTable( + InsertIntoStatement( table = UnresolvedRelation(tableIdent), - partition = Map.empty[String, Option[String]], + partitionSpec = Map.empty[String, Option[String]], query = df.logicalPlan, - overwrite = modeForDSV1 == SaveMode.Overwrite, + overwrite = mode == SaveMode.Overwrite, ifPartitionNotExists = false) } } @@ -483,22 +533,19 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { * @since 1.4.0 */ def saveAsTable(tableName: String): Unit = { - import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, CatalogObjectIdentifier} - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import df.sparkSession.sessionState.analyzer.{AsTableIdentifier, NonSessionCatalogAndIdentifier, SessionCatalogAndIdentifier} + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ val session = df.sparkSession val canUseV2 = lookupV2Provider().isDefined - val sessionCatalogOpt = session.sessionState.analyzer.sessionCatalog session.sessionState.sqlParser.parseMultipartIdentifier(tableName) match { - case CatalogObjectIdentifier(Some(catalog), ident) => - saveAsTable(catalog.asTableCatalog, ident, modeForDSV2) + case NonSessionCatalogAndIdentifier(catalog, ident) => + saveAsTable(catalog.asTableCatalog, ident) - case CatalogObjectIdentifier(None, ident) - if canUseV2 && sessionCatalogOpt.isDefined && ident.namespace().length <= 1 => - // We pass in the modeForDSV1, as using the V2 session catalog should maintain compatibility - // for now. 
- saveAsTable(sessionCatalogOpt.get.asTableCatalog, ident, modeForDSV1) + case SessionCatalogAndIdentifier(catalog, ident) + if canUseV2 && ident.namespace().length <= 1 => + saveAsTable(catalog.asTableCatalog, ident) case AsTableIdentifier(tableIdentifier) => saveAsTable(tableIdentifier) @@ -510,38 +557,32 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } - private def saveAsTable(catalog: TableCatalog, ident: Identifier, mode: SaveMode): Unit = { - val partitioning = partitioningColumns.map { colNames => - colNames.map(name => IdentityTransform(FieldReference(name))) - }.getOrElse(Seq.empty[Transform]) - val bucketing = bucketColumnNames.map { cols => - Seq(BucketTransform(LiteralValue(numBuckets.get, IntegerType), cols.map(FieldReference(_)))) - }.getOrElse(Seq.empty[Transform]) - val partitionTransforms = partitioning ++ bucketing - + private def saveAsTable(catalog: TableCatalog, ident: Identifier): Unit = { val tableOpt = try Option(catalog.loadTable(ident)) catch { case _: NoSuchTableException => None } def getLocationIfExists: Option[(String, String)] = { val opts = CaseInsensitiveMap(extraOptions.toMap) - opts.get("path").map("location" -> _) + opts.get("path").map(TableCatalog.PROP_LOCATION -> _) } val command = (mode, tableOpt) match { - case (_, Some(table: V1Table)) => + case (_, Some(_: V1Table)) => return saveAsTable(TableIdentifier(ident.name(), ident.namespace().headOption)) case (SaveMode.Append, Some(table)) => - AppendData.byName(DataSourceV2Relation.create(table), df.logicalPlan) + checkPartitioningMatchesV2Table(table) + val v2Relation = DataSourceV2Relation.create(table, Some(catalog), Some(ident)) + AppendData.byName(v2Relation, df.logicalPlan, extraOptions.toMap) case (SaveMode.Overwrite, _) => ReplaceTableAsSelect( catalog, ident, - partitionTransforms, + partitioningAsV2, df.queryExecution.analyzed, - Map("provider" -> source) ++ getLocationIfExists, + Map(TableCatalog.PROP_PROVIDER -> source) ++ getLocationIfExists, 
extraOptions.toMap, orCreate = true) // Create the table if it doesn't exist @@ -552,9 +593,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { CreateTableAsSelect( catalog, ident, - partitionTransforms, + partitioningAsV2, df.queryExecution.analyzed, - Map("provider" -> source) ++ getLocationIfExists, + Map(TableCatalog.PROP_PROVIDER -> source) ++ getLocationIfExists, extraOptions.toMap, ignoreIfExists = other == SaveMode.Ignore) } @@ -571,7 +612,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val tableIdentWithDB = tableIdent.copy(database = Some(db)) val tableName = tableIdentWithDB.unquotedString - (tableExists, modeForDSV1) match { + (tableExists, mode) match { case (true, SaveMode.Ignore) => // Do nothing @@ -627,7 +668,30 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { bucketSpec = getBucketSpec) runCommand(df.sparkSession, "saveAsTable")( - CreateTable(tableDesc, modeForDSV1, Some(df.logicalPlan))) + CreateTable(tableDesc, mode, Some(df.logicalPlan))) + } + + /** Converts the provided partitioning and bucketing information to DataSourceV2 Transforms. */ + private def partitioningAsV2: Seq[Transform] = { + val partitioning = partitioningColumns.map { colNames => + colNames.map(name => IdentityTransform(FieldReference(name))) + }.getOrElse(Seq.empty[Transform]) + val bucketing = + getBucketSpec.map(spec => CatalogV2Implicits.BucketSpecHelper(spec).asTransform).toSeq + partitioning ++ bucketing + } + + /** + * For V2 DataSources, performs if the provided partitioning matches that of the table. + * Partitioning information is not required when appending data to V2 tables. 
+ */ + private def checkPartitioningMatchesV2Table(existingTable: Table): Unit = { + val v2Partitions = partitioningAsV2 + if (v2Partitions.isEmpty) return + require(v2Partitions.sameElements(existingTable.partitioning()), + "The provided partitioning does not match of the table.\n" + + s" - provided: ${v2Partitions.mkString(", ")}\n" + + s" - table: ${existingTable.partitioning().mkString(", ")}") } /** @@ -693,6 +757,8 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { *
  • `encoding` (by default it is not set): specifies encoding (charset) of saved json * files. If it is not set, the UTF-8 charset will be used.
  • *
  • `lineSep` (default `\n`): defines the line separator that should be used for writing.
  • + *
  • `ignoreNullFields` (default `true`): Whether to ignore null fields + * when generating JSON objects.
  • * * * @since 1.4.0 @@ -830,13 +896,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { private def runCommand(session: SparkSession, name: String)(command: LogicalPlan): Unit = { val qe = session.sessionState.executePlan(command) // call `QueryExecution.toRDD` to trigger the execution of commands. - SQLExecution.withNewExecutionId(session, qe, Some(name))(qe.toRdd) + SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd) } - private def modeForDSV1 = mode.getOrElse(SaveMode.ErrorIfExists) - - private def modeForDSV2 = mode.getOrElse(SaveMode.Append) - private def lookupV2Provider(): Option[TableProvider] = { DataSource.lookupDataSourceV2(source, df.sparkSession.sessionState.conf) match { // TODO(SPARK-28396): File source v2 write path is currently broken. @@ -851,7 +913,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { private var source: String = df.sparkSession.sessionState.conf.defaultDataSourceName - private var mode: Option[SaveMode] = None + private var mode: SaveMode = SaveMode.ErrorIfExists private val extraOptions = new scala.collection.mutable.HashMap[String, String] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala new file mode 100644 index 0000000000000..cf6bde5a2bcb9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala @@ -0,0 +1,370 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Bucket, Days, Hours, Literal, Months, Years} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect} +import org.apache.spark.sql.connector.catalog.TableCatalog +import org.apache.spark.sql.connector.expressions.{LogicalExpressions, NamedReference, Transform} +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.types.IntegerType + +/** + * Interface used to write a [[org.apache.spark.sql.Dataset]] to external storage using the v2 API. 
+ * + * @since 3.0.0 + */ +@Experimental +final class DataFrameWriterV2[T] private[sql](table: String, ds: Dataset[T]) + extends CreateTableWriter[T] { + + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Util._ + import df.sparkSession.sessionState.analyzer.CatalogAndIdentifier + + private val df: DataFrame = ds.toDF() + + private val sparkSession = ds.sparkSession + + private val catalogManager = sparkSession.sessionState.analyzer.catalogManager + + private val tableName = sparkSession.sessionState.sqlParser.parseMultipartIdentifier(table) + + private val (catalog, identifier) = { + val CatalogAndIdentifier(catalog, identifier) = tableName + (catalog.asTableCatalog, identifier) + } + + private val logicalPlan = df.queryExecution.logical + + private var provider: Option[String] = None + + private val options = new mutable.HashMap[String, String]() + + private val properties = new mutable.HashMap[String, String]() + + private var partitioning: Option[Seq[Transform]] = None + + override def using(provider: String): CreateTableWriter[T] = { + this.provider = Some(provider) + this + } + + override def option(key: String, value: String): DataFrameWriterV2[T] = { + this.options.put(key, value) + this + } + + override def options(options: scala.collection.Map[String, String]): DataFrameWriterV2[T] = { + options.foreach { + case (key, value) => + this.options.put(key, value) + } + this + } + + override def options(options: java.util.Map[String, String]): DataFrameWriterV2[T] = { + this.options(options.asScala) + this + } + + override def tableProperty(property: String, value: String): CreateTableWriter[T] = { + this.properties.put(property, value) + this + } + + @scala.annotation.varargs + override def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T] = { + def ref(name: String): NamedReference = LogicalExpressions.parseReference(name) + + val asTransforms = (column +: 
columns).map(_.expr).map { + case Years(attr: Attribute) => + LogicalExpressions.years(ref(attr.name)) + case Months(attr: Attribute) => + LogicalExpressions.months(ref(attr.name)) + case Days(attr: Attribute) => + LogicalExpressions.days(ref(attr.name)) + case Hours(attr: Attribute) => + LogicalExpressions.hours(ref(attr.name)) + case Bucket(Literal(numBuckets: Int, IntegerType), attr: Attribute) => + LogicalExpressions.bucket(numBuckets, Array(ref(attr.name))) + case attr: Attribute => + LogicalExpressions.identity(ref(attr.name)) + case expr => + throw new AnalysisException(s"Invalid partition transformation: ${expr.sql}") + } + + this.partitioning = Some(asTransforms) + this + } + + override def create(): Unit = { + // create and replace could alternatively create ParsedPlan statements, like + // `CreateTableFromDataFrameStatement(UnresolvedRelation(tableName), ...)`, to keep the catalog + // resolution logic in the analyzer. + runCommand("create") { + CreateTableAsSelect( + catalog, + identifier, + partitioning.getOrElse(Seq.empty), + logicalPlan, + properties = provider.map(p => properties + (TableCatalog.PROP_PROVIDER -> p)) + .getOrElse(properties).toMap, + writeOptions = options.toMap, + ignoreIfExists = false) + } + } + + override def replace(): Unit = { + internalReplace(orCreate = false) + } + + override def createOrReplace(): Unit = { + internalReplace(orCreate = true) + } + + + /** + * Append the contents of the data frame to the output table. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. The data frame will be + * validated to ensure it is compatible with the existing table. 
+ * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException If the table does not exist + */ + @throws(classOf[NoSuchTableException]) + def append(): Unit = { + val append = loadTable(catalog, identifier) match { + case Some(t) => + AppendData.byName( + DataSourceV2Relation.create(t, Some(catalog), Some(identifier)), + logicalPlan, options.toMap) + case _ => + throw new NoSuchTableException(identifier) + } + + runCommand("append")(append) + } + + /** + * Overwrite rows matching the given filter condition with the contents of the data frame in + * the output table. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. + * The data frame will be validated to ensure it is compatible with the existing table. + * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException If the table does not exist + */ + @throws(classOf[NoSuchTableException]) + def overwrite(condition: Column): Unit = { + val overwrite = loadTable(catalog, identifier) match { + case Some(t) => + OverwriteByExpression.byName( + DataSourceV2Relation.create(t, Some(catalog), Some(identifier)), + logicalPlan, condition.expr, options.toMap) + case _ => + throw new NoSuchTableException(identifier) + } + + runCommand("overwrite")(overwrite) + } + + /** + * Overwrite all partition for which the data frame contains at least one row with the contents + * of the data frame in the output table. + * + * This operation is equivalent to Hive's `INSERT OVERWRITE ... PARTITION`, which replaces + * partitions dynamically depending on the contents of the data frame. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. The data frame will be + * validated to ensure it is compatible with the existing table. 
+ * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException If the table does not exist + */ + @throws(classOf[NoSuchTableException]) + def overwritePartitions(): Unit = { + val dynamicOverwrite = loadTable(catalog, identifier) match { + case Some(t) => + OverwritePartitionsDynamic.byName( + DataSourceV2Relation.create(t, Some(catalog), Some(identifier)), + logicalPlan, options.toMap) + case _ => + throw new NoSuchTableException(identifier) + } + + runCommand("overwritePartitions")(dynamicOverwrite) + } + + /** + * Wrap an action to track the QueryExecution and time cost, then report to the user-registered + * callback functions. + */ + private def runCommand(name: String)(command: LogicalPlan): Unit = { + val qe = sparkSession.sessionState.executePlan(command) + // call `QueryExecution.toRDD` to trigger the execution of commands. + SQLExecution.withNewExecutionId(qe, Some(name))(qe.toRdd) + } + + private def internalReplace(orCreate: Boolean): Unit = { + runCommand("replace") { + ReplaceTableAsSelect( + catalog, + identifier, + partitioning.getOrElse(Seq.empty), + logicalPlan, + properties = provider.map(p => properties + ("provider" -> p)).getOrElse(properties).toMap, + writeOptions = options.toMap, + orCreate = orCreate) + } + } +} + +/** + * Configuration methods common to create/replace operations and insert/overwrite operations. + * @tparam R builder type to return + */ +trait WriteConfigMethods[R] { + /** + * Add a write option. + * + * @since 3.0.0 + */ + def option(key: String, value: String): R + + /** + * Add a boolean output option. + * + * @since 3.0.0 + */ + def option(key: String, value: Boolean): R = option(key, value.toString) + + /** + * Add a long output option. + * + * @since 3.0.0 + */ + def option(key: String, value: Long): R = option(key, value.toString) + + /** + * Add a double output option. 
+ * + * @since 3.0.0 + */ + def option(key: String, value: Double): R = option(key, value.toString) + + /** + * Add write options from a Scala Map. + * + * @since 3.0.0 + */ + def options(options: scala.collection.Map[String, String]): R + + /** + * Add write options from a Java Map. + * + * @since 3.0.0 + */ + def options(options: java.util.Map[String, String]): R +} + +/** + * Trait to restrict calls to create and replace operations. + */ +trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] { + /** + * Create a new table from the contents of the data frame. + * + * The new table's schema, partition layout, properties, and other configuration will be + * based on the configuration set on this writer. + * + * If the output table exists, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException]]. + * + * @throws org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException + * If the table already exists + */ + @throws(classOf[TableAlreadyExistsException]) + def create(): Unit + + /** + * Replace an existing table with the contents of the data frame. + * + * The existing table's schema, partition layout, properties, and other configuration will be + * replaced with the contents of the data frame and the configuration set on this writer. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException]]. + * + * @throws org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException + * If the table already exists + */ + @throws(classOf[CannotReplaceMissingTableException]) + def replace(): Unit + + /** + * Create a new table or replace an existing table with the contents of the data frame. + * + * The output table's schema, partition layout, properties, and other configuration will be based + * on the contents of the data frame and the configuration set on this writer. 
If the table + * exists, its configuration and data will be replaced. + */ + def createOrReplace(): Unit + + /** + * Partition the output table created by `create`, `createOrReplace`, or `replace` using + * the given columns or transforms. + * + * When specified, the table data will be stored by these values for efficient reads. + * + * For example, when a table is partitioned by day, it may be stored in a directory layout like: + *
+ * <ul>
+ * <li>`table/day=2019-06-01/`</li>
+ * <li>`table/day=2019-06-02/`</li>
+ * </ul>
    + * + * Partitioning is one of the most widely used techniques to optimize physical data layout. + * It provides a coarse-grained index for skipping unnecessary data reads when queries have + * predicates on the partitioned columns. In order for partitioning to work well, the number + * of distinct values in each column should typically be less than tens of thousands. + * + * @since 3.0.0 + */ + def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T] + + /** + * Specifies a provider for the underlying output data source. Spark's default catalog supports + * "parquet", "json", etc. + * + * @since 3.0.0 + */ + def using(provider: String): CreateTableWriter[T] + + /** + * Add a table property. + */ + def tableProperty(property: String, value: String): CreateTableWriter[T] +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 88fe7a3f380ab..42f35354e864f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -46,11 +46,12 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, PartitioningCollection} import org.apache.spark.sql.catalyst.trees.TreeNodeTag +import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.arrow.{ArrowBatchStreamWriter, ArrowConverters} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileTable} import org.apache.spark.sql.execution.python.EvaluatePython import org.apache.spark.sql.execution.stat.StatFunctions import 
org.apache.spark.sql.internal.SQLConf @@ -59,7 +60,7 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.unsafe.array.ByteArrayMethods -import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} import org.apache.spark.util.Utils private[sql] object Dataset { @@ -81,18 +82,19 @@ private[sql] object Dataset { dataset } - def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = { - val qe = sparkSession.sessionState.executePlan(logicalPlan) - qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan): DataFrame = + sparkSession.withActive { + val qe = sparkSession.sessionState.executePlan(logicalPlan) + qe.assertAnalyzed() + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } /** A variant of ofRows that allows passing in a tracker so we can track query parsing time. 
*/ def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan, tracker: QueryPlanningTracker) - : DataFrame = { + : DataFrame = sparkSession.withActive { val qe = new QueryExecution(sparkSession, logicalPlan, tracker) qe.assertAnalyzed() - new Dataset[Row](sparkSession, qe, RowEncoder(qe.analyzed.schema)) + new Dataset[Row](qe, RowEncoder(qe.analyzed.schema)) } } @@ -184,13 +186,12 @@ private[sql] object Dataset { */ @Stable class Dataset[T] private[sql]( - @transient private val _sparkSession: SparkSession, @DeveloperApi @Unstable @transient val queryExecution: QueryExecution, @DeveloperApi @Unstable @transient val encoder: Encoder[T]) extends Serializable { @transient lazy val sparkSession: SparkSession = { - if (_sparkSession == null) { + if (queryExecution == null || queryExecution.sparkSession == null) { throw new SparkException( "Dataset transformations and actions can only be invoked by the driver, not inside of" + " other Dataset transformations; for example, dataset1.map(x => dataset2.values.count()" + @@ -198,7 +199,7 @@ class Dataset[T] private[sql]( "performed inside of the dataset1.map transformation. For more information," + " see SPARK-28702.") } - _sparkSession + queryExecution.sparkSession } // A globally unique id of this Dataset. @@ -210,7 +211,7 @@ class Dataset[T] private[sql]( // you wrap it with `withNewExecutionId` if this actions doesn't call other action. 
def this(sparkSession: SparkSession, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { - this(sparkSession, sparkSession.sessionState.executePlan(logicalPlan), encoder) + this(sparkSession.sessionState.executePlan(logicalPlan), encoder) } def this(sqlContext: SQLContext, logicalPlan: LogicalPlan, encoder: Encoder[T]) = { @@ -228,7 +229,7 @@ class Dataset[T] private[sql]( case _ => queryExecution.analyzed } - if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) { + if (sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) { plan.setTagValue(Dataset.DATASET_ID_TAG, id) } plan @@ -254,10 +255,16 @@ class Dataset[T] private[sql]( @transient lazy val sqlContext: SQLContext = sparkSession.sqlContext private[sql] def resolve(colName: String): NamedExpression = { - queryExecution.analyzed.resolveQuoted(colName, sparkSession.sessionState.analyzer.resolver) + val resolver = sparkSession.sessionState.analyzer.resolver + queryExecution.analyzed.resolveQuoted(colName, resolver) .getOrElse { - throw new AnalysisException( - s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""") + val fields = schema.fieldNames + val extraMsg = if (fields.exists(resolver(_, colName))) { + s"; did you mean to quote the `$colName` column?" + } else "" + val fieldsStr = fields.mkString(", ") + val errorMsg = s"""Cannot resolve column name "$colName" among (${fieldsStr})${extraMsg}""" + throw new AnalysisException(errorMsg) } } @@ -438,17 +445,19 @@ class Dataset[T] private[sql]( */ // This is declared with parentheses to prevent the Scala compiler from treating // `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. - def toDF(): DataFrame = new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema)) + def toDF(): DataFrame = new Dataset[Row](queryExecution, RowEncoder(schema)) /** * Returns a new Dataset where each record has been mapped on to the specified type. 
The * method used to map columns depend on the type of `U`: - * - When `U` is a class, fields for the class will be mapped to columns of the same name - * (case sensitivity is determined by `spark.sql.caseSensitive`). - * - When `U` is a tuple, the columns will be mapped by ordinal (i.e. the first column will - * be assigned to `_1`). - * - When `U` is a primitive type (i.e. String, Int, etc), then the first column of the - * `DataFrame` will be used. + *
+ * <ul>
+ * <li>When `U` is a class, fields for the class will be mapped to columns of the same name
+ * (case sensitivity is determined by `spark.sql.caseSensitive`).</li>
+ * <li>When `U` is a tuple, the columns will be mapped by ordinal (i.e. the first column will
+ * be assigned to `_1`).</li>
+ * <li>When `U` is a primitive type (i.e. String, Int, etc), then the first column of the
+ * `DataFrame` will be used.</li>
+ * </ul>
    * * If the schema of the Dataset does not match the desired `U` type, you can use `select` * along with `alias` or `as` to rearrange or rename as required. @@ -494,7 +503,9 @@ class Dataset[T] private[sql]( * @group basic * @since 1.6.0 */ - def schema: StructType = queryExecution.analyzed.schema + def schema: StructType = sparkSession.withActive { + queryExecution.analyzed.schema + } /** * Prints the schema to the console in a nice tree format. @@ -515,36 +526,53 @@ class Dataset[T] private[sql]( // scalastyle:on println /** - * Prints the plans (logical and physical) to the console for debugging purposes. + * Prints the plans (logical and physical) with a format specified by a given explain mode. * + * @param mode specifies the expected output format of plans. + *
+ * <ul>
+ * <li>`simple` Print only a physical plan.</li>
+ * <li>`extended`: Print both logical and physical plans.</li>
+ * <li>`codegen`: Print a physical plan and generated codes if they are
+ * available.</li>
+ * <li>`cost`: Print a logical plan and statistics if they are available.</li>
+ * <li>`formatted`: Split explain output into two sections: a physical plan outline
+ * and node details.</li>
+ * </ul>
    * @group basic - * @since 1.6.0 + * @since 3.0.0 */ - def explain(extended: Boolean): Unit = { + def explain(mode: String): Unit = sparkSession.withActive { // Because temporary views are resolved during analysis when we create a Dataset, and // `ExplainCommand` analyzes input query plan and resolves temporary views again. Using // `ExplainCommand` here will probably output different query plans, compared to the results // of evaluation of the Dataset. So just output QueryExecution's query plans here. - val qe = ExplainCommandUtil.explainedQueryExecution(sparkSession, logicalPlan, queryExecution) - val outputString = - if (extended) { - qe.toString - } else { - qe.simpleString - } // scalastyle:off println - println(outputString) + println(queryExecution.explainString(ExplainMode.fromString(mode))) // scalastyle:on println } + /** + * Prints the plans (logical and physical) to the console for debugging purposes. + * + * @param extended default `false`. If `false`, prints only the physical plan. + * + * @group basic + * @since 1.6.0 + */ + def explain(extended: Boolean): Unit = if (extended) { + explain(ExtendedMode.name) + } else { + explain(SimpleMode.name) + } + /** * Prints the physical plan to the console for debugging purposes. * * @group basic * @since 1.6.0 */ - def explain(): Unit = explain(extended = false) + def explain(): Unit = explain(SimpleMode.name) /** * Returns all column names and their data types as an array. @@ -579,8 +607,8 @@ class Dataset[T] private[sql]( * @group basic * @since 2.4.0 */ - def isEmpty: Boolean = withAction("isEmpty", limit(1).groupBy().count().queryExecution) { plan => - plan.executeCollect().head.getLong(0) == 0 + def isEmpty: Boolean = withAction("isEmpty", select().queryExecution) { plan => + plan.executeTake(1).isEmpty } /** @@ -694,11 +722,12 @@ class Dataset[T] private[sql]( * before which we assume no more late data is going to arrive. 
* * Spark will use this watermark for several purposes: - * - To know when a given time window aggregation can be finalized and thus can be emitted when - * using output modes that do not allow updates. - * - To minimize the amount of state that we need to keep for on-going aggregations, - * `mapGroupsWithState` and `dropDuplicates` operators. - * + *
+ * <ul>
+ * <li>To know when a given time window aggregation can be finalized and thus can be emitted
+ * when using output modes that do not allow updates.</li>
+ * <li>To minimize the amount of state that we need to keep for on-going aggregations,
+ * `mapGroupsWithState` and `dropDuplicates` operators.</li>
+ * </ul>
    * The current watermark is computed by looking at the `MAX(eventTime)` seen across * all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost * of coordinating this value across partitions, the actual watermark used is only guaranteed @@ -718,14 +747,14 @@ class Dataset[T] private[sql]( def withWatermark(eventTime: String, delayThreshold: String): Dataset[T] = withTypedPlan { val parsedDelay = try { - CalendarInterval.fromCaseInsensitiveString(delayThreshold) + IntervalUtils.stringToInterval(UTF8String.fromString(delayThreshold)) } catch { case e: IllegalArgumentException => throw new AnalysisException( s"Unable to parse time delay '$delayThreshold'", cause = Some(e)) } - require(parsedDelay.milliseconds >= 0 && parsedDelay.months >= 0, + require(!IntervalUtils.isNegative(parsedDelay), s"delay threshold ($delayThreshold) should not be negative.") EliminateEventTimeWatermark( EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, logicalPlan)) @@ -1330,7 +1359,7 @@ class Dataset[T] private[sql]( private def addDataFrameIdToCol(expr: NamedExpression): NamedExpression = { val newExpr = expr transform { case a: AttributeReference - if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN) => + if sparkSession.sessionState.conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED) => val metadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(Dataset.DATASET_ID_KEY, id) @@ -1475,7 +1504,7 @@ class Dataset[T] private[sql]( val namedColumns = columns.map(_.withInputType(exprEnc, logicalPlan.output).named) val execution = new QueryExecution(sparkSession, Project(namedColumns, logicalPlan)) - new Dataset(sparkSession, execution, ExpressionEncoder.tuple(encoders)) + new Dataset(execution, ExpressionEncoder.tuple(encoders)) } /** @@ -1841,6 +1870,57 @@ class Dataset[T] private[sql]( @scala.annotation.varargs def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs : _*) + /** + * 
Define (named) metrics to observe on the Dataset. This method returns an 'observed' Dataset + * that returns the same result as the input, with the following guarantees: + *
+ * <ul>
+ * <li>It will compute the defined aggregates (metrics) on all the data that is flowing through
+ * the Dataset at that point.</li>
+ * <li>It will report the value of the defined aggregate columns as soon as we reach a completion
+ * point. A completion point is either the end of a query (batch mode) or the end of a streaming
+ * epoch. The value of the aggregates only reflects the data processed since the previous
+ * completion point.</li>
+ * </ul>
    + * Please note that continuous execution is currently not supported. + * + * The metrics columns must either contain a literal (e.g. lit(42)), or should contain one or + * more aggregate functions (e.g. sum(a) or sum(a + b) + avg(c) - lit(1)). Expressions that + * contain references to the input Dataset's columns must always be wrapped in an aggregate + * function. + * + * A user can observe these metrics by either adding + * [[org.apache.spark.sql.streaming.StreamingQueryListener]] or a + * [[org.apache.spark.sql.util.QueryExecutionListener]] to the spark session. + * + * {{{ + * // Monitor the metrics using a listener. + * spark.streams.addListener(new StreamingQueryListener() { + * override def onQueryProgress(event: QueryProgressEvent): Unit = { + * event.progress.observedMetrics.asScala.get("my_event").foreach { row => + * // Trigger if the number of errors exceeds 5 percent + * val num_rows = row.getAs[Long]("rc") + * val num_error_rows = row.getAs[Long]("erc") + * val ratio = num_error_rows.toDouble / num_rows + * if (ratio > 0.05) { + * // Trigger alert + * } + * } + * } + * def onQueryStarted(event: QueryStartedEvent): Unit = {} + * def onQueryTerminated(event: QueryTerminatedEvent): Unit = {} + * }) + * // Observe row count (rc) and error row count (erc) in the streaming Dataset + * val observed_ds = ds.observe("my_event", count(lit(1)).as("rc"), count($"error").as("erc")) + * observed_ds.writeStream.format("...").start() + * }}} + * + * @group typedrel + * @since 3.0.0 + */ + def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = withTypedPlan { + CollectMetrics(name, (expr +: exprs).map(_.named), logicalPlan) + } + /** * Returns a new Dataset by taking the first `n` rows. The difference between this function * and `head` is that `head` is an action and returns an array (by triggering query execution) @@ -2439,13 +2519,14 @@ class Dataset[T] private[sql]( /** * Computes specified statistics for numeric and string columns. 
Available statistics are: - * - * - count - * - mean - * - stddev - * - min - * - max - * - arbitrary approximate percentiles specified as a percentage (eg, 75%) + *
+ * <ul>
+ * <li>count</li>
+ * <li>mean</li>
+ * <li>stddev</li>
+ * <li>min</li>
+ * <li>max</li>
+ * <li>arbitrary approximate percentiles specified as a percentage (e.g. 75%)</li>
+ * </ul>
    * * If no statistics are given, this function computes count, mean, stddev, min, * approximate quartiles (percentiles at 25%, 50%, and 75%), and max. @@ -2715,6 +2796,18 @@ class Dataset[T] private[sql]( */ def take(n: Int): Array[T] = head(n) + /** + * Returns the last `n` rows in the Dataset. + * + * Running tail requires moving data into the application's driver process, and doing so with + * a very large `n` can crash the driver process with OutOfMemoryError. + * + * @group action + * @since 3.0.0 + */ + def tail(n: Int): Array[T] = withAction( + "tail", withTypedPlan(Tail(Literal(n), logicalPlan)).queryExecution)(collectFromPlan) + /** * Returns the first `n` rows in the Dataset as a list. * @@ -3120,6 +3213,34 @@ class Dataset[T] private[sql]( new DataFrameWriter[T](this) } + /** + * Create a write configuration builder for v2 sources. + * + * This builder is used to configure and execute write operations. For example, to append to an + * existing table, run: + * + * {{{ + * df.writeTo("catalog.db.table").append() + * }}} + * + * This can also be used to create or replace existing tables: + * + * {{{ + * df.writeTo("catalog.db.table").partitionedBy($"col").createOrReplace() + * }}} + * + * @group basic + * @since 3.0.0 + */ + def writeTo(table: String): DataFrameWriterV2[T] = { + // TODO: streaming could be adapted to use this interface + if (isStreaming) { + logicalPlan.failAnalysis( + "'writeTo' can not be called on streaming Dataset/DataFrame") + } + new DataFrameWriterV2[T](table, this) + } + /** * Interface for saving the content of the streaming Dataset out into external storage. 
* @@ -3183,7 +3304,7 @@ class Dataset[T] private[sql]( fr.inputFiles case r: HiveTableRelation => r.tableMeta.storage.locationUri.map(_.toString).toArray - case DataSourceV2Relation(table: FileTable, _, _) => + case DataSourceV2ScanRelation(table: FileTable, _, _) => table.fileIndex.inputFiles }.flatten files.toSet.toArray @@ -3212,6 +3333,16 @@ class Dataset[T] private[sql]( } } + private[sql] def tailToPython(n: Int): Array[Any] = { + EvaluatePython.registerPicklers() + withAction("tailToPython", queryExecution) { plan => + val toJava: (Any) => Any = EvaluatePython.toJava(_, schema) + val iter: Iterator[Array[Byte]] = new SerDeUtil.AutoBatchedPickler( + plan.executeTail(n).iterator.map(toJava)) + PythonRDD.serveIterator(iter, "serve-DataFrame") + } + } + private[sql] def getRowsToPython( _numRows: Int, truncate: Int): Array[Any] = { @@ -3328,9 +3459,9 @@ class Dataset[T] private[sql]( } } - private[sql] def toPythonIterator(): Array[Any] = { + private[sql] def toPythonIterator(prefetchPartitions: Boolean = false): Array[Any] = { withNewExecutionId { - PythonRDD.toLocalIteratorAndServe(javaToPython.rdd) + PythonRDD.toLocalIteratorAndServe(javaToPython.rdd, prefetchPartitions) } } @@ -3343,7 +3474,7 @@ class Dataset[T] private[sql]( * an execution. */ private def withNewExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, queryExecution)(body) + SQLExecution.withNewExecutionId(queryExecution)(body) } /** @@ -3352,10 +3483,8 @@ class Dataset[T] private[sql]( * reset. */ private def withNewRDDExecutionId[U](body: => U): U = { - SQLExecution.withNewExecutionId(sparkSession, rddQueryExecution) { - rddQueryExecution.executedPlan.foreach { plan => - plan.resetMetrics() - } + SQLExecution.withNewExecutionId(rddQueryExecution) { + rddQueryExecution.executedPlan.resetMetrics() body } } @@ -3365,10 +3494,8 @@ class Dataset[T] private[sql]( * user-registered callback functions. 
*/ private def withAction[U](name: String, qe: QueryExecution)(action: SparkPlan => U) = { - SQLExecution.withNewExecutionId(sparkSession, qe, Some(name)) { - qe.executedPlan.foreach { plan => - plan.resetMetrics() - } + SQLExecution.withNewExecutionId(qe, Some(name)) { + qe.executedPlan.resetMetrics() action(qe.executedPlan) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala index 89cc9735e4f6a..76ee297dfca79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/KeyValueGroupedDataset.scala @@ -449,10 +449,7 @@ class KeyValueGroupedDataset[K, V] private[sql]( val aggregate = Aggregate(groupingAttributes, keyColumn +: namedColumns, logicalPlan) val execution = new QueryExecution(sparkSession, aggregate) - new Dataset( - sparkSession, - execution, - ExpressionEncoder.tuple(kExprEnc +: encoders)) + new Dataset(execution, ExpressionEncoder.tuple(kExprEnc +: encoders)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index e85636d82a62c..b1ba7d4538732 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -26,6 +26,7 @@ import org.apache.spark.annotation.Stable import org.apache.spark.api.python.PythonEvalType import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedAlias, UnresolvedAttribute, UnresolvedFunction} +import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -47,8 +48,8 @@ import 
org.apache.spark.sql.types.{NumericType, StructType} */ @Stable class RelationalGroupedDataset protected[sql]( - df: DataFrame, - groupingExprs: Seq[Expression], + private[sql] val df: DataFrame, + private[sql] val groupingExprs: Seq[Expression], groupType: RelationalGroupedDataset.GroupType) { private[this] def toDF(aggExprs: Seq[Expression]): DataFrame = { @@ -129,6 +130,37 @@ class RelationalGroupedDataset protected[sql]( (inputExpr: Expression) => exprToFunc(inputExpr) } + /** + * Returns a `KeyValueGroupedDataset` where the data is grouped by the grouping expressions + * of current `RelationalGroupedDataset`. + * + * @since 3.0.0 + */ + def as[K: Encoder, T: Encoder]: KeyValueGroupedDataset[K, T] = { + val keyEncoder = encoderFor[K] + val valueEncoder = encoderFor[T] + + // Resolves grouping expressions. + val dummyPlan = Project(groupingExprs.map(alias), LocalRelation(df.logicalPlan.output)) + val analyzedPlan = df.sparkSession.sessionState.analyzer.execute(dummyPlan) + .asInstanceOf[Project] + df.sparkSession.sessionState.analyzer.checkAnalysis(analyzedPlan) + val aliasedGroupings = analyzedPlan.projectList + + // Adds the grouping expressions that are not in base DataFrame into outputs. + val addedCols = aliasedGroupings.filter(g => !df.logicalPlan.outputSet.contains(g.toAttribute)) + val qe = Dataset.ofRows( + df.sparkSession, + Project(df.logicalPlan.output ++ addedCols, df.logicalPlan)).queryExecution + + new KeyValueGroupedDataset( + keyEncoder, + valueEncoder, + qe, + df.logicalPlan.output, + aliasedGroupings.map(_.toAttribute)) + } + /** * (Scala-specific) Compute aggregates by specifying the column names and * aggregate methods. The resulting `DataFrame` will also contain the grouping columns. @@ -523,6 +555,48 @@ class RelationalGroupedDataset protected[sql]( Dataset.ofRows(df.sparkSession, plan) } + /** + * Applies a vectorized python user-defined function to each cogrouped data. 
+ * The user-defined function defines a transformation: + * `pandas.DataFrame`, `pandas.DataFrame` -> `pandas.DataFrame`. + * For each group in the cogrouped data, all elements in the group are passed as a + * `pandas.DataFrame` and the results for all cogroups are combined into a new [[DataFrame]]. + * + * This function uses Apache Arrow as serialization format between Java executors and Python + * workers. + */ + private[sql] def flatMapCoGroupsInPandas( + r: RelationalGroupedDataset, + expr: PythonUDF): DataFrame = { + require(expr.evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, + "Must pass a cogrouped map udf") + require(expr.dataType.isInstanceOf[StructType], + s"The returnType of the udf must be a ${StructType.simpleString}") + + val leftGroupingNamedExpressions = groupingExprs.map { + case ne: NamedExpression => ne + case other => Alias(other, other.toString)() + } + + val rightGroupingNamedExpressions = r.groupingExprs.map { + case ne: NamedExpression => ne + case other => Alias(other, other.toString)() + } + + val leftAttributes = leftGroupingNamedExpressions.map(_.toAttribute) + val rightAttributes = rightGroupingNamedExpressions.map(_.toAttribute) + + val leftChild = df.logicalPlan + val rightChild = r.df.logicalPlan + + val left = Project(leftGroupingNamedExpressions ++ leftChild.output, leftChild) + val right = Project(rightGroupingNamedExpressions ++ rightChild.output, rightChild) + + val output = expr.dataType.asInstanceOf[StructType].toAttributes + val plan = FlatMapCoGroupsInPandas(leftAttributes, rightAttributes, expr, output, left, right) + Dataset.ofRows(df.sparkSession, plan) + } + override def toString: String = { val builder = new StringBuilder builder.append("RelationalGroupedDataset: [grouping expressions: [") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala index 0f5aab7f47d0d..e1b44b5918143 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -18,8 +18,10 @@ package org.apache.spark.sql import org.apache.spark.annotation.Stable +import org.apache.spark.internal.Logging import org.apache.spark.internal.config.{ConfigEntry, OptionalConfigEntry} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.{DeprecatedConfig, RemovedConfig} /** * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`. @@ -29,7 +31,7 @@ import org.apache.spark.sql.internal.SQLConf * @since 2.0.0 */ @Stable -class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { +class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) extends Logging { /** * Sets the given Spark runtime configuration property. @@ -38,6 +40,8 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { */ def set(key: String, value: String): Unit = { requireNonStaticConf(key) + requireDefaultValueOfRemovedConf(key, value) + logDeprecationWarning(key) sqlConf.setConfString(key, value) } @@ -47,7 +51,6 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * @since 2.0.0 */ def set(key: String, value: Boolean): Unit = { - requireNonStaticConf(key) set(key, value.toString) } @@ -57,7 +60,6 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { * @since 2.0.0 */ def set(key: String, value: Long): Unit = { - requireNonStaticConf(key) set(key, value.toString) } @@ -128,6 +130,7 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { */ def unset(key: String): Unit = { requireNonStaticConf(key) + logDeprecationWarning(key) sqlConf.unsetConf(key) } @@ -158,4 +161,26 @@ class RuntimeConfig private[sql](sqlConf: SQLConf = new SQLConf) { throw new AnalysisException(s"Cannot modify the value of a Spark config: $key") } } + + private def requireDefaultValueOfRemovedConf(key: String, value: String): Unit = 
{ + SQLConf.removedSQLConfigs.get(key).foreach { + case RemovedConfig(configName, version, defaultValue, comment) => + if (value != defaultValue) { + throw new AnalysisException( + s"The SQL config '$configName' was removed in the version $version. $comment") + } + } + } + + /** + * Logs a warning message if the given config key is deprecated. + */ + private def logDeprecationWarning(key: String): Unit = { + SQLConf.deprecatedSQLConfigs.get(key).foreach { + case DeprecatedConfig(configName, version, comment) => + logWarning( + s"The SQL config '$configName' has been deprecated in Spark v$version " + + s"and may be removed in the future. $comment") + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 45d0bd4122535..2054874e5e07b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -479,97 +479,6 @@ class SQLContext private[sql](val sparkSession: SparkSession) def readStream: DataStreamReader = sparkSession.readStream - /** - * Creates an external table from the given path and returns the corresponding DataFrame. - * It will use the default data source configured by spark.sql.sources.default. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable(tableName: String, path: String): DataFrame = { - sparkSession.catalog.createTable(tableName, path) - } - - /** - * Creates an external table from the given path based on a data source - * and returns the corresponding DataFrame. 
- * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - path: String, - source: String): DataFrame = { - sparkSession.catalog.createTable(tableName, path, source) - } - - /** - * Creates an external table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, options) - } - - /** - * (Scala-specific) - * Creates an external table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - options: Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, options) - } - - /** - * Create an external table from the given path based on a data source, a schema and - * a set of options. Then, returns the corresponding DataFrame. - * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, schema, options) - } - - /** - * (Scala-specific) - * Create an external table from the given path based on a data source, a schema and - * a set of options. Then, returns the corresponding DataFrame. 
- * - * @group ddl_ops - * @since 1.3.0 - */ - @deprecated("use sparkSession.catalog.createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - schema: StructType, - options: Map[String, String]): DataFrame = { - sparkSession.catalog.createTable(tableName, source, schema, options) - } - /** * Registers the given `DataFrame` as a temporary table in the catalog. Temporary tables exist * only during the lifetime of this instance of SQLContext. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index bd2bc1c0ad5d7..1fb97fb4b4cf1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -37,8 +37,10 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.encoders._ import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Range} +import org.apache.spark.sql.connector.ExternalCommandRunner import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.command.ExternalCommandExecutor +import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation} import org.apache.spark.sql.internal._ import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.sources.BaseRelation @@ -274,9 +276,7 @@ class SparkSession private( * @since 2.0.0 */ @transient - lazy val emptyDataFrame: DataFrame = { - createDataFrame(sparkContext.emptyRDD[Row].setName("empty"), StructType(Nil)) - } + lazy val emptyDataFrame: DataFrame = Dataset.ofRows(self, LocalRelation()) /** * Creates a new [[Dataset]] of type T containing zero elements. 
@@ -293,8 +293,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](rdd: RDD[A]): DataFrame = withActive { val encoder = Encoders.product[A] Dataset.ofRows(self, ExternalRDD(rdd, self)(encoder)) } @@ -304,8 +303,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = { - SparkSession.setActiveSession(this) + def createDataFrame[A <: Product : TypeTag](data: Seq[A]): DataFrame = withActive { val schema = ScalaReflection.schemaFor[A].dataType.asInstanceOf[StructType] val attributeSeq = schema.toAttributes Dataset.ofRows(self, LocalRelation.fromProduct(attributeSeq, data)) @@ -343,7 +341,7 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = { + def createDataFrame(rowRDD: RDD[Row], schema: StructType): DataFrame = withActive { // TODO: use MutableProjection when rowRDD is another DataFrame and the applied // schema differs from the existing schema on any field data type. 
val encoder = RowEncoder(schema) @@ -373,7 +371,7 @@ class SparkSession private( * @since 2.0.0 */ @DeveloperApi - def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { + def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = withActive { Dataset.ofRows(self, LocalRelation.fromExternalRows(schema.toAttributes, rows.asScala)) } @@ -385,7 +383,7 @@ class SparkSession private( * * @since 2.0.0 */ - def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(rdd: RDD[_], beanClass: Class[_]): DataFrame = withActive { val attributeSeq: Seq[AttributeReference] = getSchema(beanClass) val className = beanClass.getName val rowRdd = rdd.mapPartitions { iter => @@ -414,7 +412,7 @@ class SparkSession private( * SELECT * queries will return the columns in an undefined order. * @since 1.6.0 */ - def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { + def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = withActive { val attrSeq = getSchema(beanClass) val rows = SQLContext.beansToRows(data.asScala.iterator, beanClass, attrSeq) Dataset.ofRows(self, LocalRelation(attrSeq, rows.toSeq)) @@ -599,7 +597,7 @@ class SparkSession private( * * @since 2.0.0 */ - def sql(sqlText: String): DataFrame = { + def sql(sqlText: String): DataFrame = withActive { val tracker = new QueryPlanningTracker val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) { sessionState.sqlParser.parsePlan(sqlText) @@ -607,6 +605,33 @@ class SparkSession private( Dataset.ofRows(self, plan, tracker) } + /** + * Execute an arbitrary string command inside an external execution engine rather than Spark. + * This could be useful when user wants to execute some commands out of Spark. For + * example, executing custom DDL/DML command for JDBC, creating index for ElasticSearch, + * creating cores for Solr and so on. 
+ * + * The command will be eagerly executed after this method is called and the returned + * DataFrame will contain the output of the command (if any). + * + * @param runner The class name of the runner that implements `ExternalCommandRunner`. + * @param command The target command to be executed + * @param options The options for the runner. + * + * @since 3.0.0 + */ + @Unstable + def executeCommand(runner: String, command: String, options: Map[String, String]): DataFrame = { + DataSource.lookupDataSource(runner, sessionState.conf) match { + case source if classOf[ExternalCommandRunner].isAssignableFrom(source) => + Dataset.ofRows(self, ExternalCommandExecutor( + source.newInstance().asInstanceOf[ExternalCommandRunner], command, options)) + + case _ => + throw new AnalysisException(s"Command execution is not supported in runner $runner") + } + } + /** * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a * `DataFrame`. @@ -724,6 +749,20 @@ class SparkSession private( } } + /** + * Execute a block of code with this session set as the active session, and restore the + * previous session on completion. + */ + private[sql] def withActive[T](block: => T): T = { + // Use the active session thread local directly to make sure we get the session that is actually + // set and not the default session. This is to prevent promoting the default session to the + // active session once we are done. 
+ val old = SparkSession.activeThreadSession.get() + SparkSession.setActiveSession(this) + try block finally { + SparkSession.setActiveSession(old) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index bb05c76cfee6d..0f08e10c00d22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -28,10 +28,11 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} -import org.apache.spark.sql.execution.aggregate.ScalaUDAF +import org.apache.spark.sql.execution.aggregate.{ScalaAggregator, ScalaUDAF} import org.apache.spark.sql.execution.python.UserDefinedPythonFunction -import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedFunction} +import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregateFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils @@ -72,7 +73,11 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @return the registered UDAF. * * @since 1.5.0 + * @deprecated this method and the use of UserDefinedAggregateFunction are deprecated. + * Aggregator[IN, BUF, OUT] should now be registered as a UDF via the functions.udaf(agg) method. 
*/ + @deprecated("Aggregator[IN, BUF, OUT] should now be registered as a UDF" + + " via the functions.udaf(agg) method.", "3.0.0") def register(name: String, udaf: UserDefinedAggregateFunction): UserDefinedAggregateFunction = { def builder(children: Seq[Expression]) = ScalaUDAF(children, udaf) functionRegistry.createOrReplaceTempFunction(name, builder) @@ -101,9 +106,16 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends * @since 2.2.0 */ def register(name: String, udf: UserDefinedFunction): UserDefinedFunction = { - def builder(children: Seq[Expression]) = udf.apply(children.map(Column.apply) : _*).expr - functionRegistry.createOrReplaceTempFunction(name, builder) - udf + udf match { + case udaf: UserDefinedAggregator[_, _, _] => + def builder(children: Seq[Expression]) = udaf.scalaAggregator(children) + functionRegistry.createOrReplaceTempFunction(name, builder) + udf + case _ => + def builder(children: Seq[Expression]) = udf.apply(children.map(Column.apply) : _*).expr + functionRegistry.createOrReplaceTempFunction(name, builder) + udf + } } // scalastyle:off line.size.limit diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala index 482e2bfeb7098..bf3055d5e3e09 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/python/PythonSQLUtils.scala @@ -23,11 +23,13 @@ import java.nio.channels.Channels import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDDServer import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{DataFrame, Dataset, SQLContext} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import 
org.apache.spark.sql.execution.{ExplainMode, QueryExecution} import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.DataType private[sql] object PythonSQLUtils { @@ -38,6 +40,12 @@ private[sql] object PythonSQLUtils { FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray } + def listSQLConfigs(): Array[(String, String, String)] = { + val conf = new SQLConf() + // Py4J doesn't seem to translate Seq well, so we convert to an Array. + conf.getAllDefinedConfs.toArray + } + /** * Python callable function to read a file in Arrow stream format and create a [[RDD]] * using each serialized ArrowRecordBatch as a partition. @@ -56,6 +64,10 @@ private[sql] object PythonSQLUtils { sqlContext: SQLContext): DataFrame = { ArrowConverters.toDataFrame(arrowBatchRDD, schemaString, sqlContext) } + + def explainString(queryExecution: QueryExecution, mode: String): String = { + queryExecution.explainString(ExplainMode.fromString(mode)) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala index 60738e6d4ef9e..318cc629e7a34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalog/Catalog.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalog import scala.collection.JavaConverters._ -import org.apache.spark.annotation.{Evolving, Experimental, Stable} +import org.apache.spark.annotation.Stable import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset} import org.apache.spark.sql.types.StructType import org.apache.spark.storage.StorageLevel @@ -208,20 +208,6 @@ abstract class Catalog { */ def functionExists(dbName: String, functionName: String): Boolean - /** - * Creates a table from the given path and returns the corresponding DataFrame. 
- * It will use the default data source configured by spark.sql.sources.default. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. - * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable(tableName: String, path: String): DataFrame = { - createTable(tableName, path) - } - /** * Creates a table from the given path and returns the corresponding DataFrame. * It will use the default data source configured by spark.sql.sources.default. @@ -233,20 +219,6 @@ abstract class Catalog { */ def createTable(tableName: String, path: String): DataFrame - /** - * Creates a table from the given path based on a data source and returns the corresponding - * DataFrame. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. - * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable(tableName: String, path: String, source: String): DataFrame = { - createTable(tableName, path, source) - } - /** * Creates a table from the given path based on a data source and returns the corresponding * DataFrame. @@ -258,23 +230,6 @@ abstract class Catalog { */ def createTable(tableName: String, path: String, source: String): DataFrame - /** - * Creates a table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. 
- * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - options: java.util.Map[String, String]): DataFrame = { - createTable(tableName, source, options) - } - /** * Creates a table based on the dataset in a data source and a set of options. * Then, returns the corresponding DataFrame. @@ -291,24 +246,6 @@ abstract class Catalog { createTable(tableName, source, options.asScala.toMap) } - /** - * (Scala-specific) - * Creates a table from the given path based on a data source and a set of options. - * Then, returns the corresponding DataFrame. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. - * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - options: Map[String, String]): DataFrame = { - createTable(tableName, source, options) - } - /** * (Scala-specific) * Creates a table based on the dataset in a data source and a set of options. @@ -324,24 +261,6 @@ abstract class Catalog { source: String, options: Map[String, String]): DataFrame - /** - * Create a table from the given path based on a data source, a schema and a set of options. - * Then, returns the corresponding DataFrame. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. - * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - schema: StructType, - options: java.util.Map[String, String]): DataFrame = { - createTable(tableName, source, schema, options) - } - /** * Create a table based on the dataset in a data source, a schema and a set of options. * Then, returns the corresponding DataFrame. 
@@ -359,25 +278,6 @@ abstract class Catalog { createTable(tableName, source, schema, options.asScala.toMap) } - /** - * (Scala-specific) - * Create a table from the given path based on a data source, a schema and a set of options. - * Then, returns the corresponding DataFrame. - * - * @param tableName is either a qualified or unqualified name that designates a table. - * If no database identifier is provided, it refers to a table in - * the current database. - * @since 2.0.0 - */ - @deprecated("use createTable instead.", "2.2.0") - def createExternalTable( - tableName: String, - source: String, - schema: StructType, - options: Map[String, String]): DataFrame = { - createTable(tableName, source, schema, options) - } - /** * (Scala-specific) * Create a table based on the dataset in a data source, a schema and a set of options. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala new file mode 100644 index 0000000000000..adeb2164eff63 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSessionCatalog.scala @@ -0,0 +1,654 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.analysis + +import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, CatalogUtils} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, CatalogV2Util, Identifier, LookupCatalog, SupportsNamespaces, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, RefreshTable} +import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{HIVE_TYPE_STRING, HiveStringType, MetadataBuilder, StructField, StructType} + +/** + * Resolves catalogs from the multi-part identifiers in SQL statements, and convert the statements + * to the corresponding v1 or v2 commands if the resolved catalog is the session catalog. + * + * We can remove this rule once we implement all the catalog functionality in `V2SessionCatalog`. 
+ */ +class ResolveSessionCatalog( + val catalogManager: CatalogManager, + conf: SQLConf, + isView: Seq[String] => Boolean) + extends Rule[LogicalPlan] with LookupCatalog { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Util._ + + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { + case AlterTableAddColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + cols.foreach { c => + assertTopLevelColumn(c.name, "AlterTableAddColumnsCommand") + if (!c.nullable) { + throw new AnalysisException( + "ADD COLUMN with v1 tables cannot specify NOT NULL.") + } + } + AlterTableAddColumnsCommand(tbl.asTableIdentifier, cols.map(convertToStructField)) + }.getOrElse { + val changes = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + col.comment.orNull, + col.position.orNull) + } + createAlterTable(nameParts, catalog, tbl, changes) + } + + case AlterTableReplaceColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + val changes: Seq[TableChange] = loadTable(catalog, tbl.asIdentifier) match { + case Some(_: V1Table) => + throw new AnalysisException("REPLACE COLUMNS is only supported with v2 tables.") + case Some(table) => + // REPLACE COLUMNS deletes all the existing columns and adds new columns specified. + val deleteChanges = table.schema.fieldNames.map { name => + TableChange.deleteColumn(Array(name)) + } + val addChanges = cols.map { col => + TableChange.addColumn( + col.name.toArray, + col.dataType, + col.nullable, + col.comment.orNull, + col.position.orNull) + } + deleteChanges ++ addChanges + case None => Seq() // Unresolved table will be handled in CheckAnalysis. 
+ } + createAlterTable(nameParts, catalog, tbl, changes) + + case a @ AlterTableAlterColumnStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), _, _, _, _, _) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + if (a.column.length > 1) { + throw new AnalysisException( + "ALTER COLUMN with qualified column is only supported with v2 tables.") + } + if (a.nullable.isDefined) { + throw new AnalysisException( + "ALTER COLUMN with v1 tables cannot specify NOT NULL.") + } + if (a.position.isDefined) { + throw new AnalysisException("" + + "ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.") + } + val builder = new MetadataBuilder + // Add comment to metadata + a.comment.map(c => builder.putString("comment", c)) + val colName = a.column(0) + val dataType = a.dataType.getOrElse { + v1Table.schema.findNestedField(Seq(colName), resolver = conf.resolver) + .map(_._2.dataType) + .getOrElse { + throw new AnalysisException( + s"ALTER COLUMN cannot find column ${quote(colName)} in v1 table. " + + s"Available: ${v1Table.schema.fieldNames.mkString(", ")}") + } + } + // Add Hive type string to metadata. 
+ val cleanedDataType = HiveStringType.replaceCharType(dataType) + if (dataType != cleanedDataType) { + builder.putString(HIVE_TYPE_STRING, dataType.catalogString) + } + val newColumn = StructField( + colName, + cleanedDataType, + nullable = true, + builder.build()) + AlterTableChangeColumnCommand(tbl.asTableIdentifier, colName, newColumn) + }.getOrElse { + val colName = a.column.toArray + val typeChange = a.dataType.map { newDataType => + TableChange.updateColumnType(colName, newDataType) + } + val nullabilityChange = a.nullable.map { nullable => + TableChange.updateColumnNullability(colName, nullable) + } + val commentChange = a.comment.map { newComment => + TableChange.updateColumnComment(colName, newComment) + } + val positionChange = a.position.map { newPosition => + TableChange.updateColumnPosition(colName, newPosition) + } + createAlterTable( + nameParts, + catalog, + tbl, + typeChange.toSeq ++ nullabilityChange ++ commentChange ++ positionChange) + } + + case AlterTableRenameColumnStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), col, newName) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + throw new AnalysisException("RENAME COLUMN is only supported with v2 tables.") + }.getOrElse { + val changes = Seq(TableChange.renameColumn(col.toArray, newName)) + createAlterTable(nameParts, catalog, tbl, changes) + } + + case AlterTableDropColumnsStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), cols) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + throw new AnalysisException("DROP COLUMN is only supported with v2 tables.") + }.getOrElse { + val changes = cols.map(col => TableChange.deleteColumn(col.toArray)) + createAlterTable(nameParts, catalog, tbl, changes) + } + + case AlterTableSetPropertiesStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), props) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + 
AlterTableSetPropertiesCommand(tbl.asTableIdentifier, props, isView = false) + }.getOrElse { + val changes = props.map { case (key, value) => + TableChange.setProperty(key, value) + }.toSeq + createAlterTable(nameParts, catalog, tbl, changes) + } + + case AlterTableUnsetPropertiesStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), keys, ifExists) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + AlterTableUnsetPropertiesCommand( + tbl.asTableIdentifier, keys, ifExists, isView = false) + }.getOrElse { + val changes = keys.map(key => TableChange.removeProperty(key)) + createAlterTable(nameParts, catalog, tbl, changes) + } + + case AlterTableSetLocationStatement( + nameParts @ SessionCatalogAndTable(catalog, tbl), partitionSpec, newLoc) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + AlterTableSetLocationCommand(tbl.asTableIdentifier, partitionSpec, newLoc) + }.getOrElse { + if (partitionSpec.nonEmpty) { + throw new AnalysisException( + "ALTER TABLE SET LOCATION does not support partition for v2 tables.") + } + val changes = Seq(TableChange.setProperty(TableCatalog.PROP_LOCATION, newLoc)) + createAlterTable(nameParts, catalog, tbl, changes) + } + + // ALTER VIEW should always use v1 command if the resolved catalog is session catalog. 
+ case AlterViewSetPropertiesStatement(SessionCatalogAndTable(_, tbl), props) => + AlterTableSetPropertiesCommand(tbl.asTableIdentifier, props, isView = true) + + case AlterViewUnsetPropertiesStatement(SessionCatalogAndTable(_, tbl), keys, ifExists) => + AlterTableUnsetPropertiesCommand(tbl.asTableIdentifier, keys, ifExists, isView = true) + + case d @ DescribeNamespace(SessionCatalogAndNamespace(_, ns), _) => + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + DescribeDatabaseCommand(ns.head, d.extended) + + case AlterNamespaceSetProperties(SessionCatalogAndNamespace(_, ns), properties) => + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + AlterDatabasePropertiesCommand(ns.head, properties) + + case AlterNamespaceSetLocation(SessionCatalogAndNamespace(_, ns), location) => + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + AlterDatabaseSetLocationCommand(ns.head, location) + + case RenameTableStatement(SessionCatalogAndTable(_, oldName), newNameParts, isView) => + AlterTableRenameCommand(oldName.asTableIdentifier, newNameParts.asTableIdentifier, isView) + + case DescribeRelation(ResolvedTable(_, ident, _: V1Table), partitionSpec, isExtended) => + DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) + + // Use v1 command to describe (temp) view, as v2 catalog doesn't support view yet. 
+ case DescribeRelation(ResolvedView(ident), partitionSpec, isExtended) => + DescribeTableCommand(ident.asTableIdentifier, partitionSpec, isExtended) + + case DescribeColumnStatement( + SessionCatalogAndTable(catalog, tbl), colNameParts, isExtended) => + loadTable(catalog, tbl.asIdentifier).collect { + case v1Table: V1Table => + DescribeColumnCommand(tbl.asTableIdentifier, colNameParts, isExtended) + }.getOrElse { + if (isView(tbl)) { + DescribeColumnCommand(tbl.asTableIdentifier, colNameParts, isExtended) + } else { + throw new AnalysisException("Describing columns is not supported for v2 tables.") + } + } + + // For CREATE TABLE [AS SELECT], we should use the v1 command if the catalog is resolved to the + // session catalog and the table provider is not v2. + case c @ CreateTableStatement( + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + if (!isV2Provider(c.provider)) { + val tableDesc = buildCatalogTable(tbl.asTableIdentifier, c.tableSchema, + c.partitioning, c.bucketSpec, c.properties, c.provider, c.options, c.location, + c.comment, c.ifNotExists) + val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists + CreateTable(tableDesc, mode, None) + } else { + CreateV2Table( + catalog.asTableCatalog, + tbl.asIdentifier, + c.tableSchema, + // convert the bucket spec and add it as a transform + c.partitioning ++ c.bucketSpec.map(_.asTransform), + convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + ignoreIfExists = c.ifNotExists) + } + + case c @ CreateTableAsSelectStatement( + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + if (!isV2Provider(c.provider)) { + val tableDesc = buildCatalogTable(tbl.asTableIdentifier, new StructType, + c.partitioning, c.bucketSpec, c.properties, c.provider, c.options, c.location, + c.comment, c.ifNotExists) + val mode = if (c.ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists + CreateTable(tableDesc, mode, Some(c.asSelect)) + } else { + 
CreateTableAsSelect( + catalog.asTableCatalog, + tbl.asIdentifier, + // convert the bucket spec and add it as a transform + c.partitioning ++ c.bucketSpec.map(_.asTransform), + c.asSelect, + convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + writeOptions = c.options, + ignoreIfExists = c.ifNotExists) + } + + case RefreshTableStatement(SessionCatalogAndTable(_, tbl)) => + RefreshTable(tbl.asTableIdentifier) + + // For REPLACE TABLE [AS SELECT], we should fail if the catalog is resolved to the + // session catalog and the table provider is not v2. + case c @ ReplaceTableStatement( + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + if (!isV2Provider(c.provider)) { + throw new AnalysisException("REPLACE TABLE is only supported with v2 tables.") + } else { + ReplaceTable( + catalog.asTableCatalog, + tbl.asIdentifier, + c.tableSchema, + // convert the bucket spec and add it as a transform + c.partitioning ++ c.bucketSpec.map(_.asTransform), + convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + orCreate = c.orCreate) + } + + case c @ ReplaceTableAsSelectStatement( + SessionCatalogAndTable(catalog, tbl), _, _, _, _, _, _, _, _, _) => + if (!isV2Provider(c.provider)) { + throw new AnalysisException("REPLACE TABLE AS SELECT is only supported with v2 tables.") + } else { + ReplaceTableAsSelect( + catalog.asTableCatalog, + tbl.asIdentifier, + // convert the bucket spec and add it as a transform + c.partitioning ++ c.bucketSpec.map(_.asTransform), + c.asSelect, + convertTableProperties(c.properties, c.options, c.location, c.comment, c.provider), + writeOptions = c.options, + orCreate = c.orCreate) + } + + case DropTableStatement(SessionCatalogAndTable(catalog, tbl), ifExists, purge) => + DropTableCommand(tbl.asTableIdentifier, ifExists, isView = false, purge = purge) + + case DropViewStatement(SessionCatalogAndTable(catalog, viewName), ifExists) => + 
DropTableCommand(viewName.asTableIdentifier, ifExists, isView = true, purge = false) + + case c @ CreateNamespaceStatement(CatalogAndNamespace(catalog, ns), _, _) + if isSessionCatalog(catalog) => + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + + val comment = c.properties.get(SupportsNamespaces.PROP_COMMENT) + val location = c.properties.get(SupportsNamespaces.PROP_LOCATION) + val newProperties = c.properties -- CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES + CreateDatabaseCommand(ns.head, c.ifNotExists, location, comment, newProperties) + + case d @ DropNamespace(SessionCatalogAndNamespace(_, ns), _, _) => + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + DropDatabaseCommand(ns.head, d.ifExists, d.cascade) + + case ShowTables(SessionCatalogAndNamespace(_, ns), pattern) => + assert(ns.nonEmpty) + if (ns.length != 1) { + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + } + ShowTablesCommand(Some(ns.head), pattern) + + case ShowTableStatement(ns, pattern, partitionsSpec) => + val db = ns match { + case Some(ns) if ns.length != 1 => + throw new AnalysisException( + s"The database name is not valid: ${ns.quoted}") + case _ => ns.map(_.head) + } + ShowTablesCommand(db, Some(pattern), true, partitionsSpec) + + case AnalyzeTableStatement(tbl, partitionSpec, noScan) => + val v1TableName = parseV1Table(tbl, "ANALYZE TABLE") + if (partitionSpec.isEmpty) { + AnalyzeTableCommand(v1TableName.asTableIdentifier, noScan) + } else { + AnalyzePartitionCommand(v1TableName.asTableIdentifier, partitionSpec, noScan) + } + + case AnalyzeColumnStatement(tbl, columnNames, allColumns) => + val v1TableName = parseV1Table(tbl, "ANALYZE TABLE") + AnalyzeColumnCommand(v1TableName.asTableIdentifier, columnNames, allColumns) + + case RepairTableStatement(tbl) => + val v1TableName = parseV1Table(tbl, "MSCK REPAIR TABLE") + 
AlterTableRecoverPartitionsCommand( + v1TableName.asTableIdentifier, + "MSCK REPAIR TABLE") + + case LoadDataStatement(tbl, path, isLocal, isOverwrite, partition) => + val v1TableName = parseV1Table(tbl, "LOAD DATA") + LoadDataCommand( + v1TableName.asTableIdentifier, + path, + isLocal, + isOverwrite, + partition) + + case ShowCreateTableStatement(tbl, asSerde) if !asSerde => + val v1TableName = parseV1Table(tbl, "SHOW CREATE TABLE") + ShowCreateTableCommand(v1TableName.asTableIdentifier) + + case ShowCreateTableStatement(tbl, asSerde) if asSerde => + val v1TableName = parseV1Table(tbl, "SHOW CREATE TABLE AS SERDE") + ShowCreateTableAsSerdeCommand(v1TableName.asTableIdentifier) + + case CacheTableStatement(tbl, plan, isLazy, options) => + val v1TableName = parseV1Table(tbl, "CACHE TABLE") + CacheTableCommand(v1TableName.asTableIdentifier, plan, isLazy, options) + + case UncacheTableStatement(tbl, ifExists) => + val v1TableName = parseV1Table(tbl, "UNCACHE TABLE") + UncacheTableCommand(v1TableName.asTableIdentifier, ifExists) + + case TruncateTableStatement(tbl, partitionSpec) => + val v1TableName = parseV1Table(tbl, "TRUNCATE TABLE") + TruncateTableCommand( + v1TableName.asTableIdentifier, + partitionSpec) + + case ShowPartitionsStatement(tbl, partitionSpec) => + val v1TableName = parseV1Table(tbl, "SHOW PARTITIONS") + ShowPartitionsCommand( + v1TableName.asTableIdentifier, + partitionSpec) + + case ShowColumnsStatement(tbl, ns) => + val sql = "SHOW COLUMNS" + val v1TableName = parseV1Table(tbl, sql).asTableIdentifier + val resolver = conf.resolver + val db = ns match { + case Some(db) if (v1TableName.database.exists(!resolver(_, db.head))) => + throw new AnalysisException( + s"SHOW COLUMNS with conflicting databases: " + + s"'${db.head}' != '${v1TableName.database.get}'") + case _ => ns.map(_.head) + } + if (ns.isDefined && ns.get.length > 1) { + throw new AnalysisException( + s"Namespace name should have only one part if specified: ${ns.get.quoted}") + } + if 
(tbl.length > 2) { + throw new AnalysisException( + s"Table name should have at most two parts: ${tbl.quoted}") + } + ShowColumnsCommand(db, v1TableName) + + case AlterTableRecoverPartitionsStatement(tbl) => + val v1TableName = parseV1Table(tbl, "ALTER TABLE RECOVER PARTITIONS") + AlterTableRecoverPartitionsCommand( + v1TableName.asTableIdentifier, + "ALTER TABLE RECOVER PARTITIONS") + + case AlterTableAddPartitionStatement(tbl, partitionSpecsAndLocs, ifNotExists) => + val v1TableName = parseV1Table(tbl, "ALTER TABLE ADD PARTITION") + AlterTableAddPartitionCommand( + v1TableName.asTableIdentifier, + partitionSpecsAndLocs, + ifNotExists) + + case AlterTableRenamePartitionStatement(tbl, from, to) => + val v1TableName = parseV1Table(tbl, "ALTER TABLE RENAME PARTITION") + AlterTableRenamePartitionCommand( + v1TableName.asTableIdentifier, + from, + to) + + case AlterTableDropPartitionStatement(tbl, specs, ifExists, purge, retainData) => + val v1TableName = parseV1Table(tbl, "ALTER TABLE DROP PARTITION") + AlterTableDropPartitionCommand( + v1TableName.asTableIdentifier, + specs, + ifExists, + purge, + retainData) + + case AlterTableSerDePropertiesStatement(tbl, serdeClassName, serdeProperties, partitionSpec) => + val v1TableName = parseV1Table(tbl, "ALTER TABLE SerDe Properties") + AlterTableSerDePropertiesCommand( + v1TableName.asTableIdentifier, + serdeClassName, + serdeProperties, + partitionSpec) + + case AlterViewAsStatement(tbl, originalText, query) => + val v1TableName = parseV1Table(tbl, "ALTER VIEW QUERY") + AlterViewAsCommand( + v1TableName.asTableIdentifier, + originalText, + query) + + case CreateViewStatement( + tbl, userSpecifiedColumns, comment, properties, + originalText, child, allowExisting, replace, viewType) => + + val v1TableName = parseV1Table(tbl, "CREATE VIEW") + CreateViewCommand( + v1TableName.asTableIdentifier, + userSpecifiedColumns, + comment, + properties, + originalText, + child, + allowExisting, + replace, + viewType) + + case 
ShowTableProperties(r: ResolvedTable, propertyKey) if isSessionCatalog(r.catalog) => + ShowTablePropertiesCommand(r.identifier.asTableIdentifier, propertyKey) + + case DescribeFunctionStatement(CatalogAndIdentifier(catalog, ident), extended) => + val functionIdent = + parseSessionCatalogFunctionIdentifier("DESCRIBE FUNCTION", catalog, ident) + DescribeFunctionCommand(functionIdent, extended) + + case ShowFunctionsStatement(userScope, systemScope, pattern, fun) => + val (database, function) = fun match { + case Some(CatalogAndIdentifier(catalog, ident)) => + val FunctionIdentifier(fn, db) = + parseSessionCatalogFunctionIdentifier("SHOW FUNCTIONS", catalog, ident) + (db, Some(fn)) + case None => (None, pattern) + } + ShowFunctionsCommand(database, function, userScope, systemScope) + + case DropFunctionStatement(CatalogAndIdentifier(catalog, ident), ifExists, isTemp) => + val FunctionIdentifier(function, database) = + parseSessionCatalogFunctionIdentifier("DROP FUNCTION", catalog, ident) + DropFunctionCommand(database, function, ifExists, isTemp) + + case CreateFunctionStatement(CatalogAndIdentifier(catalog, ident), + className, resources, isTemp, ignoreIfExists, replace) => + val FunctionIdentifier(function, database) = + parseSessionCatalogFunctionIdentifier("CREATE FUNCTION", catalog, ident) + CreateFunctionCommand(database, function, className, resources, isTemp, ignoreIfExists, + replace) + } + + private def parseSessionCatalogFunctionIdentifier( + sql: String, + catalog: CatalogPlugin, + functionIdent: Identifier): FunctionIdentifier = { + if (isSessionCatalog(catalog)) { + functionIdent.asMultipartIdentifier match { + case Seq(db, fn) => FunctionIdentifier(fn, Some(db)) + case Seq(fn) => FunctionIdentifier(fn, None) + case _ => + throw new AnalysisException(s"Unsupported function name '${functionIdent.quoted}'") + } + } else { + throw new AnalysisException(s"$sql is only supported in v1 catalog") + } + } + + private def parseV1Table(tableName: Seq[String], sql: 
String): Seq[String] = { + val CatalogAndIdentifier(catalog, ident) = tableName + if (!isSessionCatalog(catalog)) { + throw new AnalysisException(s"$sql is only supported with v1 tables.") + } + ident.asMultipartIdentifier + } + + private def buildCatalogTable( + table: TableIdentifier, + schema: StructType, + partitioning: Seq[Transform], + bucketSpec: Option[BucketSpec], + properties: Map[String, String], + provider: String, + options: Map[String, String], + location: Option[String], + comment: Option[String], + ifNotExists: Boolean): CatalogTable = { + val storage = CatalogStorageFormat.empty.copy( + locationUri = location.map(CatalogUtils.stringToURI), + properties = options) + + val tableType = if (location.isDefined) { + CatalogTableType.EXTERNAL + } else { + CatalogTableType.MANAGED + } + + CatalogTable( + identifier = table, + tableType = tableType, + storage = storage, + schema = schema, + provider = Some(provider), + partitionColumnNames = partitioning.asPartitionColumns, + bucketSpec = bucketSpec, + properties = properties, + comment = comment) + } + + object SessionCatalogAndTable { + def unapply(nameParts: Seq[String]): Option[(CatalogPlugin, Seq[String])] = nameParts match { + case SessionCatalogAndIdentifier(catalog, ident) => + Some(catalog -> ident.asMultipartIdentifier) + case _ => None + } + } + + object SessionCatalogAndNamespace { + def unapply(resolved: ResolvedNamespace): Option[(CatalogPlugin, Seq[String])] = + if (isSessionCatalog(resolved.catalog)) { + Some(resolved.catalog -> resolved.namespace) + } else { + None + } + } + + private def assertTopLevelColumn(colName: Seq[String], command: String): Unit = { + if (colName.length > 1) { + throw new AnalysisException(s"$command does not support nested column: ${colName.quoted}") + } + } + + private def convertToStructField(col: QualifiedColType): StructField = { + val builder = new MetadataBuilder + col.comment.foreach(builder.putString("comment", _)) + + val cleanedDataType = 
HiveStringType.replaceCharType(col.dataType) + if (col.dataType != cleanedDataType) { + builder.putString(HIVE_TYPE_STRING, col.dataType.catalogString) + } + + StructField( + col.name.head, + cleanedDataType, + nullable = true, + builder.build()) + } + + private def isV2Provider(provider: String): Boolean = { + DataSource.lookupDataSourceV2(provider, conf) match { + // TODO(SPARK-28396): Currently file source v2 can't work with tables. + case Some(_: FileDataSourceV2) => false + case Some(_) => true + case _ => false + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala index d64e11136e673..28f8f49d2ce44 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PartitionPruning.scala @@ -86,7 +86,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan: LogicalPlan, joinKeys: Seq[Expression], hasBenefit: Boolean): LogicalPlan = { - val reuseEnabled = SQLConf.get.dynamicPartitionPruningReuseBroadcast + val reuseEnabled = SQLConf.get.exchangeReuseEnabled val index = joinKeys.indexOf(filteringKey) if (hasBenefit || reuseEnabled) { // insert a DynamicPruning wrapper to identify the subquery during query planning @@ -96,7 +96,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { filteringPlan, joinKeys, index, - !hasBenefit), + !hasBenefit || SQLConf.get.dynamicPartitionPruningReuseBroadcastOnly), pruningPlan) } else { // abort dynamic partition pruning @@ -159,7 +159,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { case Not(expr) => isLikelySelective(expr) case And(l, r) => isLikelySelective(l) || isLikelySelective(r) case Or(l, r) => isLikelySelective(l) && isLikelySelective(r) - case Like(_, _) => true + case Like(_, _, _) => true case _: 
BinaryComparison => true case _: In | _: InSet => true case _: StringPredicate => true @@ -252,7 +252,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { override def apply(plan: LogicalPlan): LogicalPlan = plan match { // Do not rewrite subqueries. - case _: Subquery => plan + case s: Subquery if s.correlated => plan case _ if !SQLConf.get.dynamicPartitionPruningEnabled => plan case _ => prune(plan) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala index 031c3b1aa0d50..be00f728aa3ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/dynamicpruning/PlanDynamicPruningFilters.scala @@ -36,9 +36,6 @@ import org.apache.spark.sql.internal.SQLConf case class PlanDynamicPruningFilters(sparkSession: SparkSession) extends Rule[SparkPlan] with PredicateHelper { - private def reuseBroadcast: Boolean = - SQLConf.get.dynamicPartitionPruningReuseBroadcast && SQLConf.get.exchangeReuseEnabled - /** * Identify the shape in which keys of a given plan are broadcasted. */ @@ -55,22 +52,24 @@ case class PlanDynamicPruningFilters(sparkSession: SparkSession) plan transformAllExpressions { case DynamicPruningSubquery( value, buildPlan, buildKeys, broadcastKeyIndex, onlyInBroadcast, exprId) => - val qe = new QueryExecution(sparkSession, buildPlan) + val sparkPlan = QueryExecution.createSparkPlan( + sparkSession, sparkSession.sessionState.planner, buildPlan) // Using `sparkPlan` is a little hacky as it is based on the assumption that this rule is // the first to be applied (apart from `InsertAdaptiveSparkPlan`). 
- val canReuseExchange = reuseBroadcast && buildKeys.nonEmpty && + val canReuseExchange = SQLConf.get.exchangeReuseEnabled && buildKeys.nonEmpty && plan.find { case BroadcastHashJoinExec(_, _, _, BuildLeft, _, left, _) => - left.sameResult(qe.sparkPlan) + left.sameResult(sparkPlan) case BroadcastHashJoinExec(_, _, _, BuildRight, _, _, right) => - right.sameResult(qe.sparkPlan) + right.sameResult(sparkPlan) case _ => false }.isDefined if (canReuseExchange) { val mode = broadcastMode(buildKeys, buildPlan) + val executedPlan = QueryExecution.prepareExecutedPlan(sparkSession, sparkPlan) // plan a broadcast exchange of the build side of the join - val exchange = BroadcastExchangeExec(mode, qe.executedPlan) + val exchange = BroadcastExchangeExec(mode, executedPlan) val name = s"dynamicpruning#${exprId.id}" // place the broadcast adaptor for reusing the broadcast results on the probe side val broadcastValues = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/AggregatingAccumulator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/AggregatingAccumulator.scala new file mode 100644 index 0000000000000..9807b5dbe9348 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/AggregatingAccumulator.scala @@ -0,0 +1,275 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import scala.collection.mutable + +import org.apache.spark.TaskContext +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSeq, BindReferences, Expression, InterpretedMutableProjection, InterpretedUnsafeProjection, JoinedRow, MutableProjection, NamedExpression, Projection, SpecificInternalRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, DeclarativeAggregate, ImperativeAggregate, NoOp, TypedImperativeAggregate} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{DataType, StructField, StructType} +import org.apache.spark.util.AccumulatorV2 + +/** + * Accumulator that computes a global aggregate. + */ +class AggregatingAccumulator private( + bufferSchema: Seq[DataType], + initialValues: Seq[Expression], + updateExpressions: Seq[Expression], + @transient private val mergeExpressions: Seq[Expression], + @transient private val resultExpressions: Seq[Expression], + imperatives: Array[ImperativeAggregate], + typedImperatives: Array[TypedImperativeAggregate[_]], + @transient private val conf: SQLConf) + extends AccumulatorV2[InternalRow, InternalRow] { + assert(bufferSchema.size == initialValues.size) + assert(bufferSchema.size == updateExpressions.size) + assert(mergeExpressions == null || bufferSchema.size == mergeExpressions.size) + + @transient + private var joinedRow: JoinedRow = _ + + private var buffer: SpecificInternalRow = _ + + private def createBuffer(): SpecificInternalRow = { + val buffer = new SpecificInternalRow(bufferSchema) + + // Initialize the buffer. Note that we do not use a code generated projection here because + // generating and compiling a projection is probably more expensive than using an interpreted + // projection. 
+ InterpretedMutableProjection.createProjection(initialValues) + .target(buffer) + .apply(InternalRow.empty) + imperatives.foreach(_.initialize(buffer)) + typedImperatives.foreach(_.initialize(buffer)) + buffer + } + + private def getOrCreateBuffer(): SpecificInternalRow = { + if (buffer == null) { + buffer = createBuffer() + + // Create the joined row and set the buffer as its 'left' row. + joinedRow = new JoinedRow() + joinedRow.withLeft(buffer) + } + buffer + } + + private def initializeProjection[T <: Projection](projection: T): T = { + projection.initialize(TaskContext.getPartitionId()) + projection + } + + @transient + private[this] lazy val updateProjection = initializeProjection { + MutableProjection.create(updateExpressions) + } + + @transient + private[this] lazy val mergeProjection = initializeProjection { + InterpretedMutableProjection.createProjection(mergeExpressions) + } + + @transient + private[this] lazy val resultProjection = initializeProjection { + InterpretedUnsafeProjection.createProjection(resultExpressions) + } + + /** + * Driver side operations like `merge` and `value` are executed in the DAGScheduler thread. This + * thread does not have a SQL configuration so we attach our own here. Note that we can't (and + * shouldn't) call `merge` or `value` on an accumulator originating from an executor so we just + * return a default value here. 
+ */ + private[this] def withSQLConf[T](default: => T)(body: => T): T = { + if (conf != null) { + SQLConf.withExistingConf(conf)(body) + } else { + default + } + } + + override def reset(): Unit = { + buffer = null + joinedRow = null + } + + override def isZero: Boolean = buffer == null + + override def copyAndReset(): AggregatingAccumulator = { + new AggregatingAccumulator( + bufferSchema, + initialValues, + updateExpressions, + mergeExpressions, + resultExpressions, + imperatives, + typedImperatives, + conf) + } + + override def copy(): AggregatingAccumulator = { + val copy = copyAndReset() + copy.merge(this) + copy + } + + override def add(v: InternalRow): Unit = { + val buffer = getOrCreateBuffer() + updateProjection.target(buffer)(joinedRow.withRight(v)) + var i = 0 + while (i < imperatives.length) { + imperatives(i).update(buffer, v) + i += 1 + } + i = 0 + while (i < typedImperatives.length) { + typedImperatives(i).update(buffer, v) + i += 1 + } + } + + override def merge(other: AccumulatorV2[InternalRow, InternalRow]): Unit = withSQLConf(()) { + if (!other.isZero) { + other match { + case agg: AggregatingAccumulator => + val buffer = getOrCreateBuffer() + val otherBuffer = agg.buffer + mergeProjection.target(buffer)(joinedRow.withRight(otherBuffer)) + var i = 0 + while (i < imperatives.length) { + imperatives(i).merge(buffer, otherBuffer) + i += 1 + } + i = 0 + while (i < typedImperatives.length) { + typedImperatives(i).mergeBuffersObjects(buffer, otherBuffer) + i += 1 + } + case _ => + throw new UnsupportedOperationException( + s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") + } + } + } + + override def value: InternalRow = withSQLConf(InternalRow.empty) { + // Either use the existing buffer or create a temporary one. + val input = if (!isZero) { + buffer + } else { + // Create a temporary buffer because we want to avoid changing the state of the accumulator + // here, which would happen if we called getOrCreateBuffer(). 
This is relatively expensive to + // do but it should be no problem since this method is supposed to be called rarely (once per + // query execution). + createBuffer() + } + resultProjection(input) + } + + /** + * Get the output schema of the aggregating accumulator. + */ + lazy val schema: StructType = { + StructType(resultExpressions.zipWithIndex.map { + case (e: NamedExpression, _) => StructField(e.name, e.dataType, e.nullable, e.metadata) + case (e, i) => StructField(s"c_$i", e.dataType, e.nullable) + }) + } + + /** + * Set the state of the accumulator to the state of another accumulator. This is used in cases + * where we only want to publish the state of the accumulator when the task completes, see + * [[CollectMetricsExec]] for an example. + */ + private[execution] def setState(other: AggregatingAccumulator): Unit = { + assert(buffer == null || (buffer eq other.buffer)) + buffer = other.buffer + joinedRow = other.joinedRow + } +} + +object AggregatingAccumulator { + /** + * Create an aggregating accumulator for the given functions and input schema. + */ + def apply(functions: Seq[Expression], inputAttributes: Seq[Attribute]): AggregatingAccumulator = { + // There are a couple of things happening here: + // - Collect the schema's of the aggregate and input aggregate buffers. These are needed to bind + // the expressions which will be done when we create the accumulator. + // - Collect the initialValues, update and merge expressions for declarative aggregate + // functions. + // - Bind and Collect the imperative aggregate functions. Note that we insert NoOps into the + // (declarative) initialValues, update and merge expression buffers to keep these aligned with + // the aggregate buffer. + // - Build the result expressions. 
+ val aggBufferAttributes = mutable.Buffer.empty[AttributeReference] + val inputAggBufferAttributes = mutable.Buffer.empty[AttributeReference] + val initialValues = mutable.Buffer.empty[Expression] + val updateExpressions = mutable.Buffer.empty[Expression] + val mergeExpressions = mutable.Buffer.empty[Expression] + val imperatives = mutable.Buffer.empty[ImperativeAggregate] + val typedImperatives = mutable.Buffer.empty[TypedImperativeAggregate[_]] + val inputAttributeSeq: AttributeSeq = inputAttributes + val resultExpressions = functions.map(_.transform { + case AggregateExpression(agg: DeclarativeAggregate, _, _, _, _) => + aggBufferAttributes ++= agg.aggBufferAttributes + inputAggBufferAttributes ++= agg.inputAggBufferAttributes + initialValues ++= agg.initialValues + updateExpressions ++= agg.updateExpressions + mergeExpressions ++= agg.mergeExpressions + agg.evaluateExpression + case AggregateExpression(agg: ImperativeAggregate, _, _, _, _) => + val imperative = BindReferences.bindReference(agg + .withNewMutableAggBufferOffset(aggBufferAttributes.size) + .withNewInputAggBufferOffset(inputAggBufferAttributes.size), + inputAttributeSeq) + imperative match { + case typedImperative: TypedImperativeAggregate[_] => + typedImperatives += typedImperative + case _ => + imperatives += imperative + } + aggBufferAttributes ++= imperative.aggBufferAttributes + inputAggBufferAttributes ++= agg.inputAggBufferAttributes + val noOps = Seq.fill(imperative.aggBufferAttributes.size)(NoOp) + initialValues ++= noOps + updateExpressions ++= noOps + mergeExpressions ++= noOps + imperative + }) + + val updateAttrSeq: AttributeSeq = aggBufferAttributes ++ inputAttributes + val mergeAttrSeq: AttributeSeq = aggBufferAttributes ++ inputAggBufferAttributes + val aggBufferAttributesSeq: AttributeSeq = aggBufferAttributes + + // Create the accumulator. 
+ new AggregatingAccumulator( + aggBufferAttributes.map(_.dataType), + initialValues, + updateExpressions.map(BindReferences.bindReference(_, updateAttrSeq)), + mergeExpressions.map(BindReferences.bindReference(_, mergeAttrSeq)), + resultExpressions.map(BindReferences.bindReference(_, aggBufferAttributesSeq)), + imperatives.toArray, + typedImperatives.toArray, + SQLConf.get) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputPartitioning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputPartitioning.scala new file mode 100644 index 0000000000000..2c7faea019322 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/AliasAwareOutputPartitioning.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} + +/** + * A trait that handles aliases in the `outputExpressions` to produce `outputPartitioning` + * that satisfies output distribution requirements. 
+ */ +trait AliasAwareOutputPartitioning extends UnaryExecNode { + protected def outputExpressions: Seq[NamedExpression] + + final override def outputPartitioning: Partitioning = { + if (hasAlias) { + child.outputPartitioning match { + case h: HashPartitioning => h.copy(expressions = replaceAliases(h.expressions)) + case other => other + } + } else { + child.outputPartitioning + } + } + + private def hasAlias: Boolean = outputExpressions.collectFirst { case _: Alias => }.isDefined + + private def replaceAliases(exprs: Seq[Expression]): Seq[Expression] = { + exprs.map { + case a: AttributeReference => replaceAlias(a).getOrElse(a) + case other => other + } + } + + private def replaceAlias(attr: AttributeReference): Option[Attribute] = { + outputExpressions.collectFirst { + case a @ Alias(child: AttributeReference, _) if child.semanticEquals(attr) => + a.toAttribute + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 10dc74dd8a8ff..413bd7b29cf45 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.command.CommandUtils import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK @@ -80,12 +81,20 @@ class CacheManager extends Logging { } else { val sparkSession = query.sparkSession val qe = sparkSession.sessionState.executePlan(planToCache) - val inMemoryRelation = InMemoryRelation( - sparkSession.sessionState.conf.useCompression, - 
sparkSession.sessionState.conf.columnBatchSize, storageLevel, - qe.executedPlan, - tableName, - optimizedPlan = qe.optimizedPlan) + val originalValue = sparkSession.sessionState.conf.getConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) + val inMemoryRelation = try { + // Avoiding changing the output partitioning, here disable AQE. + sparkSession.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + InMemoryRelation( + sparkSession.sessionState.conf.useCompression, + sparkSession.sessionState.conf.columnBatchSize, storageLevel, + qe.executedPlan, + tableName, + optimizedPlan = qe.optimizedPlan) + } finally { + sparkSession.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, originalValue) + } + this.synchronized { if (lookupCachedData(planToCache).nonEmpty) { logWarning("Data has already been cached.") @@ -261,7 +270,7 @@ class CacheManager extends Logging { case _ => false } - case DataSourceV2Relation(fileTable: FileTable, _, _) => + case DataSourceV2Relation(fileTable: FileTable, _, _, _, _) => refreshFileIndexIfNecessary(fileTable.fileIndex, fs, qualifiedPath) case _ => false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala new file mode 100644 index 0000000000000..e482bc9941ea9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CollectMetricsExec.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import scala.collection.mutable + +import org.apache.spark.TaskContext +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.Partitioning +import org.apache.spark.sql.types.StructType + +/** + * Collect arbitrary (named) metrics from a [[SparkPlan]]. + */ +case class CollectMetricsExec( + name: String, + metricExpressions: Seq[NamedExpression], + child: SparkPlan) + extends UnaryExecNode { + + private lazy val accumulator: AggregatingAccumulator = { + val acc = AggregatingAccumulator(metricExpressions, child.output) + acc.register(sparkContext, Option("Collected metrics")) + acc + } + + val metricsSchema: StructType = { + StructType.fromAttributes(metricExpressions.map(_.toAttribute)) + } + + // This is not used very frequently (once a query); it is not useful to use code generation here. 
+ private lazy val toRowConverter: InternalRow => Row = { + CatalystTypeConverters.createToScalaConverter(metricsSchema) + .asInstanceOf[InternalRow => Row] + } + + def collectedMetrics: Row = toRowConverter(accumulator.value) + + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override protected def doExecute(): RDD[InternalRow] = { + val collector = accumulator + collector.reset() + child.execute().mapPartitions { rows => + // Only publish the value of the accumulator when the task has completed. This is done by + // updating a task local accumulator ('updater') which will be merged with the actual + // accumulator as soon as the task completes. This avoids the following problems during the + // heartbeat: + // - Correctness issues due to partially completed/visible updates. + // - Performance issues due to excessive serialization. + val updater = collector.copyAndReset() + TaskContext.get().addTaskCompletionListener[Unit] { _ => + collector.setState(updater) + } + + rows.map { r => + updater.add(r) + r + } + } + } +} + +object CollectMetricsExec { + /** + * Recursively collect all collected metrics from a query tree. 
+ */ + def collect(plan: SparkPlan): Map[String, Row] = { + val metrics = plan.collectInPlanAndSubqueries { + case collector: CollectMetricsExec => collector.name -> collector.collectedMetrics + } + metrics.toMap + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala index 9d1636ccf2718..e01cd8598db0c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Columnar.scala @@ -323,7 +323,8 @@ private object RowToColumnConverter { val c = row.getInterval(column) cv.appendStruct(false) cv.getChild(0).appendInt(c.months) - cv.getChild(1).appendLong(c.microseconds) + cv.getChild(1).appendInt(c.days) + cv.getChild(2).appendLong(c.microseconds) } } @@ -454,6 +455,7 @@ case class RowToColumnarExec(child: SparkPlan) extends UnaryExecNode { override def next(): ColumnarBatch = { cb.setNumRows(0) + vectors.foreach(_.reset()) var rowCount = 0 while (rowCount < numRows && rowIterator.hasNext) { val row = rowIterator.next() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 4a87049ac292b..0d759085a7e2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -65,10 +65,26 @@ trait DataSourceScanExec extends LeafExecNode { s"$nodeNamePrefix$nodeName${truncatedString(output, "[", ",", "]", maxFields)}$metadataStr") } + override def verboseStringWithOperatorId(): String = { + val metadataStr = metadata.toSeq.sorted.filterNot { + case (_, value) if (value.isEmpty || value.equals("[]")) => true + case (key, _) if (key.equals("DataFilters") || key.equals("Format")) => true + case (_, _) => false + }.map { + case (key, value) => s"$key: 
${redact(value)}" + } + + s""" + |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} + |Output: ${producedAttributes.mkString("[", ", ", "]")} + |${metadataStr.mkString("\n")} + """.stripMargin + } + /** * Shorthand for calling redactString() without specifying redacting rules */ - private def redact(text: String): String = { + protected def redact(text: String): String = { Utils.redact(sqlContext.sessionState.conf.stringRedactionPattern, text) } @@ -171,7 +187,7 @@ case class FileSourceScanExec( partitionSchema = relation.partitionSchema, relation.sparkSession.sessionState.conf) - val driverMetrics: HashMap[String, Long] = HashMap.empty + private lazy val driverMetrics: HashMap[String, Long] = HashMap.empty /** * Send the driver-side metrics. Before calling this function, selectedPartitions has @@ -214,7 +230,7 @@ case class FileSourceScanExec( // call the file index for the files matching all filters except dynamic partition filters val predicate = dynamicPartitionFilters.reduce(And) val partitionColumns = relation.partitionSchema - val boundPredicate = newPredicate(predicate.transform { + val boundPredicate = Predicate.create(predicate.transform { case a: AttributeReference => val index = partitionColumns.indexWhere(a.name == _.name) BoundReference(index, partitionColumns(index).dataType, nullable = true) @@ -309,8 +325,7 @@ case class FileSourceScanExec( } @transient - private val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) - logInfo(s"Pushed Filters: ${pushedDownFilters.mkString(",")}") + private lazy val pushedDownFilters = dataFilters.flatMap(DataSourceStrategy.translateFilter) override lazy val metadata: Map[String, String] = { def seqToString(seq: Seq[Any]) = seq.mkString("[", ", ", "]") @@ -342,6 +357,31 @@ case class FileSourceScanExec( withSelectedBucketsCount } + override def verboseStringWithOperatorId(): String = { + val metadataStr = metadata.toSeq.sorted.filterNot { + case (_, value) if 
(value.isEmpty || value.equals("[]")) => true + case (key, _) if (key.equals("DataFilters") || key.equals("Format")) => true + case (_, _) => false + }.map { + case (key, _) if (key.equals("Location")) => + val location = relation.location + val numPaths = location.rootPaths.length + val abbreviatedLoaction = if (numPaths <= 1) { + location.rootPaths.mkString("[", ", ", "]") + } else { + "[" + location.rootPaths.head + s", ... ${numPaths - 1} entries]" + } + s"$key: ${location.getClass.getSimpleName} ${redact(abbreviatedLoaction)}" + case (key, value) => s"$key: ${redact(value)}" + } + + s""" + |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} + |Output: ${producedAttributes.mkString("[", ", ", "]")} + |${metadataStr.mkString("\n")} + """.stripMargin + } + lazy val inputRDD: RDD[InternalRow] = { val readFile: (PartitionedFile) => Iterator[InternalRow] = relation.fileFormat.buildReaderWithPartitionValues( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainMode.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainMode.scala new file mode 100644 index 0000000000000..0ceafe99cdfcf --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainMode.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import java.util.Locale + +sealed trait ExplainMode { + /** + * String name of the explain mode. + */ + def name: String +} + +/** + * Simple mode means that when printing explain for a DataFrame, only a physical plan is + * expected to be printed to the console. + */ +case object SimpleMode extends ExplainMode { val name = "simple" } + +/** + * Extended mode means that when printing explain for a DataFrame, both logical and physical + * plans are expected to be printed to the console. + */ +case object ExtendedMode extends ExplainMode { val name = "extended" } + +/** + * Codegen mode means that when printing explain for a DataFrame, if generated codes are + * available, a physical plan and the generated codes are expected to be printed to the console. + */ +case object CodegenMode extends ExplainMode { val name = "codegen" } + +/** + * Cost mode means that when printing explain for a DataFrame, if plan node statistics are + * available, a logical plan and the statistics are expected to be printed to the console. + */ +case object CostMode extends ExplainMode { val name = "cost" } + +/** + * Formatted mode means that when printing explain for a DataFrame, explain output is + * expected to be split into two sections: a physical plan outline and node details. + */ +case object FormattedMode extends ExplainMode { val name = "formatted" } + +object ExplainMode { + /** + * Returns the explain mode from the given string. + */ + def fromString(mode: String): ExplainMode = mode.toLowerCase(Locale.ROOT) match { + case SimpleMode.name => SimpleMode + case ExtendedMode.name => ExtendedMode + case CodegenMode.name => CodegenMode + case CostMode.name => CostMode + case FormattedMode.name => FormattedMode + case _ => throw new IllegalArgumentException(s"Unknown explain mode: $mode. 
Accepted " + + "explain modes are 'simple', 'extended', 'codegen', 'cost', 'formatted'.") + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala index 18a7f9822dcbc..d4fe272f8c95f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExplainUtils.scala @@ -41,7 +41,7 @@ object ExplainUtils { * * @param plan Input query plan to process * @param append function used to append the explain output - * @param startOperationID The start value of operation id. The subsequent operations will + * @param startOperatorID The start value of operation id. The subsequent operations will * be assigned higher value. * * @return The last generated operation id for this input plan. This is to ensure we @@ -125,7 +125,7 @@ object ExplainUtils { * appear in the explain output. * 2. operator identifier starts at startOperatorID + 1 * @param plan Input query plan to process - * @param startOperationID The start value of operation id. The subsequent operations will + * @param startOperatorID The start value of operation id. The subsequent operations will * be assigned higher value. * @param operatorIDs A output parameter that contains a map of operator id and query plan. This * is used by caller to print the detail portion of the plan. 
@@ -193,14 +193,14 @@ object ExplainUtils { subqueries: ArrayBuffer[(SparkPlan, Expression, BaseSubqueryExec)]): Unit = { plan.foreach { case p: SparkPlan => - p.expressions.flatMap(_.collect { + p.expressions.foreach (_.collect { case e: PlanExpression[_] => e.plan match { case s: BaseSubqueryExec => subqueries += ((p, e, s)) getSubqueries(s, subqueries) + case _ => } - case other => }) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala index eec8d70b5adf0..5a2f16d8e1526 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/HiveResult.scala @@ -19,12 +19,14 @@ package org.apache.spark.sql.execution import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} +import java.time.{Instant, LocalDate} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter} import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval /** * Runs a query returning the result in Hive compatible form. @@ -53,75 +55,40 @@ object HiveResult { // We need the types so we can output struct field names val types = executedPlan.output.map(_.dataType) // Reformat to match hive tab delimited output. 
- result.map(_.zip(types).map(toHiveString)).map(_.mkString("\t")) + result.map(_.zip(types).map(e => toHiveString(e))) + .map(_.mkString("\t")) } - private def formatDecimal(d: java.math.BigDecimal): String = { - if (d.compareTo(java.math.BigDecimal.ZERO) == 0) { - java.math.BigDecimal.ZERO.toPlainString - } else { - d.stripTrailingZeros().toPlainString // Hive strips trailing zeros - } - } - - private val primitiveTypes = Seq( - StringType, - IntegerType, - LongType, - DoubleType, - FloatType, - BooleanType, - ByteType, - ShortType, - DateType, - TimestampType, - BinaryType) - - private lazy val dateFormatter = DateFormatter() - private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter( - DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)) - - /** Hive outputs fields of structs slightly differently than top level attributes. */ - private def toHiveStructString(a: (Any, DataType)): String = a match { - case (struct: Row, StructType(fields)) => - struct.toSeq.zip(fields).map { - case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}""" - }.mkString("{", ",", "}") - case (seq: Seq[_], ArrayType(typ, _)) => - seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]") - case (map: Map[_, _], MapType(kType, vType, _)) => - map.map { - case (key, value) => - toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType)) - }.toSeq.sorted.mkString("{", ",", "}") - case (null, _) => "null" - case (s: String, StringType) => "\"" + s + "\"" - case (decimal, DecimalType()) => decimal.toString - case (interval, CalendarIntervalType) => interval.toString - case (other, tpe) if primitiveTypes contains tpe => other.toString - } + private lazy val zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone) + private lazy val dateFormatter = DateFormatter(zoneId) + private lazy val timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId) /** Formats a datum (based on the given data type) and 
returns the string representation. */ - def toHiveString(a: (Any, DataType)): String = a match { - case (struct: Row, StructType(fields)) => - struct.toSeq.zip(fields).map { - case (v, t) => s""""${t.name}":${toHiveStructString((v, t.dataType))}""" - }.mkString("{", ",", "}") - case (seq: Seq[_], ArrayType(typ, _)) => - seq.map(v => (v, typ)).map(toHiveStructString).mkString("[", ",", "]") - case (map: Map[_, _], MapType(kType, vType, _)) => - map.map { - case (key, value) => - toHiveStructString((key, kType)) + ":" + toHiveStructString((value, vType)) - }.toSeq.sorted.mkString("{", ",", "}") - case (null, _) => "NULL" + def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match { + case (null, _) => if (nested) "null" else "NULL" + case (b, BooleanType) => b.toString case (d: Date, DateType) => dateFormatter.format(DateTimeUtils.fromJavaDate(d)) + case (ld: LocalDate, DateType) => + dateFormatter.format(DateTimeUtils.localDateToDays(ld)) case (t: Timestamp, TimestampType) => - DateTimeUtils.timestampToString(timestampFormatter, DateTimeUtils.fromJavaTimestamp(t)) + timestampFormatter.format(DateTimeUtils.fromJavaTimestamp(t)) + case (i: Instant, TimestampType) => + timestampFormatter.format(DateTimeUtils.instantToMicros(i)) case (bin: Array[Byte], BinaryType) => new String(bin, StandardCharsets.UTF_8) - case (decimal: java.math.BigDecimal, DecimalType()) => formatDecimal(decimal) - case (interval, CalendarIntervalType) => interval.toString - case (other, _ : UserDefinedType[_]) => other.toString - case (other, tpe) if primitiveTypes.contains(tpe) => other.toString + case (decimal: java.math.BigDecimal, DecimalType()) => decimal.toPlainString + case (n, _: NumericType) => n.toString + case (s: String, StringType) => if (nested) "\"" + s + "\"" else s + case (interval: CalendarInterval, CalendarIntervalType) => interval.toString + case (seq: Seq[_], ArrayType(typ, _)) => + seq.map(v => (v, typ)).map(e => toHiveString(e, true)).mkString("[", ",", 
"]") + case (m: Map[_, _], MapType(kType, vType, _)) => + m.map { case (key, value) => + toHiveString((key, kType), true) + ":" + toHiveString((value, vType), true) + }.toSeq.sorted.mkString("{", ",", "}") + case (struct: Row, StructType(fields)) => + struct.toSeq.zip(fields).map { case (v, t) => + s""""${t.name}":${toHiveString((v, t.dataType), true)}""" + }.mkString("{", ",", "}") + case (other, _: UserDefinedType[_]) => other.toString } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala index 9e32ecfdd80a7..b452213cd6cc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScanExec.scala @@ -45,10 +45,14 @@ case class LocalTableScanExec( } } - private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), - sqlContext.sparkContext.defaultParallelism) - - private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) + @transient private lazy val rdd: RDD[InternalRow] = { + if (rows.isEmpty) { + sqlContext.sparkContext.emptyRDD + } else { + val numSlices = math.min(unsafeRows.length, sqlContext.sparkContext.defaultParallelism) + sqlContext.sparkContext.parallelize(unsafeRows, numSlices) + } + } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") @@ -77,6 +81,12 @@ case class LocalTableScanExec( taken } + override def executeTail(limit: Int): Array[InternalRow] = { + val taken: Seq[InternalRow] = unsafeRows.takeRight(limit) + longMetric("numOutputRows").add(taken.size) + taken.toArray + } + // Input is already UnsafeRows. 
override protected val createUnsafeProjection: Boolean = false diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala index 3196624f7c7c3..7dece29eb0212 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/PartitionedFileUtil.scala @@ -64,13 +64,14 @@ object PartitionedFileUtil { offset: Long, length: Long): Array[String] = { val candidates = blockLocations.map { - // The fragment starts from a position within this block + // The fragment starts from a position within this block. It handles the case where the + // fragment is fully contained in the block. case b if b.getOffset <= offset && offset < b.getOffset + b.getLength => b.getHosts -> (b.getOffset + b.getLength - offset).min(length) // The fragment ends at a position within this block - case b if offset <= b.getOffset && offset + length < b.getLength => - b.getHosts -> (offset + length - b.getOffset).min(length) + case b if b.getOffset < offset + length && offset + length < b.getOffset + b.getLength => + b.getHosts -> (offset + length - b.getOffset) // The fragment fully contains this block case b if offset <= b.getOffset && b.getOffset + b.getLength <= offset + length => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index e5e86db29fe61..9109c05e75853 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -18,22 +18,26 @@ package org.apache.spark.sql.execution import java.io.{BufferedWriter, OutputStreamWriter} +import java.util.UUID import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{AnalysisException, 
SparkSession} +import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker +import org.apache.spark.sql.catalyst.expressions.codegen.ByteCodeStats import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.dynamicpruning.PlanDynamicPruningFilters -import org.apache.spark.sql.execution.adaptive.InsertAdaptiveSparkPlan +import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan} import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange} +import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.util.Utils /** @@ -59,13 +63,12 @@ class QueryExecution( } } - lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) { - SparkSession.setActiveSession(sparkSession) + lazy val analyzed: LogicalPlan = executePhase(QueryPlanningTracker.ANALYSIS) { // We can't clone `logical` here, which will reset the `_analyzed` flag. 
sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker) } - lazy val withCachedData: LogicalPlan = { + lazy val withCachedData: LogicalPlan = sparkSession.withActive { assertAnalyzed() assertSupported() // clone the plan to avoid sharing the plan instance between different stages like analyzing, @@ -73,26 +76,23 @@ class QueryExecution( sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone()) } - lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) { + lazy val optimizedPlan: LogicalPlan = executePhase(QueryPlanningTracker.OPTIMIZATION) { // clone the plan to avoid sharing the plan instance between different stages like analyzing, // optimizing and planning. sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker) } - lazy val sparkPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { - SparkSession.setActiveSession(sparkSession) - // TODO: We use next(), i.e. take the first plan returned by the planner, here for now, - // but we will implement to choose the best plan. + lazy val sparkPlan: SparkPlan = executePhase(QueryPlanningTracker.PLANNING) { // Clone the logical plan here, in case the planner rules change the states of the logical plan. - planner.plan(ReturnAnswer(optimizedPlan.clone())).next() + QueryExecution.createSparkPlan(sparkSession, planner, optimizedPlan.clone()) } // executedPlan should not be used to initialize any SparkPlan. It should be // only used for execution. - lazy val executedPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) { + lazy val executedPlan: SparkPlan = executePhase(QueryPlanningTracker.PLANNING) { // clone the plan to avoid sharing the plan instance between different stages like analyzing, // optimizing and planning. 
- prepareForExecution(sparkPlan.clone()) + QueryExecution.prepareForExecution(preparations, sparkPlan.clone()) } /** @@ -105,29 +105,19 @@ class QueryExecution( * Given QueryExecution is not a public class, end users are discouraged to use this: please * use `Dataset.rdd` instead where conversion will be applied. */ - lazy val toRdd: RDD[InternalRow] = executedPlan.execute() + lazy val toRdd: RDD[InternalRow] = new SQLExecutionRDD( + executedPlan.execute(), sparkSession.sessionState.conf) - /** - * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal - * row format conversions as needed. - */ - protected def prepareForExecution(plan: SparkPlan): SparkPlan = { - preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) } + /** Get the metrics observed during the execution of the query plan. */ + def observedMetrics: Map[String, Row] = CollectMetricsExec.collect(executedPlan) + + protected def preparations: Seq[Rule[SparkPlan]] = { + QueryExecution.preparations(sparkSession) } - /** A sequence of rules that will be applied in order to the physical plan before execution. */ - protected def preparations: Seq[Rule[SparkPlan]] = Seq( - // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op - // as the original plan is hidden behind `AdaptiveSparkPlanExec`. 
- InsertAdaptiveSparkPlan(sparkSession, this), - PlanDynamicPruningFilters(sparkSession), - PlanSubqueries(sparkSession), - EnsureRequirements(sparkSession.sessionState.conf), - ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf, - sparkSession.sessionState.columnarRules), - CollapseCodegenStages(sparkSession.sessionState.conf), - ReuseExchange(sparkSession.sessionState.conf), - ReuseSubquery(sparkSession.sessionState.conf)) + private def executePhase[T](phase: String)(block: => T): T = sparkSession.withActive { + tracker.measurePhase(phase)(block) + } def simpleString: String = simpleString(false) @@ -135,7 +125,12 @@ class QueryExecution( val concat = new PlanStringConcat() concat.append("== Physical Plan ==\n") if (formatted) { - ExplainUtils.processPlan(executedPlan, concat.append) + try { + ExplainUtils.processPlan(executedPlan, concat.append) + } catch { + case e: AnalysisException => concat.append(e.toString) + case e: IllegalArgumentException => concat.append(e.toString) + } } else { QueryPlan.append(executedPlan, concat.append, verbose = false, addSuffix = false) } @@ -143,24 +138,54 @@ class QueryExecution( concat.toString } + def explainString(mode: ExplainMode): String = { + val queryExecution = if (logical.isStreaming) { + // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the + // output mode does not matter since there is no `Sink`. 
+ new IncrementalExecution( + sparkSession, logical, OutputMode.Append(), "", + UUID.randomUUID, UUID.randomUUID, 0, OffsetSeqMetadata(0, 0)) + } else { + this + } + + mode match { + case SimpleMode => + queryExecution.simpleString + case ExtendedMode => + queryExecution.toString + case CodegenMode => + try { + org.apache.spark.sql.execution.debug.codegenString(queryExecution.executedPlan) + } catch { + case e: AnalysisException => e.toString + } + case CostMode => + queryExecution.stringWithStats + case FormattedMode => + queryExecution.simpleString(formatted = true) + } + } + private def writePlans(append: String => Unit, maxFields: Int): Unit = { val (verbose, addSuffix) = (true, false) append("== Parsed Logical Plan ==\n") QueryPlan.append(logical, append, verbose, addSuffix, maxFields) append("\n== Analyzed Logical Plan ==\n") - val analyzedOutput = try { - truncatedString( - analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}"), ", ", maxFields) + try { + append( + truncatedString( + analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}"), ", ", maxFields) + ) + append("\n") + QueryPlan.append(analyzed, append, verbose, addSuffix, maxFields) + append("\n== Optimized Logical Plan ==\n") + QueryPlan.append(optimizedPlan, append, verbose, addSuffix, maxFields) + append("\n== Physical Plan ==\n") + QueryPlan.append(executedPlan, append, verbose, addSuffix, maxFields) } catch { - case e: AnalysisException => e.toString + case e: AnalysisException => append(e.toString) } - append(analyzedOutput) - append("\n") - QueryPlan.append(analyzed, append, verbose, addSuffix, maxFields) - append("\n== Optimized Logical Plan ==\n") - QueryPlan.append(optimizedPlan, append, verbose, addSuffix, maxFields) - append("\n== Physical Plan ==\n") - QueryPlan.append(executedPlan, append, verbose, addSuffix, maxFields) } override def toString: String = withRedaction { @@ -174,8 +199,11 @@ class QueryExecution( val maxFields = SQLConf.get.maxToStringFields // 
trigger to compute stats for logical plans - optimizedPlan.stats - + try { + optimizedPlan.stats + } catch { + case e: AnalysisException => concat.append(e.toString + "\n") + } // only show optimized logical plan and physical plan concat.append("== Optimized Logical Plan ==\n") QueryPlan.append(optimizedPlan, concat.append, verbose = true, addSuffix = true, maxFields) @@ -212,7 +240,7 @@ class QueryExecution( * * @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ - def codegenToSeq(): Seq[(String, String)] = { + def codegenToSeq(): Seq[(String, String, ByteCodeStats)] = { org.apache.spark.sql.execution.debug.codegenStringSeq(executedPlan) } @@ -238,3 +266,79 @@ class QueryExecution( } } } + +object QueryExecution { + /** + * Construct a sequence of rules that are used to prepare a planned [[SparkPlan]] for execution. + * These rules will make sure subqueries are planned, make use the data partitioning and ordering + * are correct, insert whole stage code gen, and try to reduce the work done by reusing exchanges + * and subqueries. + */ + private[execution] def preparations(sparkSession: SparkSession): Seq[Rule[SparkPlan]] = { + + val sparkSessionWithAdaptiveExecutionOff = + if (sparkSession.sessionState.conf.adaptiveExecutionEnabled) { + val session = sparkSession.cloneSession() + session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) + session + } else { + sparkSession + } + + Seq( + // `AdaptiveSparkPlanExec` is a leaf node. If inserted, all the following rules will be no-op + // as the original plan is hidden behind `AdaptiveSparkPlanExec`. + InsertAdaptiveSparkPlan(AdaptiveExecutionContext(sparkSession)), + // If the following rules apply, it means the main query is not AQE-ed, so we make sure the + // subqueries are not AQE-ed either. 
+ PlanDynamicPruningFilters(sparkSessionWithAdaptiveExecutionOff), + PlanSubqueries(sparkSessionWithAdaptiveExecutionOff), + EnsureRequirements(sparkSession.sessionState.conf), + ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf, + sparkSession.sessionState.columnarRules), + CollapseCodegenStages(sparkSession.sessionState.conf), + ReuseExchange(sparkSession.sessionState.conf), + ReuseSubquery(sparkSession.sessionState.conf) + ) + } + + /** + * Prepares a planned [[SparkPlan]] for execution by inserting shuffle operations and internal + * row format conversions as needed. + */ + private[execution] def prepareForExecution( + preparations: Seq[Rule[SparkPlan]], + plan: SparkPlan): SparkPlan = { + preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) } + } + + /** + * Transform a [[LogicalPlan]] into a [[SparkPlan]]. + * + * Note that the returned physical plan still needs to be prepared for execution. + */ + def createSparkPlan( + sparkSession: SparkSession, + planner: SparkPlanner, + plan: LogicalPlan): SparkPlan = { + // TODO: We use next(), i.e. take the first plan returned by the planner, here for now, + // but we will implement to choose the best plan. + planner.plan(ReturnAnswer(plan)).next() + } + + /** + * Prepare the [[SparkPlan]] for execution. + */ + def prepareExecutedPlan(spark: SparkSession, plan: SparkPlan): SparkPlan = { + prepareForExecution(preparations(spark), plan) + } + + /** + * Transform the subquery's [[LogicalPlan]] into a [[SparkPlan]] and prepare the resulting + * [[SparkPlan]] for execution. 
+ */ + def prepareExecutedPlan(spark: SparkSession, plan: LogicalPlan): SparkPlan = { + val sparkPlan = createSparkPlan(spark, spark.sessionState.planner, plan.clone()) + prepareExecutedPlan(spark, sparkPlan) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala index 6046805ae95d4..59c503e372535 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecution.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.atomic.AtomicLong +import scala.concurrent.{ExecutionContext, Future} + import org.apache.spark.SparkContext import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.SparkSession @@ -60,9 +62,9 @@ object SQLExecution { * we can connect them with an execution. */ def withNewExecutionId[T]( - sparkSession: SparkSession, queryExecution: QueryExecution, - name: Option[String] = None)(body: => T): T = { + name: Option[String] = None)(body: => T): T = queryExecution.sparkSession.withActive { + val sparkSession = queryExecution.sparkSession val sc = sparkSession.sparkContext val oldExecutionId = sc.getLocalProperty(EXECUTION_ID_KEY) val executionId = SQLExecution.nextExecutionId @@ -164,4 +166,30 @@ object SQLExecution { } } } + + /** + * Wrap passed function to ensure necessary thread-local variables like + * SparkContext local properties are forwarded to execution thread + */ + def withThreadLocalCaptured[T]( + sparkSession: SparkSession, exec: ExecutionContext)(body: => T): Future[T] = { + val activeSession = sparkSession + val sc = sparkSession.sparkContext + val localProps = Utils.cloneProperties(sc.getLocalProperties) + Future { + val originalSession = SparkSession.getActiveSession + val originalLocalProps = sc.getLocalProperties + 
SparkSession.setActiveSession(activeSession) + sc.setLocalProperties(localProps) + val res = body + // reset active session and local props. + sc.setLocalProperties(originalLocalProps) + if (originalSession.nonEmpty) { + SparkSession.setActiveSession(originalSession.get) + } else { + SparkSession.clearActiveSession() + } + res + }(exec) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecutionRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecutionRDD.scala new file mode 100644 index 0000000000000..45b9cadc4aeda --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SQLExecutionRDD.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.spark.{Partition, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.internal.SQLConf + +/** + * It is just a wrapper over `sqlRDD`, which sets and makes effective all the configs from the + * captured `SQLConf`. + * Please notice that this means we may miss configurations set after the creation of this RDD and + * before its execution. 
+ * + * @param sqlRDD the `RDD` generated by the SQL plan + * @param conf the `SQLConf` to apply to the execution of the SQL plan + */ +class SQLExecutionRDD( + var sqlRDD: RDD[InternalRow], @transient conf: SQLConf) extends RDD[InternalRow](sqlRDD) { + private val sqlConfigs = conf.getAllConfs + private lazy val sqlConfExecutorSide = { + val newConf = new SQLConf() + sqlConfigs.foreach { case (k, v) => newConf.setConfString(k, v) } + newConf + } + + override val partitioner = firstParent[InternalRow].partitioner + + override def getPartitions: Array[Partition] = firstParent[InternalRow].partitions + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + // If we are in the context of a tracked SQL operation, `SQLExecution.EXECUTION_ID_KEY` is set + // and we have nothing to do here. Otherwise, we use the `SQLConf` captured at the creation of + // this RDD. + if (context.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) == null) { + SQLConf.withExistingConf(sqlConfExecutorSide) { + firstParent[InternalRow].iterator(split, context) + } + } else { + firstParent[InternalRow].iterator(split, context) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index 079ff25fcb67e..4c19f95796d04 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -21,8 +21,10 @@ import java.util.Arrays import org.apache.spark._ import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} +import org.apache.spark.sql.internal.SQLConf /** * The [[Partition]] used by [[ShuffledRowRDD]]. 
A post-shuffle partition @@ -117,6 +119,11 @@ class ShuffledRowRDD( specifiedPartitionStartIndices: Option[Array[Int]] = None) extends RDD[InternalRow](dependency.rdd.context, Nil) { + if (SQLConf.get.fetchShuffleBlocksInBatchEnabled) { + dependency.rdd.context.setLocalProperty( + SortShuffleManager.FETCH_SHUFFLE_BLOCKS_IN_BATCH_ENABLED_KEY, "true") + } + private[this] val numPreShufflePartitions = dependency.partitioner.numPartitions private[this] val partitionStartIndices: Array[Int] = specifiedPartitionStartIndices match { @@ -172,7 +179,7 @@ class ShuffledRowRDD( reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() dependency = null } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index 0a955d6a75235..6b6ca531c6d3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -62,8 +62,16 @@ case class SortExec( "peakMemory" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory"), "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) + private[sql] var rowSorter: UnsafeExternalRowSorter = _ + + /** + * This method gets invoked only once for each SortExec instance to initialize an + * UnsafeExternalRowSorter, both `plan.execute` and code generation 
are using it. + * In the code generation code path, we need to call this function outside the class so we + * should make it public. + */ def createSorter(): UnsafeExternalRowSorter = { - val ordering = newOrdering(sortOrder, output) + val ordering = RowOrdering.create(sortOrder, output) // The comparator for comparing prefix val boundSortExpression = BindReferences.bindReference(sortOrder.head, output) @@ -87,13 +95,13 @@ case class SortExec( } val pageSize = SparkEnv.get.memoryManager.pageSizeBytes - val sorter = UnsafeExternalRowSorter.create( + rowSorter = UnsafeExternalRowSorter.create( schema, ordering, prefixComparator, prefixComputer, pageSize, canUseRadixSort) if (testSpillFrequency > 0) { - sorter.setTestSpillFrequency(testSpillFrequency) + rowSorter.setTestSpillFrequency(testSpillFrequency) } - sorter + rowSorter } protected override def doExecute(): RDD[InternalRow] = { @@ -181,4 +189,17 @@ case class SortExec( |$sorterVariable.insertRow((UnsafeRow)${row.value}); """.stripMargin } + + /** + * In SortExec, we overwrites cleanupResources to close UnsafeExternalRowSorter. + */ + override protected[sql] def cleanupResources(): Unit = { + if (rowSorter != null) { + // There's possible for rowSorter is null here, for example, in the scenario of empty + // iterator in the current task, the downstream physical node(like SortMergeJoinExec) will + // trigger cleanupResources before rowSorter initialized in createSorter. 
+ rowSorter.cleanupResources() + } + super.cleanupResources() + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala index 98d6be0374da7..013d94768a2a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala @@ -20,20 +20,27 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.ExperimentalMethods import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning} import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions import org.apache.spark.sql.execution.datasources.SchemaPruning +import org.apache.spark.sql.execution.datasources.v2.V2ScanRelationPushDown import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs} class SparkOptimizer( + catalogManager: CatalogManager, catalog: SessionCatalog, experimentalMethods: ExperimentalMethods) - extends Optimizer(catalog) { + extends Optimizer(catalogManager) { + + override def earlyScanPushDownRules: Seq[Rule[LogicalPlan]] = + // TODO: move SchemaPruning into catalyst + SchemaPruning :: V2ScanRelationPushDown :: PruneFileSourcePartitions :: Nil override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+ Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+ - Batch("Prune File Source Table Partitions", Once, PruneFileSourcePartitions) :+ - Batch("Schema Pruning", Once, SchemaPruning) :+ Batch("PartitionPruning", Once, 
PartitionPruning, OptimizeSubqueries) :+ @@ -62,7 +69,8 @@ class SparkOptimizer( override def nonExcludableRules: Seq[String] = super.nonExcludableRules :+ ExtractPythonUDFFromJoinCondition.ruleName :+ ExtractPythonUDFFromAggregate.ruleName :+ ExtractGroupingPythonUDFFromAggregate.ruleName :+ - ExtractPythonUDFs.ruleName + ExtractPythonUDFs.ruleName :+ + V2ScanRelationPushDown.ruleName /** * Optimization batches that are executed before the regular optimization batches (also before diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index b4cdf9e16b7e5..3301e9b5ab180 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -20,11 +20,7 @@ package org.apache.spark.sql.execution import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream} import java.util.concurrent.atomic.AtomicInteger -import scala.collection.mutable.ArrayBuffer -import scala.concurrent.ExecutionContext - -import org.codehaus.commons.compiler.CompileException -import org.codehaus.janino.InternalCompilerException +import scala.collection.mutable.{ArrayBuffer, ListBuffer} import org.apache.spark.{broadcast, SparkEnv} import org.apache.spark.internal.Logging @@ -33,13 +29,11 @@ import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.{Predicate => GenPredicate, _} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.trees.TreeNodeTag import 
org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.types.DataType import org.apache.spark.sql.vectorized.ColumnarBatch object SparkPlan { @@ -73,16 +67,6 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ val id: Int = SparkPlan.newPlanId() - // sqlContext will be null when SparkPlan nodes are created without the active sessions. - val subexpressionEliminationEnabled: Boolean = if (sqlContext != null) { - sqlContext.conf.subexpressionEliminationEnabled - } else { - false - } - - // whether we should fallback when hitting compilation errors caused by codegen - private val codeGenFallBack = (sqlContext == null) || sqlContext.conf.codegenFallback - /** * Return true if this stage of the plan supports columnar execution. */ @@ -141,6 +125,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ */ def resetMetrics(): Unit = { metrics.valuesIterator.foreach(_.reset()) + children.foreach(_.resetMetrics()) } /** @@ -325,20 +310,38 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ * UnsafeRow is highly compressible (at least 8 bytes for any column), the byte array is also * compressed. */ - private def getByteArrayRdd(n: Int = -1): RDD[(Long, Array[Byte])] = { + private def getByteArrayRdd( + n: Int = -1, takeFromEnd: Boolean = false): RDD[(Long, Array[Byte])] = { execute().mapPartitionsInternal { iter => var count = 0 val buffer = new Array[Byte](4 << 10) // 4K val codec = CompressionCodec.createCodec(SparkEnv.get.conf) val bos = new ByteArrayOutputStream() val out = new DataOutputStream(codec.compressedOutputStream(bos)) - // `iter.hasNext` may produce one row and buffer it, we should only call it when the limit is - // not hit. 
- while ((n < 0 || count < n) && iter.hasNext) { - val row = iter.next().asInstanceOf[UnsafeRow] - out.writeInt(row.getSizeInBytes) - row.writeToStream(out, buffer) - count += 1 + + if (takeFromEnd && n > 0) { + // To collect n from the last, we should anyway read everything with keeping the n. + // Otherwise, we don't know where is the last from the iterator. + var last: Seq[UnsafeRow] = Seq.empty[UnsafeRow] + val slidingIter = iter.map(_.copy()).sliding(n) + while (slidingIter.hasNext) { last = slidingIter.next().asInstanceOf[Seq[UnsafeRow]] } + var i = 0 + count = last.length + while (i < count) { + val row = last(i) + out.writeInt(row.getSizeInBytes) + row.writeToStream(out, buffer) + i += 1 + } + } else { + // `iter.hasNext` may produce one row and buffer it, we should only call it when the + // limit is not hit. + while ((n < 0 || count < n) && iter.hasNext) { + val row = iter.next().asInstanceOf[UnsafeRow] + out.writeInt(row.getSizeInBytes) + row.writeToStream(out, buffer) + count += 1 + } } out.writeInt(-1) out.flush() @@ -413,14 +416,23 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ * * This is modeled after `RDD.take` but never runs any job locally on the driver. */ - def executeTake(n: Int): Array[InternalRow] = { + def executeTake(n: Int): Array[InternalRow] = executeTake(n, takeFromEnd = false) + + /** + * Runs this query returning the last `n` rows as an array. + * + * This is modeled after `RDD.take` but never runs any job locally on the driver. 
+ */ + def executeTail(n: Int): Array[InternalRow] = executeTake(n, takeFromEnd = true) + + private def executeTake(n: Int, takeFromEnd: Boolean): Array[InternalRow] = { if (n == 0) { return new Array[InternalRow](0) } - val childRDD = getByteArrayRdd(n) + val childRDD = getByteArrayRdd(n, takeFromEnd) - val buf = new ArrayBuffer[InternalRow] + val buf = if (takeFromEnd) new ListBuffer[InternalRow] else new ArrayBuffer[InternalRow] val totalParts = childRDD.partitions.length var partsScanned = 0 while (buf.length < n && partsScanned < totalParts) { @@ -442,70 +454,57 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ } } - val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) + val parts = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) + val partsToScan = if (takeFromEnd) { + // Reverse partitions to scan. So, if parts was [1, 2, 3] in 200 partitions (0 to 199), + // it becomes [198, 197, 196]. + parts.map(p => (totalParts - 1) - p) + } else { + parts + } val sc = sqlContext.sparkContext val res = sc.runJob(childRDD, (it: Iterator[(Long, Array[Byte])]) => - if (it.hasNext) it.next() else (0L, Array.empty[Byte]), p) + if (it.hasNext) it.next() else (0L, Array.emptyByteArray), partsToScan) var i = 0 - while (buf.length < n && i < res.length) { - val rows = decodeUnsafeRows(res(i)._2) - val rowsToTake = if (n - buf.length >= res(i)._1) { - rows.toArray - } else { - rows.take(n - buf.length).toArray + + if (takeFromEnd) { + while (buf.length < n && i < res.length) { + val rows = decodeUnsafeRows(res(i)._2) + if (n - buf.length >= res(i)._1) { + buf.prepend(rows.toArray[InternalRow]: _*) + } else { + val dropUntil = res(i)._1 - (n - buf.length) + // Same as Iterator.drop but this only takes a long. 
+ var j: Long = 0L + while (j < dropUntil) { rows.next(); j += 1L} + buf.prepend(rows.toArray[InternalRow]: _*) + } + i += 1 + } + } else { + while (buf.length < n && i < res.length) { + val rows = decodeUnsafeRows(res(i)._2) + if (n - buf.length >= res(i)._1) { + buf ++= rows.toArray[InternalRow] + } else { + buf ++= rows.take(n - buf.length).toArray[InternalRow] + } + i += 1 } - buf ++= rowsToTake - i += 1 } - partsScanned += p.size + partsScanned += partsToScan.size } buf.toArray } - protected def newMutableProjection( - expressions: Seq[Expression], - inputSchema: Seq[Attribute], - useSubexprElimination: Boolean = false): MutableProjection = { - log.debug(s"Creating MutableProj: $expressions, inputSchema: $inputSchema") - MutableProjection.create(expressions, inputSchema) - } - - private def genInterpretedPredicate( - expression: Expression, inputSchema: Seq[Attribute]): InterpretedPredicate = { - val str = expression.toString - val logMessage = if (str.length > 256) { - str.substring(0, 256 - 3) + "..." - } else { - str - } - logWarning(s"Codegen disabled for this expression:\n $logMessage") - InterpretedPredicate.create(expression, inputSchema) - } - - protected def newPredicate( - expression: Expression, inputSchema: Seq[Attribute]): GenPredicate = { - try { - GeneratePredicate.generate(expression, inputSchema) - } catch { - case _ @ (_: InternalCompilerException | _: CompileException) if codeGenFallBack => - genInterpretedPredicate(expression, inputSchema) - } - } - - protected def newOrdering( - order: Seq[SortOrder], inputSchema: Seq[Attribute]): Ordering[InternalRow] = { - GenerateOrdering.generate(order, inputSchema) - } - /** - * Creates a row ordering for the given schema, in natural ascending order. + * Cleans up the resources used by the physical operator (if any). 
In general, all the resources + * should be cleaned up when the task finishes but operators like SortMergeJoinExec and LimitExec + * may want eager cleanup to free up tight resources (e.g., memory). */ - protected def newNaturalAscendingOrdering(dataTypes: Seq[DataType]): Ordering[InternalRow] = { - val order: Seq[SortOrder] = dataTypes.zipWithIndex.map { - case (dt, index) => SortOrder(BoundReference(index, dt, nullable = true), Ascending) - } - newOrdering(order, Seq.empty) + protected[sql] def cleanupResources(): Unit = { + children.foreach(_.cleanupResources()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala index 8c7752c4bb742..5b72ec058e127 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, LocalShuffleReaderExec, QueryStageExec} import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo import org.apache.spark.sql.internal.SQLConf @@ -71,6 +71,7 @@ private[execution] object SparkPlanInfo { plan.nodeName, plan.simpleString(SQLConf.get.maxToStringFields), children.map(fromSparkPlan), - metadata, metrics) + metadata, + metrics) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala index dc7fb7741e7a7..895eeedd86b8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanner.scala @@ -17,7 
+17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -27,7 +26,7 @@ import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Strategy import org.apache.spark.sql.internal.SQLConf class SparkPlanner( - val sparkContext: SparkContext, + val session: SparkSession, val conf: SQLConf, val experimentalMethods: ExperimentalMethods) extends SparkStrategies { @@ -39,7 +38,7 @@ class SparkPlanner( extraPlanningStrategies ++ ( LogicalQueryStageStrategy :: PythonEvals :: - DataSourceV2Strategy :: + new DataSourceV2Strategy(session) :: FileSourceStrategy :: DataSourceStrategy(conf) :: SpecialLimits :: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 83cdc7a978a9a..aa139cb6b0c3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.types.StructType /** * Concrete parser for Spark SQL statements. */ -class SparkSqlParser(conf: SQLConf) extends AbstractSqlParser { +class SparkSqlParser(conf: SQLConf) extends AbstractSqlParser(conf) { val astBuilder = new SparkSqlAstBuilder(conf) private val substitutor = new VariableSubstitution(conf) @@ -89,151 +89,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { ResetCommand } - /** - * Create an [[AnalyzeTableCommand]] command, or an [[AnalyzePartitionCommand]] - * or an [[AnalyzeColumnCommand]] command. 
- * Example SQL for analyzing a table or a set of partitions : - * {{{ - * ANALYZE TABLE [db_name.]tablename [PARTITION (partcol1[=val1], partcol2[=val2], ...)] - * COMPUTE STATISTICS [NOSCAN]; - * }}} - * - * Example SQL for analyzing columns : - * {{{ - * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR COLUMNS column1, column2; - * }}} - * - * Example SQL for analyzing all columns of a table: - * {{{ - * ANALYZE TABLE [db_name.]tablename COMPUTE STATISTICS FOR ALL COLUMNS; - * }}} - */ - override def visitAnalyze(ctx: AnalyzeContext): LogicalPlan = withOrigin(ctx) { - def checkPartitionSpec(): Unit = { - if (ctx.partitionSpec != null) { - logWarning("Partition specification is ignored when collecting column statistics: " + - ctx.partitionSpec.getText) - } - } - if (ctx.identifier != null && - ctx.identifier.getText.toLowerCase(Locale.ROOT) != "noscan") { - throw new ParseException(s"Expected `NOSCAN` instead of `${ctx.identifier.getText}`", ctx) - } - - val table = visitTableIdentifier(ctx.tableIdentifier) - if (ctx.ALL() != null) { - checkPartitionSpec() - AnalyzeColumnCommand(table, None, allColumns = true) - } else if (ctx.identifierSeq() == null) { - if (ctx.partitionSpec != null) { - AnalyzePartitionCommand(table, visitPartitionSpec(ctx.partitionSpec), - noscan = ctx.identifier != null) - } else { - AnalyzeTableCommand(table, noscan = ctx.identifier != null) - } - } else { - checkPartitionSpec() - AnalyzeColumnCommand(table, - Option(visitIdentifierSeq(ctx.identifierSeq())), allColumns = false) - } - } - - /** - * Create a [[SetDatabaseCommand]] logical plan. - */ - override def visitUse(ctx: UseContext): LogicalPlan = withOrigin(ctx) { - SetDatabaseCommand(ctx.db.getText) - } - - /** - * Create a [[ShowTablesCommand]] logical plan. 
- * Example SQL : - * {{{ - * SHOW TABLE EXTENDED [(IN|FROM) database_name] LIKE 'identifier_with_wildcards' - * [PARTITION(partition_spec)]; - * }}} - */ - override def visitShowTable(ctx: ShowTableContext): LogicalPlan = withOrigin(ctx) { - val partitionSpec = Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec) - ShowTablesCommand( - Option(ctx.db).map(_.getText), - Option(ctx.pattern).map(string), - isExtended = true, - partitionSpec = partitionSpec) - } - - /** - * Create a [[ShowDatabasesCommand]] logical plan. - * Example SQL: - * {{{ - * SHOW (DATABASES|SCHEMAS) [LIKE 'identifier_with_wildcards']; - * }}} - */ - override def visitShowDatabases(ctx: ShowDatabasesContext): LogicalPlan = withOrigin(ctx) { - ShowDatabasesCommand(Option(ctx.pattern).map(string)) - } - - /** - * A command for users to list the properties for a table. If propertyKey is specified, the value - * for the propertyKey is returned. If propertyKey is not specified, all the keys and their - * corresponding values are returned. - * The syntax of using this command in SQL is: - * {{{ - * SHOW TBLPROPERTIES table_name[('propertyKey')]; - * }}} - */ - override def visitShowTblProperties( - ctx: ShowTblPropertiesContext): LogicalPlan = withOrigin(ctx) { - ShowTablePropertiesCommand( - visitTableIdentifier(ctx.tableIdentifier), - Option(ctx.key).map(visitTablePropertyKey)) - } - - /** - * A command for users to list the column names for a table. - * This function creates a [[ShowColumnsCommand]] logical plan. - * - * The syntax of using this command in SQL is: - * {{{ - * SHOW COLUMNS (FROM | IN) table_identifier [(FROM | IN) database]; - * }}} - */ - override def visitShowColumns(ctx: ShowColumnsContext): LogicalPlan = withOrigin(ctx) { - ShowColumnsCommand(Option(ctx.db).map(_.getText), visitTableIdentifier(ctx.tableIdentifier)) - } - - /** - * A command for users to list the partition names of a table. If partition spec is specified, - * partitions that match the spec are returned. 
Otherwise an empty result set is returned. - * - * This function creates a [[ShowPartitionsCommand]] logical plan - * - * The syntax of using this command in SQL is: - * {{{ - * SHOW PARTITIONS table_identifier [partition_spec]; - * }}} - */ - override def visitShowPartitions(ctx: ShowPartitionsContext): LogicalPlan = withOrigin(ctx) { - val table = visitTableIdentifier(ctx.tableIdentifier) - val partitionKeys = Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec) - ShowPartitionsCommand(table, partitionKeys) - } - - /** - * Creates a [[ShowCreateTableCommand]] - */ - override def visitShowCreateTable(ctx: ShowCreateTableContext): LogicalPlan = withOrigin(ctx) { - val table = visitTableIdentifier(ctx.tableIdentifier()) - ShowCreateTableCommand(table) - } - - /** - * Create a [[RefreshTable]] logical plan. - */ - override def visitRefreshTable(ctx: RefreshTableContext): LogicalPlan = withOrigin(ctx) { - RefreshTable(visitTableIdentifier(ctx.tableIdentifier)) - } - /** * Create a [[RefreshResource]] logical plan. */ @@ -256,28 +111,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { unquotedPath } - /** - * Create a [[CacheTableCommand]] logical plan. - */ - override def visitCacheTable(ctx: CacheTableContext): LogicalPlan = withOrigin(ctx) { - val query = Option(ctx.query).map(plan) - val tableIdent = visitTableIdentifier(ctx.tableIdentifier) - if (query.isDefined && tableIdent.database.isDefined) { - val database = tableIdent.database.get - throw new ParseException(s"It is not allowed to add database prefix `$database` to " + - s"the table name in CACHE TABLE AS SELECT", ctx) - } - val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) - CacheTableCommand(tableIdent, query, ctx.LAZY != null, options) - } - - /** - * Create an [[UncacheTableCommand]] logical plan. 
- */ - override def visitUncacheTable(ctx: UncacheTableContext): LogicalPlan = withOrigin(ctx) { - UncacheTableCommand(visitTableIdentifier(ctx.tableIdentifier), ctx.EXISTS != null) - } - /** * Create a [[ClearCacheCommand]] logical plan. */ @@ -303,10 +136,13 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } else { ExplainCommand( logicalPlan = statement, - extended = ctx.EXTENDED != null, - codegen = ctx.CODEGEN != null, - cost = ctx.COST != null, - formatted = ctx.FORMATTED != null) + mode = { + if (ctx.EXTENDED != null) ExtendedMode + else if (ctx.CODEGEN != null) CodegenMode + else if (ctx.COST != null) CostMode + else if (ctx.FORMATTED != null) FormattedMode + else SimpleMode + }) } } @@ -354,22 +190,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { if (external) { operationNotAllowed("CREATE EXTERNAL TABLE ... USING", ctx) } - - checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) - checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx) - checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) - checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) - checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) - checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) - if (ifNotExists) { // Unlike CREATE TEMPORARY VIEW USING, CREATE TEMPORARY TABLE USING does not support // IF NOT EXISTS. Users are not allowed to replace the existing temp table. 
operationNotAllowed("CREATE TEMPORARY TABLE IF NOT EXISTS", ctx) } - val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) - val provider = ctx.tableProvider.qualifiedName.getText + val (_, _, _, options, _, _) = visitCreateTableClauses(ctx.createTableClauses()) + val provider = Option(ctx.tableProvider).map(_.multipartIdentifier.getText).getOrElse( + throw new ParseException("CREATE TEMPORARY TABLE without a provider is not allowed.", ctx)) val schema = Option(ctx.colTypeList()).map(createSchema) logWarning(s"CREATE TEMPORARY TABLE ... USING ... is deprecated, please use " + @@ -390,378 +219,10 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { userSpecifiedSchema = Option(ctx.colTypeList()).map(createSchema), replace = ctx.REPLACE != null, global = ctx.GLOBAL != null, - provider = ctx.tableProvider.qualifiedName.getText, + provider = ctx.tableProvider.multipartIdentifier.getText, options = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty)) } - /** - * Create a [[LoadDataCommand]] command. - * - * For example: - * {{{ - * LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename - * [PARTITION (partcol1=val1, partcol2=val2 ...)] - * }}} - */ - override def visitLoadData(ctx: LoadDataContext): LogicalPlan = withOrigin(ctx) { - LoadDataCommand( - table = visitTableIdentifier(ctx.tableIdentifier), - path = string(ctx.path), - isLocal = ctx.LOCAL != null, - isOverwrite = ctx.OVERWRITE != null, - partition = Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec) - ) - } - - /** - * Create a [[TruncateTableCommand]] command. 
- * - * For example: - * {{{ - * TRUNCATE TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] - * }}} - */ - override def visitTruncateTable(ctx: TruncateTableContext): LogicalPlan = withOrigin(ctx) { - TruncateTableCommand( - visitTableIdentifier(ctx.tableIdentifier), - Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)) - } - - /** - * Create a [[AlterTableRecoverPartitionsCommand]] command. - * - * For example: - * {{{ - * MSCK REPAIR TABLE tablename - * }}} - */ - override def visitRepairTable(ctx: RepairTableContext): LogicalPlan = withOrigin(ctx) { - AlterTableRecoverPartitionsCommand( - visitTableIdentifier(ctx.tableIdentifier), - "MSCK REPAIR TABLE") - } - - /** - * Create a [[CreateDatabaseCommand]] command. - * - * For example: - * {{{ - * CREATE DATABASE [IF NOT EXISTS] database_name - * create_database_clauses; - * - * create_database_clauses (order insensitive): - * [COMMENT database_comment] - * [LOCATION path] - * [WITH DBPROPERTIES (key1=val1, key2=val2, ...)] - * }}} - */ - override def visitCreateDatabase(ctx: CreateDatabaseContext): LogicalPlan = withOrigin(ctx) { - checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) - checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) - checkDuplicateClauses(ctx.DBPROPERTIES, "WITH DBPROPERTIES", ctx) - - CreateDatabaseCommand( - ctx.db.getText, - ctx.EXISTS != null, - ctx.locationSpec.asScala.headOption.map(visitLocationSpec), - Option(ctx.comment).map(string), - ctx.tablePropertyList.asScala.headOption.map(visitPropertyKeyValues).getOrElse(Map.empty)) - } - - /** - * Create an [[AlterDatabasePropertiesCommand]] command. 
- * - * For example: - * {{{ - * ALTER (DATABASE|SCHEMA) database SET DBPROPERTIES (property_name=property_value, ...); - * }}} - */ - override def visitSetDatabaseProperties( - ctx: SetDatabasePropertiesContext): LogicalPlan = withOrigin(ctx) { - AlterDatabasePropertiesCommand( - ctx.db.getText, - visitPropertyKeyValues(ctx.tablePropertyList)) - } - - /** - * Create a [[DropDatabaseCommand]] command. - * - * For example: - * {{{ - * DROP (DATABASE|SCHEMA) [IF EXISTS] database [RESTRICT|CASCADE]; - * }}} - */ - override def visitDropDatabase(ctx: DropDatabaseContext): LogicalPlan = withOrigin(ctx) { - DropDatabaseCommand(ctx.db.getText, ctx.EXISTS != null, ctx.CASCADE != null) - } - - /** - * Create a [[DescribeDatabaseCommand]] command. - * - * For example: - * {{{ - * DESCRIBE DATABASE [EXTENDED] database; - * }}} - */ - override def visitDescribeDatabase(ctx: DescribeDatabaseContext): LogicalPlan = withOrigin(ctx) { - DescribeDatabaseCommand(ctx.db.getText, ctx.EXTENDED != null) - } - - /** - * Create a plan for a DESCRIBE FUNCTION command. - */ - override def visitDescribeFunction(ctx: DescribeFunctionContext): LogicalPlan = withOrigin(ctx) { - import ctx._ - val functionName = - if (describeFuncName.STRING() != null) { - FunctionIdentifier(string(describeFuncName.STRING()), database = None) - } else if (describeFuncName.qualifiedName() != null) { - visitFunctionName(describeFuncName.qualifiedName) - } else { - FunctionIdentifier(describeFuncName.getText, database = None) - } - DescribeFunctionCommand(functionName, EXTENDED != null) - } - - /** - * Create a plan for a SHOW FUNCTIONS command. 
- */ - override def visitShowFunctions(ctx: ShowFunctionsContext): LogicalPlan = withOrigin(ctx) { - import ctx._ - val (user, system) = Option(ctx.identifier).map(_.getText.toLowerCase(Locale.ROOT)) match { - case None | Some("all") => (true, true) - case Some("system") => (false, true) - case Some("user") => (true, false) - case Some(x) => throw new ParseException(s"SHOW $x FUNCTIONS not supported", ctx) - } - - val (db, pat) = if (qualifiedName != null) { - val name = visitFunctionName(qualifiedName) - (name.database, Some(name.funcName)) - } else if (pattern != null) { - (None, Some(string(pattern))) - } else { - (None, None) - } - - ShowFunctionsCommand(db, pat, user, system) - } - - /** - * Create a [[CreateFunctionCommand]] command. - * - * For example: - * {{{ - * CREATE [OR REPLACE] [TEMPORARY] FUNCTION [IF NOT EXISTS] [db_name.]function_name - * AS class_name [USING JAR|FILE|ARCHIVE 'file_uri' [, JAR|FILE|ARCHIVE 'file_uri']]; - * }}} - */ - override def visitCreateFunction(ctx: CreateFunctionContext): LogicalPlan = withOrigin(ctx) { - val resources = ctx.resource.asScala.map { resource => - val resourceType = resource.identifier.getText.toLowerCase(Locale.ROOT) - resourceType match { - case "jar" | "file" | "archive" => - FunctionResource(FunctionResourceType.fromString(resourceType), string(resource.STRING)) - case other => - operationNotAllowed(s"CREATE FUNCTION with resource type '$resourceType'", ctx) - } - } - - // Extract database, name & alias. - val functionIdentifier = visitFunctionName(ctx.qualifiedName) - CreateFunctionCommand( - functionIdentifier.database, - functionIdentifier.funcName, - string(ctx.className), - resources, - ctx.TEMPORARY != null, - ctx.EXISTS != null, - ctx.REPLACE != null) - } - - /** - * Create a [[DropFunctionCommand]] command. 
- * - * For example: - * {{{ - * DROP [TEMPORARY] FUNCTION [IF EXISTS] function; - * }}} - */ - override def visitDropFunction(ctx: DropFunctionContext): LogicalPlan = withOrigin(ctx) { - val functionIdentifier = visitFunctionName(ctx.qualifiedName) - DropFunctionCommand( - functionIdentifier.database, - functionIdentifier.funcName, - ctx.EXISTS != null, - ctx.TEMPORARY != null) - } - - /** - * Create a [[AlterTableRenameCommand]] command. - * - * For example: - * {{{ - * ALTER TABLE table1 RENAME TO table2; - * ALTER VIEW view1 RENAME TO view2; - * }}} - */ - override def visitRenameTable(ctx: RenameTableContext): LogicalPlan = withOrigin(ctx) { - AlterTableRenameCommand( - visitTableIdentifier(ctx.from), - visitTableIdentifier(ctx.to), - ctx.VIEW != null) - } - - /** - * Create an [[AlterTableSerDePropertiesCommand]] command. - * - * For example: - * {{{ - * ALTER TABLE table [PARTITION spec] SET SERDE serde_name [WITH SERDEPROPERTIES props]; - * ALTER TABLE table [PARTITION spec] SET SERDEPROPERTIES serde_properties; - * }}} - */ - override def visitSetTableSerDe(ctx: SetTableSerDeContext): LogicalPlan = withOrigin(ctx) { - AlterTableSerDePropertiesCommand( - visitTableIdentifier(ctx.tableIdentifier), - Option(ctx.STRING).map(string), - Option(ctx.tablePropertyList).map(visitPropertyKeyValues), - // TODO a partition spec is allowed to have optional values. This is currently violated. - Option(ctx.partitionSpec).map(visitNonOptionalPartitionSpec)) - } - - /** - * Create an [[AlterTableAddPartitionCommand]] command. - * - * For example: - * {{{ - * ALTER TABLE table ADD [IF NOT EXISTS] PARTITION spec [LOCATION 'loc1'] - * ALTER VIEW view ADD [IF NOT EXISTS] PARTITION spec - * }}} - * - * ALTER VIEW ... ADD PARTITION ... 
is not supported because the concept of partitioning - * is associated with physical tables - */ - override def visitAddTablePartition( - ctx: AddTablePartitionContext): LogicalPlan = withOrigin(ctx) { - if (ctx.VIEW != null) { - operationNotAllowed("ALTER VIEW ... ADD PARTITION", ctx) - } - // Create partition spec to location mapping. - val specsAndLocs = if (ctx.partitionSpec.isEmpty) { - ctx.partitionSpecLocation.asScala.map { - splCtx => - val spec = visitNonOptionalPartitionSpec(splCtx.partitionSpec) - val location = Option(splCtx.locationSpec).map(visitLocationSpec) - spec -> location - } - } else { - // Alter View: the location clauses are not allowed. - ctx.partitionSpec.asScala.map(visitNonOptionalPartitionSpec(_) -> None) - } - AlterTableAddPartitionCommand( - visitTableIdentifier(ctx.tableIdentifier), - specsAndLocs, - ctx.EXISTS != null) - } - - /** - * Create an [[AlterTableRenamePartitionCommand]] command - * - * For example: - * {{{ - * ALTER TABLE table PARTITION spec1 RENAME TO PARTITION spec2; - * }}} - */ - override def visitRenameTablePartition( - ctx: RenameTablePartitionContext): LogicalPlan = withOrigin(ctx) { - AlterTableRenamePartitionCommand( - visitTableIdentifier(ctx.tableIdentifier), - visitNonOptionalPartitionSpec(ctx.from), - visitNonOptionalPartitionSpec(ctx.to)) - } - - /** - * Create an [[AlterTableDropPartitionCommand]] command - * - * For example: - * {{{ - * ALTER TABLE table DROP [IF EXISTS] PARTITION spec1[, PARTITION spec2, ...] [PURGE]; - * ALTER VIEW view DROP [IF EXISTS] PARTITION spec1[, PARTITION spec2, ...]; - * }}} - * - * ALTER VIEW ... DROP PARTITION ... is not supported because the concept of partitioning - * is associated with physical tables - */ - override def visitDropTablePartitions( - ctx: DropTablePartitionsContext): LogicalPlan = withOrigin(ctx) { - if (ctx.VIEW != null) { - operationNotAllowed("ALTER VIEW ... 
DROP PARTITION", ctx) - } - AlterTableDropPartitionCommand( - visitTableIdentifier(ctx.tableIdentifier), - ctx.partitionSpec.asScala.map(visitNonOptionalPartitionSpec), - ifExists = ctx.EXISTS != null, - purge = ctx.PURGE != null, - retainData = false) - } - - /** - * Create an [[AlterTableRecoverPartitionsCommand]] command - * - * For example: - * {{{ - * ALTER TABLE table RECOVER PARTITIONS; - * }}} - */ - override def visitRecoverPartitions( - ctx: RecoverPartitionsContext): LogicalPlan = withOrigin(ctx) { - AlterTableRecoverPartitionsCommand(visitTableIdentifier(ctx.tableIdentifier)) - } - - /** - * Create an [[AlterTableSetLocationCommand]] command for a partition. - * - * For example: - * {{{ - * ALTER TABLE table PARTITION spec SET LOCATION "loc"; - * }}} - */ - override def visitSetPartitionLocation( - ctx: SetPartitionLocationContext): LogicalPlan = withOrigin(ctx) { - AlterTableSetLocationCommand( - visitTableIdentifier(ctx.tableIdentifier), - Some(visitNonOptionalPartitionSpec(ctx.partitionSpec)), - visitLocationSpec(ctx.locationSpec)) - } - - /** - * Create a [[AlterTableChangeColumnCommand]] command. - * - * For example: - * {{{ - * ALTER TABLE table [PARTITION partition_spec] - * CHANGE [COLUMN] column_old_name column_new_name column_dataType [COMMENT column_comment] - * [FIRST | AFTER column_name]; - * }}} - */ - override def visitChangeColumn(ctx: ChangeColumnContext): LogicalPlan = withOrigin(ctx) { - if (ctx.partitionSpec != null) { - operationNotAllowed("ALTER TABLE table PARTITION partition_spec CHANGE COLUMN", ctx) - } - - if (ctx.colPosition != null) { - operationNotAllowed( - "ALTER TABLE table [PARTITION partition_spec] CHANGE COLUMN ... FIRST | AFTER otherCol", - ctx) - } - - AlterTableChangeColumnCommand( - tableName = visitTableIdentifier(ctx.tableIdentifier), - columnName = ctx.colName.getText, - newColumn = visitColType(ctx.colType)) - } - /** * Convert a nested constants list into a sequence of string sequences. 
*/ @@ -801,9 +262,14 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * ADD (FILE[S] | JAR[S] ) * LIST (FILE[S] [filepath ...] | JAR[S] [jarpath ...]) * }}} + * + * Note that filepath/jarpath can be given as follows; + * - /path/to/fileOrJar + * - "/path/to/fileOrJar" + * - '/path/to/fileOrJar' */ override def visitManageResource(ctx: ManageResourceContext): LogicalPlan = withOrigin(ctx) { - val mayebePaths = remainder(ctx.identifier).trim + val mayebePaths = if (ctx.STRING != null) string(ctx.STRING) else remainder(ctx.identifier).trim ctx.op.getType match { case SqlBaseParser.ADD => ctx.identifier.getText.toLowerCase(Locale.ROOT) match { @@ -870,7 +336,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) - checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.commentSpec(), "COMMENT", ctx) checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) @@ -893,7 +359,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { .getOrElse(CatalogStorageFormat.empty) val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) .getOrElse(CatalogStorageFormat.empty) - val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) + val location = visitLocationSpecList(ctx.locationSpec()) // If we are creating an EXTERNAL table, then the LOCATION field is required if (external && location.isEmpty) { operationNotAllowed("CREATE EXTERNAL TABLE must be accompanied by LOCATION", ctx) @@ -927,7 +393,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { provider = Some(DDLUtils.HIVE_PROVIDER), partitionColumnNames = partitionCols.map(_.name), properties = properties, - comment = Option(ctx.comment).map(string)) + comment = 
visitCommentSpecList(ctx.commentSpec())) val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists @@ -986,14 +452,50 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * For example: * {{{ * CREATE TABLE [IF NOT EXISTS] [db_name.]table_name - * LIKE [other_db_name.]existing_table_name [locationSpec] + * LIKE [other_db_name.]existing_table_name + * [USING provider | + * [ + * [ROW FORMAT row_format] + * [STORED AS file_format] [WITH SERDEPROPERTIES (...)] + * ] + * ] + * [locationSpec] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateTableLike(ctx: CreateTableLikeContext): LogicalPlan = withOrigin(ctx) { val targetTable = visitTableIdentifier(ctx.target) val sourceTable = visitTableIdentifier(ctx.source) - val location = Option(ctx.locationSpec).map(visitLocationSpec) - CreateTableLikeCommand(targetTable, sourceTable, location, ctx.EXISTS != null) + checkDuplicateClauses(ctx.tableProvider, "PROVIDER", ctx) + checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) + checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + val provider = ctx.tableProvider.asScala.headOption.map(_.multipartIdentifier.getText) + val location = visitLocationSpecList(ctx.locationSpec()) + // rowStorage used to determine CatalogStorageFormat.serde and + // CatalogStorageFormat.properties in STORED AS clause. 
+ val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) + .getOrElse(CatalogStorageFormat.empty) + val fileFormat = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) match { + case Some(f) => + if (provider.isDefined) { + throw new ParseException("'STORED AS hiveFormats' and 'USING provider' " + + "should not be specified both", ctx) + } + f.copy( + locationUri = location.map(CatalogUtils.stringToURI), + serde = rowStorage.serde.orElse(f.serde), + properties = rowStorage.properties ++ f.properties) + case None => + if (rowStorage.serde.isDefined) { + throw new ParseException("'ROW FORMAT' must be used with 'STORED AS'", ctx) + } + CatalogStorageFormat.empty.copy(locationUri = location.map(CatalogUtils.stringToURI)) + } + val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) + CreateTableLikeCommand( + targetTable, sourceTable, fileFormat, provider, properties, ctx.EXISTS != null) } /** @@ -1160,73 +662,6 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } } - /** - * Create or replace a view. This creates a [[CreateViewCommand]] command. - * - * For example: - * {{{ - * CREATE [OR REPLACE] [[GLOBAL] TEMPORARY] VIEW [IF NOT EXISTS] [db_name.]view_name - * [(column_name [COMMENT column_comment], ...) ] - * create_view_clauses - * - * AS SELECT ...; - * - * create_view_clauses (order insensitive): - * [COMMENT view_comment] - * [TBLPROPERTIES (property_name = property_value, ...)] - * }}} - */ - override def visitCreateView(ctx: CreateViewContext): LogicalPlan = withOrigin(ctx) { - if (!ctx.identifierList.isEmpty) { - operationNotAllowed("CREATE VIEW ... 
PARTITIONED ON", ctx) - } - - checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) - checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED ON", ctx) - checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) - - val userSpecifiedColumns = Option(ctx.identifierCommentList).toSeq.flatMap { icl => - icl.identifierComment.asScala.map { ic => - ic.identifier.getText -> Option(ic.STRING).map(string) - } - } - - val viewType = if (ctx.TEMPORARY == null) { - PersistedView - } else if (ctx.GLOBAL != null) { - GlobalTempView - } else { - LocalTempView - } - - CreateViewCommand( - name = visitTableIdentifier(ctx.tableIdentifier), - userSpecifiedColumns = userSpecifiedColumns, - comment = ctx.STRING.asScala.headOption.map(string), - properties = ctx.tablePropertyList.asScala.headOption.map(visitPropertyKeyValues) - .getOrElse(Map.empty), - originalText = Option(source(ctx.query)), - child = plan(ctx.query), - allowExisting = ctx.EXISTS != null, - replace = ctx.REPLACE != null, - viewType = viewType) - } - - /** - * Alter the query of a view. This creates a [[AlterViewAsCommand]] command. - * - * For example: - * {{{ - * ALTER VIEW [db_name.]view_name AS SELECT ...; - * }}} - */ - override def visitAlterViewQuery(ctx: AlterViewQueryContext): LogicalPlan = withOrigin(ctx) { - AlterViewAsCommand( - name = visitTableIdentifier(ctx.tableIdentifier), - originalText = source(ctx.query), - query = plan(ctx.query)) - } - /** * Create a [[ScriptInputOutputSchema]]. 
*/ @@ -1346,7 +781,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { storage = storage.copy(locationUri = customLocation) } - val provider = ctx.tableProvider.qualifiedName.getText + val provider = ctx.tableProvider.multipartIdentifier.getText (false, storage, Some(provider)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 08b00184ef9d7..bd2684d92a1d2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.execution.adaptive.LogicalQueryStage +import org.apache.spark.sql.execution.aggregate.AggUtils import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec @@ -89,6 +89,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { TakeOrderedAndProjectExec(limit, order, projectList, planLater(child)) :: Nil case Limit(IntegerLiteral(limit), child) => CollectLimitExec(limit, planLater(child)) :: Nil + case Tail(IntegerLiteral(limit), child) => + CollectTailExec(limit, planLater(child)) :: Nil case other => planLater(other) :: Nil } case Limit(IntegerLiteral(limit), Sort(order, true, child)) @@ -291,7 +293,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } def createJoinWithoutHint() = { - createBroadcastHashJoin(canBroadcast(left), canBroadcast(right)) + createBroadcastHashJoin( + canBroadcast(left) && !hint.leftHint.exists(_.strategy.contains(NO_BROADCAST_HASH)), + 
canBroadcast(right) && !hint.rightHint.exists(_.strategy.contains(NO_BROADCAST_HASH))) .orElse { if (!conf.preferSortMergeJoin) { createShuffleHashJoin( @@ -420,7 +424,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } } - aggregate.AggUtils.planStreamingAggregation( + AggUtils.planStreamingAggregation( normalizedGroupingExpressions, aggregateExpressions.map(expr => expr.asInstanceOf[AggregateExpression]), rewrittenResultExpressions, @@ -447,21 +451,35 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * Used to plan the streaming global limit operator for streams in append mode. * We need to check for either a direct Limit or a Limit wrapped in a ReturnAnswer operator, * following the example of the SpecialLimits Strategy above. - * Streams with limit in Append mode use the stateful StreamingGlobalLimitExec. - * Streams with limit in Complete mode use the stateless CollectLimitExec operator. - * Limit is unsupported for streams in Update mode. */ case class StreamingGlobalLimitStrategy(outputMode: OutputMode) extends Strategy { - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case ReturnAnswer(rootPlan) => rootPlan match { - case Limit(IntegerLiteral(limit), child) - if plan.isStreaming && outputMode == InternalOutputModes.Append => - StreamingGlobalLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil - case _ => Nil + + private def generatesStreamingAppends(plan: LogicalPlan): Boolean = { + + /** Ensures that this plan does not have a streaming aggregate in it. */ + def hasNoStreamingAgg: Boolean = { + plan.collectFirst { case a: Aggregate if a.isStreaming => a }.isEmpty } - case Limit(IntegerLiteral(limit), child) - if plan.isStreaming && outputMode == InternalOutputModes.Append => - StreamingGlobalLimitExec(limit, LocalLimitExec(limit, planLater(child))) :: Nil + + // The following cases of limits on a streaming plan has to be executed with a stateful + // streaming plan. + // 1. 
When the query is in append mode (that is, all logical plan operate on appended data). + // 2. When the plan does not contain any streaming aggregate (that is, plan has only + // operators that operate on appended data). This must be executed with a stateful + // streaming plan even if the query is in complete mode because of a later streaming + // aggregation (e.g., `streamingDf.limit(5).groupBy().count()`). + plan.isStreaming && ( + outputMode == InternalOutputModes.Append || + outputMode == InternalOutputModes.Complete && hasNoStreamingAgg) + } + + override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { + case ReturnAnswer(Limit(IntegerLiteral(limit), child)) if generatesStreamingAppends(child) => + StreamingGlobalLimitExec(limit, StreamingLocalLimitExec(limit, planLater(child))) :: Nil + + case Limit(IntegerLiteral(limit), child) if generatesStreamingAppends(child) => + StreamingGlobalLimitExec(limit, StreamingLocalLimitExec(limit, planLater(child))) :: Nil + case _ => Nil } } @@ -472,8 +490,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right, _) if left.isStreaming && right.isStreaming => - new StreamingSymmetricHashJoinExec( - leftKeys, rightKeys, joinType, condition, planLater(left), planLater(right)) :: Nil + val stateVersion = conf.getConf(SQLConf.STREAMING_JOIN_STATE_FORMAT_VERSION) + new StreamingSymmetricHashJoinExec(leftKeys, rightKeys, joinType, condition, + stateVersion, planLater(left), planLater(right)) :: Nil case Join(left, right, _, _, _) if left.isStreaming && right.isStreaming => throw new AnalysisException( @@ -514,13 +533,13 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { val aggregateOperator = if (functionsWithDistinct.isEmpty) { - aggregate.AggUtils.planAggregateWithoutDistinct( + AggUtils.planAggregateWithoutDistinct( normalizedGroupingExpressions, aggregateExpressions, resultExpressions, planLater(child)) } 
else { - aggregate.AggUtils.planAggregateWithOneDistinct( + AggUtils.planAggregateWithOneDistinct( normalizedGroupingExpressions, functionsWithDistinct, functionsWithoutDistinct, @@ -565,7 +584,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { } } - protected lazy val singleRowRdd = sparkContext.parallelize(Seq(InternalRow()), 1) + protected lazy val singleRowRdd = session.sparkContext.parallelize(Seq(InternalRow()), 1) object InMemoryScans extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { @@ -680,6 +699,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { f, p, b, is, ot, planLater(child)) :: Nil case logical.FlatMapGroupsInPandas(grouping, func, output, child) => execution.python.FlatMapGroupsInPandasExec(grouping, func, output, planLater(child)) :: Nil + case logical.FlatMapCoGroupsInPandas(leftGroup, rightGroup, func, output, left, right) => + execution.python.FlatMapCoGroupsInPandasExec( + leftGroup, rightGroup, func, output, planLater(left), planLater(right)) :: Nil case logical.MapInPandas(func, output, child) => execution.python.MapInPandasExec(func, output, planLater(child)) :: Nil case logical.MapElements(f, _, _, objAttr, child) => @@ -740,6 +762,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil case r: LogicalRDD => RDDScanExec(r.output, r.rdd, "ExistingRDD", r.outputPartitioning, r.outputOrdering) :: Nil + case _: UpdateTable => + throw new UnsupportedOperationException(s"UPDATE TABLE is not supported temporarily.") + case _: MergeIntoTable => + throw new UnsupportedOperationException(s"MERGE INTO TABLE is not supported temporarily.") + case logical.CollectMetrics(name, metrics, child) => + execution.CollectMetricsExec(name, metrics, planLater(child)) :: Nil case _ => Nil } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index ce9a6ea319d5f..10fe0f252322f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -54,6 +54,7 @@ trait CodegenSupport extends SparkPlan { case _: RDDScanExec => "rdd" case _: DataSourceScanExec => "scan" case _: InMemoryTableScanExec => "memoryScan" + case _: WholeStageCodegenExec => "wholestagecodegen" case _ => nodeName.toLowerCase(Locale.ROOT) } @@ -613,6 +614,8 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) "pipelineTime" -> SQLMetrics.createTimingMetric(sparkContext, WholeStageCodegenExec.PIPELINE_DURATION_METRIC)) + override def nodeName: String = s"WholeStageCodegen (${codegenStageId})" + def generatedClassName(): String = if (conf.wholeStageUseIdInClassName) { s"GeneratedIteratorForCodegenStage$codegenStageId" } else { @@ -688,7 +691,7 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) override def doExecute(): RDD[InternalRow] = { val (ctx, cleanedSource) = doCodeGen() // try to compile and fallback if it failed - val (_, maxCodeSize) = try { + val (_, compiledCodeStats) = try { CodeGenerator.compile(cleanedSource) } catch { case NonFatal(_) if !Utils.isTesting && sqlContext.conf.codegenFallback => @@ -698,9 +701,9 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) } // Check if compiled code has a too large function - if (maxCodeSize > sqlContext.conf.hugeMethodLimit) { + if (compiledCodeStats.maxMethodCodeSize > sqlContext.conf.hugeMethodLimit) { logInfo(s"Found too long generated codes and JIT optimization might not work: " + - s"the bytecode size ($maxCodeSize) is above the limit " + + s"the bytecode size (${compiledCodeStats.maxMethodCodeSize}) is above the limit " + s"${sqlContext.conf.hugeMethodLimit}, and the whole-stage codegen was 
disabled " + s"for this plan (id=$codegenStageId). To avoid this, you can raise the limit " + s"`${SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.key}`:\n$treeString") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala index 5d92ddad887bf..3f20b59361988 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala @@ -31,15 +31,14 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer} import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} import org.apache.spark.sql.catalyst.trees.TreeNodeTag import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec._ -import org.apache.spark.sql.execution.adaptive.rule.ReduceNumShufflePartitions import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.ui.{SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLAdaptiveSQLMetricUpdates, SQLPlanMetric} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.ThreadUtils @@ -61,11 +60,9 @@ import org.apache.spark.util.ThreadUtils */ case class AdaptiveSparkPlanExec( initialPlan: SparkPlan, - @transient session: SparkSession, + @transient context: AdaptiveExecutionContext, @transient preprocessingRules: Seq[Rule[SparkPlan]], - @transient subqueryCache: TrieMap[SparkPlan, BaseSubqueryExec], - @transient 
stageCache: TrieMap[SparkPlan, QueryStageExec], - @transient queryExecution: QueryExecution) + @transient isSubquery: Boolean) extends LeafExecNode { @transient private val lock = new Object() @@ -73,7 +70,9 @@ case class AdaptiveSparkPlanExec( // The logical plan optimizer for re-optimizing the current logical plan. @transient private val optimizer = new RuleExecutor[LogicalPlan] { // TODO add more optimization rules - override protected def batches: Seq[Batch] = Seq() + override protected def batches: Seq[Batch] = Seq( + Batch("Demote BroadcastHashJoin", Once, DemoteBroadcastHashJoin(conf)) + ) } @transient private val ensureRequirements = EnsureRequirements(conf) @@ -88,10 +87,16 @@ case class AdaptiveSparkPlanExec( // A list of physical optimizer rules to be applied to a new stage before its execution. These // optimizations should be stage-independent. @transient private val queryStageOptimizerRules: Seq[Rule[SparkPlan]] = Seq( - ReuseAdaptiveSubquery(conf, subqueryCache), + ReuseAdaptiveSubquery(conf, context.subqueryCache), + // Here the 'OptimizeSkewedJoin' rule should be executed + // before 'ReduceNumShufflePartitions', as the skewed partition handled + // in 'OptimizeSkewedJoin' rule, should be omitted in 'ReduceNumShufflePartitions'. + OptimizeSkewedJoin(conf), ReduceNumShufflePartitions(conf), - ApplyColumnarRulesAndInsertTransitions(session.sessionState.conf, - session.sessionState.columnarRules), + // The rule of 'OptimizeLocalShuffleReader' need to make use of the 'partitionStartIndices' + // in 'ReduceNumShufflePartitions' rule. So it must be after 'ReduceNumShufflePartitions' rule. 
+ OptimizeLocalShuffleReader(conf), + ApplyColumnarRulesAndInsertTransitions(conf, context.session.sessionState.columnarRules), CollapseCodegenStages(conf) ) @@ -117,25 +122,38 @@ case class AdaptiveSparkPlanExec( def executedPlan: SparkPlan = currentPhysicalPlan - override def conf: SQLConf = session.sessionState.conf + override def conf: SQLConf = context.session.sessionState.conf override def output: Seq[Attribute] = initialPlan.output override def doCanonicalize(): SparkPlan = initialPlan.canonicalized - override def doExecute(): RDD[InternalRow] = lock.synchronized { - if (isFinalPlan) { - currentPhysicalPlan.execute() - } else { - // Make sure we only update Spark UI if this plan's `QueryExecution` object matches the one - // retrieved by the `sparkContext`'s current execution ID. Note that sub-queries do not have - // their own execution IDs and therefore rely on the main query to update UI. - val executionId = Option( - session.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)).flatMap { idStr => - val id = idStr.toLong - val qe = SQLExecution.getQueryExecution(id) - if (qe.eq(queryExecution)) Some(id) else None - } + override def resetMetrics(): Unit = { + metrics.valuesIterator.foreach(_.reset()) + executedPlan.resetMetrics() + } + + private def collectSQLMetrics(plan: SparkPlan): Seq[SQLMetric] = { + val metrics = new mutable.ArrayBuffer[SQLMetric]() + plan.foreach { + case p: ShuffleQueryStageExec if (p.resultOption.isEmpty) => + collectSQLMetrics(p.plan).foreach(metrics += _) + case p: BroadcastQueryStageExec if (p.resultOption.isEmpty) => + collectSQLMetrics(p.plan).foreach(metrics += _) + case p: SparkPlan => + p.metrics.foreach { case metric => + metrics += metric._2 + } + } + metrics + } + + private def getFinalPhysicalPlan(): SparkPlan = lock.synchronized { + if (!isFinalPlan) { + // Subqueries do not have their own execution IDs and therefore rely on the main query to + // update UI. 
+ val executionId = Option(context.session.sparkContext.getLocalProperty( + SQLExecution.EXECUTION_ID_KEY)).map(_.toLong) var currentLogicalPlan = currentPhysicalPlan.logicalLink.get var result = createQueryStages(currentPhysicalPlan) val events = new LinkedBlockingQueue[StageMaterializationEvent]() @@ -149,13 +167,17 @@ case class AdaptiveSparkPlanExec( // Start materialization of all new stages. result.newStages.foreach { stage => - stage.materialize().onComplete { res => - if (res.isSuccess) { - events.offer(StageSuccess(stage, res.get)) - } else { - events.offer(StageFailure(stage, res.failed.get)) - } - }(AdaptiveSparkPlanExec.executionContext) + try { + stage.materialize().onComplete { res => + if (res.isSuccess) { + events.offer(StageSuccess(stage, res.get)) + } else { + events.offer(StageFailure(stage, res.failed.get)) + } + }(AdaptiveSparkPlanExec.executionContext) + } catch { + case e: Throwable => events.offer(StageFailure(stage, e)) + } } } @@ -170,7 +192,8 @@ case class AdaptiveSparkPlanExec( stage.resultOption = Some(res) case StageFailure(stage, ex) => errors.append( - new SparkException(s"Failed to materialize query stage: ${stage.treeString}", ex)) + new SparkException(s"Failed to materialize query stage: ${stage.treeString}." + + s" and the cause is ${ex.getMessage}", ex)) } // In case of errors, we cancel all running stages and throw exception. @@ -207,12 +230,26 @@ case class AdaptiveSparkPlanExec( // Run the final plan when there's no more unfinished stages. 
currentPhysicalPlan = applyPhysicalRules(result.newPlan, queryStageOptimizerRules) isFinalPlan = true - - val ret = currentPhysicalPlan.execute() - logDebug(s"Final plan: $currentPhysicalPlan") executionId.foreach(onUpdatePlan) - ret + logDebug(s"Final plan: $currentPhysicalPlan") } + currentPhysicalPlan + } + + override def executeCollect(): Array[InternalRow] = { + getFinalPhysicalPlan().executeCollect() + } + + override def executeTake(n: Int): Array[InternalRow] = { + getFinalPhysicalPlan().executeTake(n) + } + + override def executeTail(n: Int): Array[InternalRow] = { + getFinalPhysicalPlan().executeTail(n) + } + + override def doExecute(): RDD[InternalRow] = { + getFinalPhysicalPlan().execute() } override def verboseString(maxFields: Int): String = simpleString(maxFields) @@ -271,13 +308,14 @@ case class AdaptiveSparkPlanExec( private def createQueryStages(plan: SparkPlan): CreateStageResult = plan match { case e: Exchange => // First have a quick check in the `stageCache` without having to traverse down the node. - stageCache.get(e.canonicalized) match { + context.stageCache.get(e.canonicalized) match { case Some(existingStage) if conf.exchangeReuseEnabled => - val reusedStage = reuseQueryStage(existingStage, e) - // When reusing a stage, we treat it a new stage regardless of whether the existing stage - // has been materialized or not. Thus we won't skip re-optimization for a reused stage. - CreateStageResult(newPlan = reusedStage, - allChildStagesMaterialized = false, newStages = Seq(reusedStage)) + val stage = reuseQueryStage(existingStage, e) + // This is a leaf stage and is not materialized yet even if the reused exchange may has + // been completed. It will trigger re-optimization later and stage materialization will + // finish in instant if the underlying exchange is already completed. 
+ CreateStageResult( + newPlan = stage, allChildStagesMaterialized = false, newStages = Seq(stage)) case _ => val result = createQueryStages(e.child) @@ -289,7 +327,7 @@ case class AdaptiveSparkPlanExec( // Check the `stageCache` again for reuse. If a match is found, ditch the new stage // and reuse the existing stage found in the `stageCache`, otherwise update the // `stageCache` with the new stage. - val queryStage = stageCache.getOrElseUpdate(e.canonicalized, newStage) + val queryStage = context.stageCache.getOrElseUpdate(e.canonicalized, newStage) if (queryStage.ne(newStage)) { newStage = reuseQueryStage(queryStage, e) } @@ -333,10 +371,10 @@ case class AdaptiveSparkPlanExec( queryStage } - private def reuseQueryStage(s: QueryStageExec, e: Exchange): QueryStageExec = { - val queryStage = ReusedQueryStageExec(currentStageId, s, e.output) + private def reuseQueryStage(existing: QueryStageExec, exchange: Exchange): QueryStageExec = { + val queryStage = existing.newReuseInstance(currentStageId, exchange.output) currentStageId += 1 - setLogicalLinkForNewQueryStage(queryStage, e) + setLogicalLinkForNewQueryStage(queryStage, exchange) queryStage } @@ -429,8 +467,8 @@ case class AdaptiveSparkPlanExec( private def reOptimize(logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = { logicalPlan.invalidateStatsCache() val optimized = optimizer.execute(logicalPlan) - SparkSession.setActiveSession(session) - val sparkPlan = session.sessionState.planner.plan(ReturnAnswer(optimized)).next() + SparkSession.setActiveSession(context.session) + val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next() val newPlan = applyPhysicalRules(sparkPlan, preprocessingRules ++ queryStagePreparationRules) (newPlan, optimized) } @@ -458,10 +496,27 @@ case class AdaptiveSparkPlanExec( * Notify the listeners of the physical plan change. 
*/ private def onUpdatePlan(executionId: Long): Unit = { - session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate( - executionId, - SQLExecution.getQueryExecution(executionId).toString, - SparkPlanInfo.fromSparkPlan(this))) + if (isSubquery) { + // When executing subqueries, we can't update the query plan in the UI as the + // UI doesn't support partial update yet. However, the subquery may have been + // optimized into a different plan and we must let the UI know the SQL metrics + // of the new plan nodes, so that it can track the valid accumulator updates later + // and display SQL metrics correctly. + onUpdateSQLMetrics(collectSQLMetrics(currentPhysicalPlan), executionId) + } else { + context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate( + executionId, + SQLExecution.getQueryExecution(executionId).toString, + SparkPlanInfo.fromSparkPlan(this))) + } + } + + private def onUpdateSQLMetrics(sqlMetrics: Seq[SQLMetric], executionId: Long): Unit = { + val sqlPlanMetrics = sqlMetrics.map { case sqlMetric => + SQLPlanMetric(sqlMetric.name.get, sqlMetric.id, sqlMetric.metricType) + } + context.session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveSQLMetricUpdates( + executionId.toLong, sqlPlanMetrics)) } /** @@ -485,7 +540,8 @@ case class AdaptiveSparkPlanExec( } } finally { val ex = new SparkException( - "Adaptive execution failed due to stage materialization failures.", errors.head) + "Adaptive execution failed due to stage materialization failures." + + s" and the cause is ${errors.head.getMessage}", errors.head) errors.tail.foreach(ex.addSuppressed) cancelErrors.foreach(ex.addSuppressed) throw ex @@ -517,6 +573,24 @@ object AdaptiveSparkPlanExec { } } +/** + * The execution context shared between the main query and all sub-queries. + */ +case class AdaptiveExecutionContext(session: SparkSession) { + + /** + * The subquery-reuse map shared across the entire query. 
+ */ + val subqueryCache: TrieMap[SparkPlan, BaseSubqueryExec] = + new TrieMap[SparkPlan, BaseSubqueryExec]() + + /** + * The exchange-reuse map shared across the entire query, including sub-queries. + */ + val stageCache: TrieMap[SparkPlan, QueryStageExec] = + new TrieMap[SparkPlan, QueryStageExec]() +} + /** * The event type for stage materialization. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala new file mode 100644 index 0000000000000..61ae6cb14ccd3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanHelper.scala @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.sql.execution.SparkPlan + +/** + * This class provides utility methods related to tree traversal of an [[AdaptiveSparkPlanExec]] + * plan. 
Unlike their counterparts in [[org.apache.spark.sql.catalyst.trees.TreeNode]] or + * [[org.apache.spark.sql.catalyst.plans.QueryPlan]], these methods traverse down leaf nodes of + * adaptive plans, i.e., [[AdaptiveSparkPlanExec]] and [[QueryStageExec]]. + */ +trait AdaptiveSparkPlanHelper { + + /** + * Find the first [[SparkPlan]] that satisfies the condition specified by `f`. + * The condition is recursively applied to this node and all of its children (pre-order). + */ + def find(p: SparkPlan)(f: SparkPlan => Boolean): Option[SparkPlan] = if (f(p)) { + Some(p) + } else { + allChildren(p).foldLeft(Option.empty[SparkPlan]) { (l, r) => l.orElse(find(r)(f)) } + } + + /** + * Runs the given function on this node and then recursively on children. + * @param f the function to be applied to each node in the tree. + */ + def foreach(p: SparkPlan)(f: SparkPlan => Unit): Unit = { + f(p) + allChildren(p).foreach(foreach(_)(f)) + } + + /** + * Runs the given function recursively on children then on this node. + * @param f the function to be applied to each node in the tree. + */ + def foreachUp(p: SparkPlan)(f: SparkPlan => Unit): Unit = { + allChildren(p).foreach(foreachUp(_)(f)) + f(p) + } + + /** + * Returns a Seq containing the result of applying the given function to each + * node in this tree in a preorder traversal. + * @param f the function to be applied. + */ + def mapPlans[A](p: SparkPlan)(f: SparkPlan => A): Seq[A] = { + val ret = new collection.mutable.ArrayBuffer[A]() + foreach(p)(ret += f(_)) + ret + } + + /** + * Returns a Seq by applying a function to all nodes in this tree and using the elements of the + * resulting collections. + */ + def flatMap[A](p: SparkPlan)(f: SparkPlan => TraversableOnce[A]): Seq[A] = { + val ret = new collection.mutable.ArrayBuffer[A]() + foreach(p)(ret ++= f(_)) + ret + } + + /** + * Returns a Seq containing the result of applying a partial function to all elements in this + * tree on which the function is defined. 
+ */ + def collect[B](p: SparkPlan)(pf: PartialFunction[SparkPlan, B]): Seq[B] = { + val ret = new collection.mutable.ArrayBuffer[B]() + val lifted = pf.lift + foreach(p)(node => lifted(node).foreach(ret.+=)) + ret + } + + /** + * Returns a Seq containing the leaves in this tree. + */ + def collectLeaves(p: SparkPlan): Seq[SparkPlan] = { + collect(p) { case plan if allChildren(plan).isEmpty => plan } + } + + /** + * Finds and returns the first [[SparkPlan]] of the tree for which the given partial function + * is defined (pre-order), and applies the partial function to it. + */ + def collectFirst[B](p: SparkPlan)(pf: PartialFunction[SparkPlan, B]): Option[B] = { + val lifted = pf.lift + lifted(p).orElse { + allChildren(p).foldLeft(Option.empty[B]) { (l, r) => l.orElse(collectFirst(r)(pf)) } + } + } + + /** + * Returns a sequence containing the result of applying a partial function to all elements in this + * plan, also considering all the plans in its (nested) subqueries + */ + def collectInPlanAndSubqueries[B](p: SparkPlan)(f: PartialFunction[SparkPlan, B]): Seq[B] = { + (p +: subqueriesAll(p)).flatMap(collect(_)(f)) + } + + /** + * Returns a sequence containing the subqueries in this plan, also including the (nested) + * subquries in its children + */ + def subqueriesAll(p: SparkPlan): Seq[SparkPlan] = { + val subqueries = flatMap(p)(_.subqueries) + subqueries ++ subqueries.flatMap(subqueriesAll) + } + + private def allChildren(p: SparkPlan): Seq[SparkPlan] = p match { + case a: AdaptiveSparkPlanExec => Seq(a.executedPlan) + case s: QueryStageExec => Seq(s.plan) + case _ => p.children + } + + /** + * Strip the executePlan of AdaptiveSparkPlanExec leaf node. 
+ */ + def stripAQEPlan(p: SparkPlan): SparkPlan = p match { + case a: AdaptiveSparkPlanExec => a.executedPlan + case other => other + } + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala new file mode 100644 index 0000000000000..5aba57443d632 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/CustomShuffledRowRDD.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.{Dependency, MapOutputTrackerMaster, Partition, ShuffleDependency, SparkEnv, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} + +sealed trait ShufflePartitionSpec + +// A partition that reads data of one reducer. +case class SinglePartitionSpec(reducerIndex: Int) extends ShufflePartitionSpec + +// A partition that reads data of multiple reducers, from `startReducerIndex` (inclusive) to +// `endReducerIndex` (exclusive). 
+case class CoalescedPartitionSpec( + startReducerIndex: Int, endReducerIndex: Int) extends ShufflePartitionSpec + +// A partition that reads partial data of one reducer, from `startMapIndex` (inclusive) to +// `endMapIndex` (exclusive). +case class PartialPartitionSpec( + reducerIndex: Int, startMapIndex: Int, endMapIndex: Int) extends ShufflePartitionSpec + +private final case class CustomShufflePartition( + index: Int, spec: ShufflePartitionSpec) extends Partition + +// TODO: merge this with `ShuffledRowRDD`, and replace `LocalShuffledRowRDD` with this RDD. +class CustomShuffledRowRDD( + var dependency: ShuffleDependency[Int, InternalRow, InternalRow], + metrics: Map[String, SQLMetric], + partitionSpecs: Array[ShufflePartitionSpec]) + extends RDD[InternalRow](dependency.rdd.context, Nil) { + + override def getDependencies: Seq[Dependency[_]] = List(dependency) + + override def clearDependencies() { + super.clearDependencies() + dependency = null + } + + override def getPartitions: Array[Partition] = { + Array.tabulate[Partition](partitionSpecs.length) { i => + CustomShufflePartition(i, partitionSpecs(i)) + } + } + + override def getPreferredLocations(partition: Partition): Seq[String] = { + val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + partition.asInstanceOf[CustomShufflePartition].spec match { + case SinglePartitionSpec(reducerIndex) => + tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) + + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + startReducerIndex.until(endReducerIndex).flatMap { reducerIndex => + tracker.getPreferredLocationsForShuffle(dependency, reducerIndex) + } + + case PartialPartitionSpec(_, startMapIndex, endMapIndex) => + tracker.getMapLocation(dependency, startMapIndex, endMapIndex) + } + } + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() + // 
`SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, + // as well as the `tempMetrics` for basic shuffle metrics. + val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) + val reader = split.asInstanceOf[CustomShufflePartition].spec match { + case SinglePartitionSpec(reducerIndex) => + SparkEnv.get.shuffleManager.getReader( + dependency.shuffleHandle, + reducerIndex, + reducerIndex + 1, + context, + sqlMetricsReporter) + + case CoalescedPartitionSpec(startReducerIndex, endReducerIndex) => + SparkEnv.get.shuffleManager.getReader( + dependency.shuffleHandle, + startReducerIndex, + endReducerIndex, + context, + sqlMetricsReporter) + + case PartialPartitionSpec(reducerIndex, startMapIndex, endMapIndex) => + SparkEnv.get.shuffleManager.getReaderForRange( + dependency.shuffleHandle, + startMapIndex, + endMapIndex, + reducerIndex, + reducerIndex + 1, + context, + sqlMetricsReporter) + } + reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala new file mode 100644 index 0000000000000..e5642991c59a3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.MapOutputStatistics +import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Join, LogicalPlan, NO_BROADCAST_HASH} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.internal.SQLConf + +/** + * This optimization rule detects a join child that has a high ratio of empty partitions and + * adds a no-broadcast-hash-join hint to avoid it being broadcast. + */ +case class DemoteBroadcastHashJoin(conf: SQLConf) extends Rule[LogicalPlan] { + + private def shouldDemote(plan: LogicalPlan): Boolean = plan match { + case LogicalQueryStage(_, stage: ShuffleQueryStageExec) if stage.resultOption.isDefined + && stage.resultOption.get != null => + val mapOutputStatistics = stage.resultOption.get.asInstanceOf[MapOutputStatistics] + val partitionCnt = mapOutputStatistics.bytesByPartitionId.length + val nonZeroCnt = mapOutputStatistics.bytesByPartitionId.count(_ > 0) + partitionCnt > 0 && nonZeroCnt > 0 && + (nonZeroCnt * 1.0 / partitionCnt) < conf.nonEmptyPartitionRatioForBroadcastJoin + case _ => false + } + + def apply(plan: LogicalPlan): LogicalPlan = plan.transformDown { + case j @ Join(left, right, _, _, hint) => + var newHint = hint + if (!hint.leftHint.exists(_.strategy.isDefined) && shouldDemote(left)) { + newHint = newHint.copy(leftHint = + Some(hint.leftHint.getOrElse(HintInfo()).copy(strategy = Some(NO_BROADCAST_HASH)))) + } + if (!hint.rightHint.exists(_.strategy.isDefined) && shouldDemote(right)) { + newHint = newHint.copy(rightHint = + 
Some(hint.rightHint.getOrElse(HintInfo()).copy(strategy = Some(NO_BROADCAST_HASH)))) + } + if (newHint.ne(hint)) { + j.copy(hint = newHint) + } else { + j + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala index 23eadfd6f3e5e..621c063e5a7d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InsertAdaptiveSparkPlan.scala @@ -17,16 +17,16 @@ package org.apache.spark.sql.execution.adaptive -import scala.collection.concurrent.TrieMap import scala.collection.mutable -import org.apache.spark.sql.{execution, SparkSession} import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.DynamicPruningSubquery +import org.apache.spark.sql.catalyst.expressions.{DynamicPruningSubquery, ListQuery, SubqueryExpression} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.physical.UnspecifiedDistribution import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.ExecutedCommandExec +import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.internal.SQLConf /** @@ -36,45 +36,64 @@ import org.apache.spark.sql.internal.SQLConf * Note that this rule is stateful and thus should not be reused across query executions. */ case class InsertAdaptiveSparkPlan( - session: SparkSession, - queryExecution: QueryExecution) extends Rule[SparkPlan] { + adaptiveExecutionContext: AdaptiveExecutionContext) extends Rule[SparkPlan] { - private val conf = session.sessionState.conf + private val conf = adaptiveExecutionContext.session.sessionState.conf - // Subquery-reuse is shared across the entire query. 
- private val subqueryCache = new TrieMap[SparkPlan, BaseSubqueryExec]() + override def apply(plan: SparkPlan): SparkPlan = applyInternal(plan, false) - // Exchange-reuse is shared across the entire query, including sub-queries. - private val stageCache = new TrieMap[SparkPlan, QueryStageExec]() - - override def apply(plan: SparkPlan): SparkPlan = applyInternal(plan, queryExecution) - - private def applyInternal(plan: SparkPlan, qe: QueryExecution): SparkPlan = plan match { + private def applyInternal(plan: SparkPlan, isSubquery: Boolean): SparkPlan = plan match { + case _ if !conf.adaptiveExecutionEnabled => plan case _: ExecutedCommandExec => plan - case _ if conf.adaptiveExecutionEnabled && supportAdaptive(plan) => - try { - // Plan sub-queries recursively and pass in the shared stage cache for exchange reuse. Fall - // back to non-adaptive mode if adaptive execution is supported in any of the sub-queries. - val subqueryMap = buildSubqueryMap(plan) - val planSubqueriesRule = PlanAdaptiveSubqueries(subqueryMap) - val preprocessingRules = Seq( - planSubqueriesRule) - // Run pre-processing rules. - val newPlan = AdaptiveSparkPlanExec.applyPhysicalRules(plan, preprocessingRules) - logDebug(s"Adaptive execution enabled for plan: $plan") - AdaptiveSparkPlanExec(newPlan, session, preprocessingRules, subqueryCache, stageCache, qe) - } catch { - case SubqueryAdaptiveNotSupportedException(subquery) => - logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + - s"but is not supported for sub-query: $subquery.") - plan - } - case _ => - if (conf.adaptiveExecutionEnabled) { + case _ if shouldApplyAQE(plan, isSubquery) => + if (supportAdaptive(plan)) { + try { + // Plan sub-queries recursively and pass in the shared stage cache for exchange reuse. + // Fall back to non-AQE mode if AQE is not supported in any of the sub-queries. 
+ val subqueryMap = buildSubqueryMap(plan) + val planSubqueriesRule = PlanAdaptiveSubqueries(subqueryMap) + val preprocessingRules = Seq( + planSubqueriesRule) + // Run pre-processing rules. + val newPlan = AdaptiveSparkPlanExec.applyPhysicalRules(plan, preprocessingRules) + logDebug(s"Adaptive execution enabled for plan: $plan") + AdaptiveSparkPlanExec(newPlan, adaptiveExecutionContext, preprocessingRules, isSubquery) + } catch { + case SubqueryAdaptiveNotSupportedException(subquery) => + logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + + s"but is not supported for sub-query: $subquery.") + plan + } + } else { logWarning(s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} is enabled " + s"but is not supported for query: $plan.") + plan } - plan + + case _ => plan + } + + // AQE is only useful when the query has exchanges or sub-queries. This method returns true if + // one of the following conditions is satisfied: + // - The config ADAPTIVE_EXECUTION_FORCE_APPLY is true. + // - The input query is from a sub-query. When this happens, it means we've already decided to + // apply AQE for the main query and we must continue to do it. + // - The query contains exchanges. + // - The query may need to add exchanges. It's an overkill to run `EnsureRequirements` here, so + // we just check `SparkPlan.requiredChildDistribution` and see if it's possible that the + // the query needs to add exchanges later. + // - The query contains sub-query. 
+ private def shouldApplyAQE(plan: SparkPlan, isSubquery: Boolean): Boolean = { + conf.getConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY) || isSubquery || { + plan.find { + case _: Exchange => true + case p if !p.requiredChildDistribution.forall(_ == UnspecifiedDistribution) => true + case p => p.expressions.exists(_.find { + case _: SubqueryExpression => true + case _ => false + }.isDefined) + }.isDefined + } } private def supportAdaptive(plan: SparkPlan): Boolean = { @@ -93,27 +112,33 @@ case class InsertAdaptiveSparkPlan( * For each sub-query, generate the adaptive execution plan for each sub-query by applying this * rule, or reuse the execution plan from another sub-query of the same semantics if possible. */ - private def buildSubqueryMap(plan: SparkPlan): mutable.HashMap[Long, ExecSubqueryExpression] = { - val subqueryMap = mutable.HashMap.empty[Long, ExecSubqueryExpression] + private def buildSubqueryMap(plan: SparkPlan): Map[Long, SubqueryExec] = { + val subqueryMap = mutable.HashMap.empty[Long, SubqueryExec] plan.foreach(_.expressions.foreach(_.foreach { case expressions.ScalarSubquery(p, _, exprId) if !subqueryMap.contains(exprId.id) => val executedPlan = compileSubquery(p) verifyAdaptivePlan(executedPlan, p) - val scalarSubquery = execution.ScalarSubquery( - SubqueryExec(s"subquery${exprId.id}", executedPlan), exprId) - subqueryMap.put(exprId.id, scalarSubquery) + val subquery = SubqueryExec(s"subquery${exprId.id}", executedPlan) + subqueryMap.put(exprId.id, subquery) + case expressions.InSubquery(_, ListQuery(query, _, exprId, _)) + if !subqueryMap.contains(exprId.id) => + val executedPlan = compileSubquery(query) + verifyAdaptivePlan(executedPlan, query) + val subquery = SubqueryExec(s"subquery#${exprId.id}", executedPlan) + subqueryMap.put(exprId.id, subquery) case _ => })) - subqueryMap + subqueryMap.toMap } def compileSubquery(plan: LogicalPlan): SparkPlan = { - val queryExec = new QueryExecution(session, plan) // Apply the same instance of this rule 
to sub-queries so that sub-queries all share the // same `stageCache` for Exchange reuse. - this.applyInternal(queryExec.sparkPlan, queryExec) + this.applyInternal( + QueryExecution.createSparkPlan(adaptiveExecutionContext.session, + adaptiveExecutionContext.session.sessionState.planner, plan.clone()), true) } private def verifyAdaptivePlan(plan: SparkPlan, logicalPlan: LogicalPlan): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala new file mode 100644 index 0000000000000..19b78f5e36c9b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LocalShuffledRowRDD.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter} + +/** + * The [[Partition]] used by [[LocalShuffledRowRDD]]. + * @param mapIndex the index of mapper. 
+ * @param startPartition the start partition ID in mapIndex mapper. + * @param endPartition the end partition ID in mapIndex mapper. + */ +private final class LocalShuffledRowRDDPartition( + override val index: Int, + val mapIndex: Int, + val startPartition: Int, + val endPartition: Int) extends Partition { +} + +/** + * This is a specialized version of [[org.apache.spark.sql.execution.ShuffledRowRDD]]. This is used + * in Spark SQL adaptive execution when a shuffle join is converted to broadcast join at runtime + * because the map output of one input table is small enough for broadcast. This RDD represents the + * data of another input table of the join that reads from shuffle. Each partition of the RDD reads + * the whole data from just one mapper output locally. So actually there is no data transferred + * from the network. + * + * This RDD takes a [[ShuffleDependency]] (`dependency`). + * + * The `dependency` has the parent RDD of this RDD, which represents the dataset before shuffle + * (i.e. map output). Elements of this RDD are (partitionId, Row) pairs. + * Partition ids should be in the range [0, numPartitions - 1]. + * `dependency.partitioner.numPartitions` is the number of pre-shuffle partitions. (i.e. the number + * of partitions of the map output). The post-shuffle partition number is the same to the parent + * RDD's partition number. + * + * `partitionStartIndicesPerMapper` specifies how to split the shuffle blocks of each mapper into + * one or more partitions. For a mapper `i`, the `j`th partition includes shuffle blocks from + * `partitionStartIndicesPerMapper[i][j]` to `partitionStartIndicesPerMapper[i][j+1]` (exclusive). 
+ */ +class LocalShuffledRowRDD( + var dependency: ShuffleDependency[Int, InternalRow, InternalRow], + metrics: Map[String, SQLMetric], + partitionStartIndicesPerMapper: Array[Array[Int]]) + extends RDD[InternalRow](dependency.rdd.context, Nil) { + + private[this] val numReducers = dependency.partitioner.numPartitions + private[this] val numMappers = dependency.rdd.partitions.length + + override def getDependencies: Seq[Dependency[_]] = List(dependency) + + override def getPartitions: Array[Partition] = { + val partitions = ArrayBuffer[LocalShuffledRowRDDPartition]() + for (mapIndex <- 0 until numMappers) { + (partitionStartIndicesPerMapper(mapIndex) :+ numReducers).sliding(2, 1).foreach { + case Array(start, end) => + partitions += new LocalShuffledRowRDDPartition(partitions.length, mapIndex, start, end) + } + } + partitions.toArray + } + + override def getPreferredLocations(partition: Partition): Seq[String] = { + val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + tracker.getMapLocation(dependency, partition.index, partition.index + 1) + } + + override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { + val localRowPartition = split.asInstanceOf[LocalShuffledRowRDDPartition] + val mapIndex = localRowPartition.mapIndex + val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics() + // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator, + // as well as the `tempMetrics` for basic shuffle metrics. 
+ val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics) + + val reader = SparkEnv.get.shuffleManager.getReaderForRange( + dependency.shuffleHandle, + mapIndex, + mapIndex + 1, + localRowPartition.startPartition, + localRowPartition.endPartition, + context, + sqlMetricsReporter) + reader.read().asInstanceOf[Iterator[Product2[Int, InternalRow]]].map(_._2) + } + + override def clearDependencies() { + super.clearDependencies() + dependency = null + } +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStageStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStageStrategy.scala index a0d07a68ab0f4..d60c3ca72f6f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStageStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/LogicalQueryStageStrategy.scala @@ -36,9 +36,7 @@ import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNes object LogicalQueryStageStrategy extends Strategy with PredicateHelper { private def isBroadcastStage(plan: LogicalPlan): Boolean = plan match { - case LogicalQueryStage(_, physicalPlan) - if BroadcastQueryStageExec.isBroadcastQueryStageExec(physicalPlan) => - true + case LogicalQueryStage(_, _: BroadcastQueryStageExec) => true case _ => false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala new file mode 100644 index 0000000000000..e95441e28aafe --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeLocalShuffleReader.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ShuffleExchangeExec} +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildLeft, BuildRight, BuildSide} +import org.apache.spark.sql.internal.SQLConf + +/** + * A rule to optimize the shuffle reader to local reader iff no additional shuffles + * will be introduced: + * 1. if the input plan is a shuffle, add local reader directly as we can never introduce + * extra shuffles in this case. + * 2. otherwise, add local reader to the probe side of broadcast hash join and + * then run `EnsureRequirements` to check whether additional shuffle introduced. + * If introduced, we will revert all the local readers. 
+ */ +case class OptimizeLocalShuffleReader(conf: SQLConf) extends Rule[SparkPlan] { + import OptimizeLocalShuffleReader._ + + private val ensureRequirements = EnsureRequirements(conf) + + // The build side is a broadcast query stage which should have been optimized using local reader + // already. So we only need to deal with probe side here. + private def createProbeSideLocalReader(plan: SparkPlan): SparkPlan = { + val optimizedPlan = plan.transformDown { + case join @ BroadcastJoinWithShuffleLeft(shuffleStage, BuildRight) => + val localReader = createLocalReader(shuffleStage) + join.asInstanceOf[BroadcastHashJoinExec].copy(left = localReader) + case join @ BroadcastJoinWithShuffleRight(shuffleStage, BuildLeft) => + val localReader = createLocalReader(shuffleStage) + join.asInstanceOf[BroadcastHashJoinExec].copy(right = localReader) + } + + val numShuffles = ensureRequirements.apply(optimizedPlan).collect { + case e: ShuffleExchangeExec => e + }.length + + // Check whether additional shuffle introduced. If introduced, revert the local reader. + if (numShuffles > 0) { + logDebug("OptimizeLocalShuffleReader rule is not applied due" + + " to additional shuffles will be introduced.") + plan + } else { + optimizedPlan + } + } + + private def createLocalReader(plan: SparkPlan): LocalShuffleReaderExec = { + plan match { + case c @ CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) => + LocalShuffleReaderExec( + s, getPartitionStartIndices(s, Some(c.partitionStartIndices.length))) + case s: ShuffleQueryStageExec => + LocalShuffleReaderExec(s, getPartitionStartIndices(s, None)) + } + } + + // TODO: this method assumes all shuffle blocks are the same data size. We should calculate the + // partition start indices based on block size to avoid data skew. 
+ private def getPartitionStartIndices( + shuffleStage: ShuffleQueryStageExec, + advisoryParallelism: Option[Int]): Array[Array[Int]] = { + val shuffleDep = shuffleStage.shuffle.shuffleDependency + val numReducers = shuffleDep.partitioner.numPartitions + val expectedParallelism = advisoryParallelism.getOrElse(numReducers) + val numMappers = shuffleDep.rdd.getNumPartitions + Array.fill(numMappers) { + equallyDivide(numReducers, math.max(1, expectedParallelism / numMappers)).toArray + } + } + + /** + * To equally divide n elements into m buckets, basically each bucket should have n/m elements, + * for the remaining n%m elements, add one more element to the first n%m buckets each. Returns + * a sequence with length numBuckets and each value represents the start index of each bucket. + */ + private def equallyDivide(numElements: Int, numBuckets: Int): Seq[Int] = { + val elementsPerBucket = numElements / numBuckets + val remaining = numElements % numBuckets + val splitPoint = (elementsPerBucket + 1) * remaining + (0 until remaining).map(_ * (elementsPerBucket + 1)) ++ + (remaining until numBuckets).map(i => splitPoint + (i - remaining) * elementsPerBucket) + } + + override def apply(plan: SparkPlan): SparkPlan = { + if (!conf.getConf(SQLConf.LOCAL_SHUFFLE_READER_ENABLED)) { + return plan + } + + plan match { + case s: SparkPlan if canUseLocalShuffleReader(s) => + createLocalReader(s) + case s: SparkPlan => + createProbeSideLocalReader(s) + } + } +} + +object OptimizeLocalShuffleReader { + + object BroadcastJoinWithShuffleLeft { + def unapply(plan: SparkPlan): Option[(SparkPlan, BuildSide)] = plan match { + case join: BroadcastHashJoinExec if canUseLocalShuffleReader(join.left) => + Some((join.left, join.buildSide)) + case _ => None + } + } + + object BroadcastJoinWithShuffleRight { + def unapply(plan: SparkPlan): Option[(SparkPlan, BuildSide)] = plan match { + case join: BroadcastHashJoinExec if canUseLocalShuffleReader(join.right) => + Some((join.right, 
join.buildSide)) + case _ => None + } + } + + def canUseLocalShuffleReader(plan: SparkPlan): Boolean = plan match { + case s: ShuffleQueryStageExec => s.shuffle.canChangeNumPartitions + case CoalescedShuffleReaderExec(s: ShuffleQueryStageExec, _) => s.shuffle.canChangeNumPartitions + case _ => false + } +} + +/** + * A wrapper of shuffle query stage, which submits one or more reduce tasks per mapper to read the + * shuffle files written by one mapper. By doing this, it's very likely to read the shuffle files + * locally, as the shuffle files that a reduce task needs to read are in one node. + * + * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during + * canonicalization. + * @param partitionStartIndicesPerMapper A mapper usually writes many shuffle blocks, and it's + * better to launch multiple tasks to read shuffle blocks of + * one mapper. This array contains the partition start + * indices for each mapper. + */ +case class LocalShuffleReaderExec( + child: SparkPlan, + partitionStartIndicesPerMapper: Array[Array[Int]]) extends UnaryExecNode { + + override def output: Seq[Attribute] = child.output + + override lazy val outputPartitioning: Partitioning = { + // when we read one mapper per task, then the output partitioning is the same as the plan + // before shuffle. 
+ if (partitionStartIndicesPerMapper.forall(_.length == 1)) { + child match { + case ShuffleQueryStageExec(_, s: ShuffleExchangeExec) => + s.child.outputPartitioning + case ShuffleQueryStageExec(_, r @ ReusedExchangeExec(_, s: ShuffleExchangeExec)) => + s.child.outputPartitioning match { + case e: Expression => r.updateAttr(e).asInstanceOf[Partitioning] + case other => other + } + case _ => + throw new IllegalStateException("operating on canonicalization plan") + } + } else { + UnknownPartitioning(partitionStartIndicesPerMapper.map(_.length).sum) + } + } + + private var cachedShuffleRDD: RDD[InternalRow] = null + + override protected def doExecute(): RDD[InternalRow] = { + if (cachedShuffleRDD == null) { + cachedShuffleRDD = child match { + case stage: ShuffleQueryStageExec => + stage.shuffle.createLocalShuffleRDD(partitionStartIndicesPerMapper) + case _ => + throw new IllegalStateException("operating on canonicalization plan") + } + } + cachedShuffleRDD + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala new file mode 100644 index 0000000000000..a716497c274b8 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedJoin.scala @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +import org.apache.commons.io.FileUtils + +import org.apache.spark.{MapOutputStatistics, MapOutputTrackerMaster, SparkEnv} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ShuffleExchangeExec} +import org.apache.spark.sql.execution.joins.SortMergeJoinExec +import org.apache.spark.sql.internal.SQLConf + +case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] { + + private val ensureRequirements = EnsureRequirements(conf) + + private val supportedJoinTypes = + Inner :: Cross :: LeftSemi :: LeftAnti :: LeftOuter :: RightOuter :: Nil + + /** + * A partition is considered as a skewed partition if its size is larger than the median + * partition size * spark.sql.adaptive.skewedPartitionFactor and also larger than + * spark.sql.adaptive.skewedPartitionSizeThreshold. 
+ */ + private def isSkewed(size: Long, medianSize: Long): Boolean = { + size > medianSize * conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_FACTOR) && + size > conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD) + } + + private def medianSize(stats: MapOutputStatistics): Long = { + val numPartitions = stats.bytesByPartitionId.length + val bytes = stats.bytesByPartitionId.sorted + numPartitions match { + case _ if (numPartitions % 2 == 0) => + math.max((bytes(numPartitions / 2) + bytes(numPartitions / 2 - 1)) / 2, 1) + case _ => math.max(bytes(numPartitions / 2), 1) + } + } + + /** + * Get the map size of the specific reduce shuffle Id. + */ + private def getMapSizesForReduceId(shuffleId: Int, partitionId: Int): Array[Long] = { + val mapOutputTracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] + mapOutputTracker.shuffleStatuses(shuffleId).mapStatuses.map{_.getSizeForBlock(partitionId)} + } + + /** + * Split the skewed partition based on the map size and the max split number. 
+ */ + private def getMapStartIndices(stage: ShuffleQueryStageExec, partitionId: Int): Array[Int] = { + val shuffleId = stage.shuffle.shuffleDependency.shuffleHandle.shuffleId + val mapPartitionSizes = getMapSizesForReduceId(shuffleId, partitionId) + val maxSplits = math.min(conf.getConf( + SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_MAX_SPLITS), mapPartitionSizes.length) + val avgPartitionSize = mapPartitionSizes.sum / maxSplits + val advisoryPartitionSize = math.max(avgPartitionSize, + conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD)) + val partitionStartIndices = ArrayBuffer[Int]() + partitionStartIndices += 0 + var i = 0 + var postMapPartitionSize = 0L + while (i < mapPartitionSizes.length) { + val nextMapPartitionSize = mapPartitionSizes(i) + if (i > 0 && postMapPartitionSize + nextMapPartitionSize > advisoryPartitionSize) { + partitionStartIndices += i + postMapPartitionSize = nextMapPartitionSize + } else { + postMapPartitionSize += nextMapPartitionSize + } + i += 1 + } + + if (partitionStartIndices.size > maxSplits) { + partitionStartIndices.take(maxSplits).toArray + } else partitionStartIndices.toArray + } + + private def getStatistics(stage: ShuffleQueryStageExec): MapOutputStatistics = { + assert(stage.resultOption.isDefined, "ShuffleQueryStageExec should" + + " already be ready when executing OptimizeSkewedPartitions rule") + stage.resultOption.get.asInstanceOf[MapOutputStatistics] + } + + private def canSplitLeftSide(joinType: JoinType) = { + joinType == Inner || joinType == Cross || joinType == LeftSemi || + joinType == LeftAnti || joinType == LeftOuter + } + + private def canSplitRightSide(joinType: JoinType) = { + joinType == Inner || joinType == Cross || joinType == RightOuter + } + + private def getNumMappers(stage: ShuffleQueryStageExec): Int = { + stage.shuffle.shuffleDependency.rdd.partitions.length + } + + private def getSizeInfo(medianSize: Long, maxSize: Long): String = { + s"median size: $medianSize, max size: 
${maxSize}" + } + + /* + * This method aim to optimize the skewed join with the following steps: + * 1. Check whether the shuffle partition is skewed based on the median size + * and the skewed partition threshold in origin smj. + * 2. Assuming partition0 is skewed in left side, and it has 5 mappers (Map0, Map1...Map4). + * And we may split the 5 Mappers into 3 mapper ranges [(Map0, Map1), (Map2, Map3), (Map4)] + * based on the map size and the max split number. + * 3. Wrap the join left child with a special shuffle reader that reads each mapper range with one + * task, so total 3 tasks. + * 4. Wrap the join right child with a special shuffle reader that reads partition0 3 times by + * 3 tasks separately. + */ + def optimizeSkewJoin(plan: SparkPlan): SparkPlan = plan.transformUp { + case smj @ SortMergeJoinExec(_, _, joinType, _, + s1 @ SortExec(_, _, left: ShuffleQueryStageExec, _), + s2 @ SortExec(_, _, right: ShuffleQueryStageExec, _), _) + if supportedJoinTypes.contains(joinType) => + val leftStats = getStatistics(left) + val rightStats = getStatistics(right) + val numPartitions = leftStats.bytesByPartitionId.length + + val leftMedSize = medianSize(leftStats) + val rightMedSize = medianSize(rightStats) + logDebug( + s""" + |Try to optimize skewed join. + |Left side partition size: + |${getSizeInfo(leftMedSize, leftStats.bytesByPartitionId.max)} + |Right side partition size: + |${getSizeInfo(rightMedSize, rightStats.bytesByPartitionId.max)} + """.stripMargin) + val canSplitLeft = canSplitLeftSide(joinType) + val canSplitRight = canSplitRightSide(joinType) + + val leftSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + val rightSidePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec] + // This is used to delay the creation of non-skew partitions so that we can potentially + // coalesce them like `ReduceNumShufflePartitions` does. 
+ val nonSkewPartitionIndices = mutable.ArrayBuffer.empty[Int] + val leftSkewDesc = new SkewDesc + val rightSkewDesc = new SkewDesc + for (partitionIndex <- 0 until numPartitions) { + val leftSize = leftStats.bytesByPartitionId(partitionIndex) + val isLeftSkew = isSkewed(leftSize, leftMedSize) && canSplitLeft + val rightSize = rightStats.bytesByPartitionId(partitionIndex) + val isRightSkew = isSkewed(rightSize, rightMedSize) && canSplitRight + if (isLeftSkew || isRightSkew) { + if (nonSkewPartitionIndices.nonEmpty) { + // As soon as we see a skew, we'll "flush" out unhandled non-skew partitions. + createNonSkewPartitions(leftStats, rightStats, nonSkewPartitionIndices).foreach { p => + leftSidePartitions += p + rightSidePartitions += p + } + nonSkewPartitionIndices.clear() + } + + val leftParts = if (isLeftSkew) { + leftSkewDesc.addPartitionSize(leftSize) + createSkewPartitions( + partitionIndex, + getMapStartIndices(left, partitionIndex), + getNumMappers(left)) + } else { + Seq(SinglePartitionSpec(partitionIndex)) + } + + val rightParts = if (isRightSkew) { + rightSkewDesc.addPartitionSize(rightSize) + createSkewPartitions( + partitionIndex, + getMapStartIndices(right, partitionIndex), + getNumMappers(right)) + } else { + Seq(SinglePartitionSpec(partitionIndex)) + } + + for { + leftSidePartition <- leftParts + rightSidePartition <- rightParts + } { + leftSidePartitions += leftSidePartition + rightSidePartitions += rightSidePartition + } + } else { + // Add to `nonSkewPartitionIndices` first, and add real partitions later, in case we can + // coalesce the non-skew partitions. + nonSkewPartitionIndices += partitionIndex + // If this is the last partition, add real partition immediately. 
+ if (partitionIndex == numPartitions - 1) { + createNonSkewPartitions(leftStats, rightStats, nonSkewPartitionIndices).foreach { p => + leftSidePartitions += p + rightSidePartitions += p + } + nonSkewPartitionIndices.clear() + } + } + } + + logDebug("number of skewed partitions: " + + s"left ${leftSkewDesc.numPartitions}, right ${rightSkewDesc.numPartitions}") + if (leftSkewDesc.numPartitions > 0 || rightSkewDesc.numPartitions > 0) { + val newLeft = SkewJoinShuffleReaderExec( + left, leftSidePartitions.toArray, leftSkewDesc.toString) + val newRight = SkewJoinShuffleReaderExec( + right, rightSidePartitions.toArray, rightSkewDesc.toString) + smj.copy( + left = s1.copy(child = newLeft), right = s2.copy(child = newRight), isSkewJoin = true) + } else { + smj + } + } + + private def createNonSkewPartitions( + leftStats: MapOutputStatistics, + rightStats: MapOutputStatistics, + nonSkewPartitionIndices: Seq[Int]): Seq[ShufflePartitionSpec] = { + assert(nonSkewPartitionIndices.nonEmpty) + if (nonSkewPartitionIndices.length == 1) { + Seq(SinglePartitionSpec(nonSkewPartitionIndices.head)) + } else { + val startIndices = ShufflePartitionsCoalescer.coalescePartitions( + Array(leftStats, rightStats), + firstPartitionIndex = nonSkewPartitionIndices.head, + // `lastPartitionIndex` is exclusive. + lastPartitionIndex = nonSkewPartitionIndices.last + 1, + advisoryTargetSize = conf.targetPostShuffleInputSize) + startIndices.indices.map { i => + val startIndex = startIndices(i) + val endIndex = if (i == startIndices.length - 1) { + // `endIndex` is exclusive. + nonSkewPartitionIndices.last + 1 + } else { + startIndices(i + 1) + } + // Do not create `CoalescedPartitionSpec` if only need to read a singe partition. 
+ if (startIndex + 1 == endIndex) { + SinglePartitionSpec(startIndex) + } else { + CoalescedPartitionSpec(startIndex, endIndex) + } + } + } + } + + private def createSkewPartitions( + reducerIndex: Int, + mapStartIndices: Array[Int], + numMappers: Int): Seq[PartialPartitionSpec] = { + mapStartIndices.indices.map { i => + val startMapIndex = mapStartIndices(i) + val endMapIndex = if (i == mapStartIndices.length - 1) { + numMappers + } else { + mapStartIndices(i + 1) + } + PartialPartitionSpec(reducerIndex, startMapIndex, endMapIndex) + } + } + + override def apply(plan: SparkPlan): SparkPlan = { + if (!conf.getConf(SQLConf.ADAPTIVE_EXECUTION_SKEWED_JOIN_ENABLED)) { + return plan + } + + def collectShuffleStages(plan: SparkPlan): Seq[ShuffleQueryStageExec] = plan match { + case stage: ShuffleQueryStageExec => Seq(stage) + case _ => plan.children.flatMap(collectShuffleStages) + } + + val shuffleStages = collectShuffleStages(plan) + + if (shuffleStages.length == 2) { + // When multi table join, there will be too many complex combination to consider. + // Currently we only handle 2 table join like following two use cases. 
+ // SMJ + // Sort + // Shuffle + // Sort + // Shuffle + val optimizePlan = optimizeSkewJoin(plan) + val numShuffles = ensureRequirements.apply(optimizePlan).collect { + case e: ShuffleExchangeExec => e + }.length + + if (numShuffles > 0) { + logDebug("OptimizeSkewedJoin rule is not applied due" + + " to additional shuffles will be introduced.") + plan + } else { + optimizePlan + } + } else { + plan + } + } +} + +private class SkewDesc { + private[this] var numSkewedPartitions: Int = 0 + private[this] var totalSize: Long = 0 + private[this] var maxSize: Long = 0 + private[this] var minSize: Long = 0 + + def numPartitions: Int = numSkewedPartitions + + def addPartitionSize(size: Long): Unit = { + if (numSkewedPartitions == 0) { + maxSize = size + minSize = size + } + numSkewedPartitions += 1 + totalSize += size + if (size > maxSize) maxSize = size + if (size < minSize) minSize = size + } + + override def toString: String = { + if (numSkewedPartitions == 0) { + "no skewed partition" + } else { + val maxSizeStr = FileUtils.byteCountToDisplaySize(maxSize) + val minSizeStr = FileUtils.byteCountToDisplaySize(minSize) + val avgSizeStr = FileUtils.byteCountToDisplaySize(totalSize / numSkewedPartitions) + s"$numSkewedPartitions skewed partitions with " + + s"size(max=$maxSizeStr, min=$minSizeStr, avg=$avgSizeStr)" + } + } +} + +/** + * A wrapper of shuffle query stage, which follows the given partition arrangement. + * + * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during + * canonicalization. + * @param partitionSpecs The partition specs that defines the arrangement. + * @param skewDesc The description of the skewed partitions. 
+ */ +case class SkewJoinShuffleReaderExec( + child: SparkPlan, + partitionSpecs: Array[ShufflePartitionSpec], + skewDesc: String) extends UnaryExecNode { + + override def output: Seq[Attribute] = child.output + + override def outputPartitioning: Partitioning = { + UnknownPartitioning(partitionSpecs.length) + } + + override def stringArgs: Iterator[Any] = Iterator(skewDesc) + + private var cachedShuffleRDD: RDD[InternalRow] = null + + override protected def doExecute(): RDD[InternalRow] = { + if (cachedShuffleRDD == null) { + cachedShuffleRDD = child match { + case stage: ShuffleQueryStageExec => + new CustomShuffledRowRDD( + stage.shuffle.shuffleDependency, stage.shuffle.readMetrics, partitionSpecs) + case _ => + throw new IllegalStateException("operating on canonicalization plan") + } + } + cachedShuffleRDD + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala index 91d4359224a6a..f845b6b16ee3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala @@ -18,19 +18,28 @@ package org.apache.spark.sql.execution.adaptive import org.apache.spark.sql.catalyst.expressions -import org.apache.spark.sql.catalyst.expressions.ListQuery +import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, ListQuery, Literal} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ExecSubqueryExpression, SparkPlan} +import org.apache.spark.sql.execution +import org.apache.spark.sql.execution.{InSubqueryExec, SparkPlan, SubqueryExec} -case class PlanAdaptiveSubqueries( - subqueryMap: scala.collection.Map[Long, ExecSubqueryExpression]) extends Rule[SparkPlan] { +case class PlanAdaptiveSubqueries(subqueryMap: Map[Long, SubqueryExec]) extends 
Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { plan.transformAllExpressions { case expressions.ScalarSubquery(_, _, exprId) => - subqueryMap(exprId.id) - case expressions.InSubquery(_, ListQuery(_, _, exprId, _)) => - subqueryMap(exprId.id) + execution.ScalarSubquery(subqueryMap(exprId.id), exprId) + case expressions.InSubquery(values, ListQuery(_, _, exprId, _)) => + val expr = if (values.length == 1) { + values.head + } else { + CreateNamedStruct( + values.zipWithIndex.flatMap { case (v, index) => + Seq(Literal(s"col_$index"), v) + } + ) + } + InSubqueryExec(expr, subqueryMap(exprId.id), exprId) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala index 231fffce3360b..d5dc1be63f06e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.adaptive -import scala.collection.mutable import scala.concurrent.Future import org.apache.spark.{FutureAction, MapOutputStatistics} @@ -25,13 +24,11 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.exchange._ - /** * A query stage is an independent subgraph of the query plan. Query stage materializes its output * before proceeding with further operators of the query plan. 
The data statistics of the @@ -76,12 +73,19 @@ abstract class QueryStageExec extends LeafExecNode { doMaterialize() } + def newReuseInstance(newStageId: Int, newOutput: Seq[Attribute]): QueryStageExec + /** * Compute the statistics of the query stage if executed, otherwise None. */ def computeStats(): Option[Statistics] = resultOption.map { _ => // Metrics `dataSize` are available in both `ShuffleExchangeExec` and `BroadcastExchangeExec`. - Statistics(sizeInBytes = plan.metrics("dataSize").value) + val exchange = plan match { + case r: ReusedExchangeExec => r.child + case e: Exchange => e + case _ => throw new IllegalStateException("wrong plan for query stage:\n " + plan.treeString) + } + Statistics(sizeInBytes = exchange.metrics("dataSize").value) } @transient @@ -93,6 +97,7 @@ abstract class QueryStageExec extends LeafExecNode { override def outputOrdering: Seq[SortOrder] = plan.outputOrdering override def executeCollect(): Array[InternalRow] = plan.executeCollect() override def executeTake(n: Int): Array[InternalRow] = plan.executeTake(n) + override def executeTail(n: Int): Array[InternalRow] = plan.executeTail(n) override def executeToIterator(): Iterator[InternalRow] = plan.executeToIterator() override def doPrepare(): Unit = plan.prepare() @@ -125,27 +130,33 @@ abstract class QueryStageExec extends LeafExecNode { } /** - * A shuffle query stage whose child is a [[ShuffleExchangeExec]]. + * A shuffle query stage whose child is a [[ShuffleExchangeExec]] or [[ReusedExchangeExec]]. 
*/ case class ShuffleQueryStageExec( override val id: Int, - override val plan: ShuffleExchangeExec) extends QueryStageExec { + override val plan: SparkPlan) extends QueryStageExec { - @transient lazy val mapOutputStatisticsFuture: Future[MapOutputStatistics] = { - if (plan.inputRDD.getNumPartitions == 0) { - Future.successful(null) - } else { - sparkContext.submitMapStage(plan.shuffleDependency) - } + @transient val shuffle = plan match { + case s: ShuffleExchangeExec => s + case ReusedExchangeExec(_, s: ShuffleExchangeExec) => s + case _ => + throw new IllegalStateException("wrong plan for shuffle stage:\n " + plan.treeString) } override def doMaterialize(): Future[Any] = { - mapOutputStatisticsFuture + shuffle.mapOutputStatisticsFuture + } + + override def newReuseInstance(newStageId: Int, newOutput: Seq[Attribute]): QueryStageExec = { + ShuffleQueryStageExec( + newStageId, + ReusedExchangeExec(newOutput, shuffle)) } override def cancel(): Unit = { - mapOutputStatisticsFuture match { - case action: FutureAction[MapOutputStatistics] if !mapOutputStatisticsFuture.isCompleted => + shuffle.mapOutputStatisticsFuture match { + case action: FutureAction[MapOutputStatistics] + if !shuffle.mapOutputStatisticsFuture.isCompleted => action.cancel() case _ => } @@ -153,80 +164,33 @@ case class ShuffleQueryStageExec( } /** - * A broadcast query stage whose child is a [[BroadcastExchangeExec]]. + * A broadcast query stage whose child is a [[BroadcastExchangeExec]] or [[ReusedExchangeExec]]. 
*/ case class BroadcastQueryStageExec( override val id: Int, - override val plan: BroadcastExchangeExec) extends QueryStageExec { + override val plan: SparkPlan) extends QueryStageExec { - override def doMaterialize(): Future[Any] = { - plan.completionFuture + @transient val broadcast = plan match { + case b: BroadcastExchangeExec => b + case ReusedExchangeExec(_, b: BroadcastExchangeExec) => b + case _ => + throw new IllegalStateException("wrong plan for broadcast stage:\n " + plan.treeString) } - override def cancel(): Unit = { - if (!plan.relationFuture.isDone) { - sparkContext.cancelJobGroup(plan.runId.toString) - plan.relationFuture.cancel(true) - } - } -} - -object ShuffleQueryStageExec { - /** - * Returns true if the plan is a [[ShuffleQueryStageExec]] or a reused [[ShuffleQueryStageExec]]. - */ - def isShuffleQueryStageExec(plan: SparkPlan): Boolean = plan match { - case r: ReusedQueryStageExec => isShuffleQueryStageExec(r.plan) - case _: ShuffleQueryStageExec => true - case _ => false - } -} - -object BroadcastQueryStageExec { - /** - * Returns true if the plan is a [[BroadcastQueryStageExec]] or a reused - * [[BroadcastQueryStageExec]]. - */ - def isBroadcastQueryStageExec(plan: SparkPlan): Boolean = plan match { - case r: ReusedQueryStageExec => isBroadcastQueryStageExec(r.plan) - case _: BroadcastQueryStageExec => true - case _ => false - } -} - -/** - * A wrapper for reused query stage to have different output. 
- */ -case class ReusedQueryStageExec( - override val id: Int, - override val plan: QueryStageExec, - override val output: Seq[Attribute]) extends QueryStageExec { - override def doMaterialize(): Future[Any] = { - plan.materialize() + broadcast.completionFuture } - override def cancel(): Unit = { - plan.cancel() + override def newReuseInstance(newStageId: Int, newOutput: Seq[Attribute]): QueryStageExec = { + BroadcastQueryStageExec( + newStageId, + ReusedExchangeExec(newOutput, broadcast)) } - // `ReusedQueryStageExec` can have distinct set of output attribute ids from its child, we need - // to update the attribute ids in `outputPartitioning` and `outputOrdering`. - private lazy val updateAttr: Expression => Expression = { - val originalAttrToNewAttr = AttributeMap(plan.output.zip(output)) - e => e.transform { - case attr: Attribute => originalAttrToNewAttr.getOrElse(attr, attr) + override def cancel(): Unit = { + if (!broadcast.relationFuture.isDone) { + sparkContext.cancelJobGroup(broadcast.runId.toString) + broadcast.relationFuture.cancel(true) } } - - override def outputPartitioning: Partitioning = plan.outputPartitioning match { - case e: Expression => updateAttr(e).asInstanceOf[Partitioning] - case other => other - } - - override def outputOrdering: Seq[SortOrder] = { - plan.outputOrdering.map(updateAttr(_).asInstanceOf[SortOrder]) - } - - override def computeStats(): Option[Statistics] = plan.computeStats() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala index 1a85d5c02075b..5bbcb14e008d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ReduceNumShufflePartitions.scala @@ -15,10 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.execution.adaptive.rule - -import scala.collection.mutable.ArrayBuffer -import scala.concurrent.duration.Duration +package org.apache.spark.sql.execution.adaptive import org.apache.spark.MapOutputStatistics import org.apache.spark.rdd.RDD @@ -27,29 +24,11 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{ShuffledRowRDD, SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.adaptive.{QueryStageExec, ReusedQueryStageExec, ShuffleQueryStageExec} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.ThreadUtils /** - * A rule to adjust the post shuffle partitions based on the map output statistics. - * - * The strategy used to determine the number of post-shuffle partitions is described as follows. - * To determine the number of post-shuffle partitions, we have a target input size for a - * post-shuffle partition. Once we have size statistics of all pre-shuffle partitions, we will do - * a pass of those statistics and pack pre-shuffle partitions with continuous indices to a single - * post-shuffle partition until adding another pre-shuffle partition would cause the size of a - * post-shuffle partition to be greater than the target size. 
- * - * For example, we have two stages with the following pre-shuffle partition size statistics: - * stage 1: [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] - * stage 2: [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] - * assuming the target input size is 128 MiB, we will have four post-shuffle partitions, - * which are: - * - post-shuffle partition 0: pre-shuffle partition 0 (size 110 MiB) - * - post-shuffle partition 1: pre-shuffle partition 1 (size 30 MiB) - * - post-shuffle partition 2: pre-shuffle partition 2 (size 170 MiB) - * - post-shuffle partition 3: pre-shuffle partition 3 and 4 (size 50 MiB) + * A rule to reduce the post shuffle partitions based on the map output statistics, which can + * avoid many small reduce tasks that hurt performance. */ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { @@ -64,19 +43,22 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { return plan } - val shuffleStages = plan.collect { - case stage: ShuffleQueryStageExec => stage - case ReusedQueryStageExec(_, stage: ShuffleQueryStageExec, _) => stage + def collectShuffleStages(plan: SparkPlan): Seq[ShuffleQueryStageExec] = plan match { + case _: LocalShuffleReaderExec => Nil + case _: SkewJoinShuffleReaderExec => Nil + case stage: ShuffleQueryStageExec => Seq(stage) + case _ => plan.children.flatMap(collectShuffleStages) } + + val shuffleStages = collectShuffleStages(plan) // ShuffleExchanges introduced by repartition do not support changing the number of partitions. // We change the number of partitions in the stage only if all the ShuffleExchanges support it. 
- if (!shuffleStages.forall(_.plan.canChangeNumPartitions)) { + if (!shuffleStages.forall(_.shuffle.canChangeNumPartitions)) { plan } else { val shuffleMetrics = shuffleStages.map { stage => - val metricsFuture = stage.mapOutputStatisticsFuture - assert(metricsFuture.isCompleted, "ShuffleQueryStageExec should already be ready") - ThreadUtils.awaitResult(metricsFuture, Duration.Zero) + assert(stage.resultOption.isDefined, "ShuffleQueryStageExec should already be ready") + stage.resultOption.get.asInstanceOf[MapOutputStatistics] } // `ShuffleQueryStageExec` gives null mapOutputStatistics when the input RDD has 0 partitions, @@ -88,13 +70,19 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { val distinctNumPreShufflePartitions = validMetrics.map(stats => stats.bytesByPartitionId.length).distinct if (validMetrics.nonEmpty && distinctNumPreShufflePartitions.length == 1) { - val partitionStartIndices = estimatePartitionStartIndices(validMetrics.toArray) + val partitionStartIndices = ShufflePartitionsCoalescer.coalescePartitions( + validMetrics.toArray, + firstPartitionIndex = 0, + lastPartitionIndex = distinctNumPreShufflePartitions.head, + advisoryTargetSize = conf.targetPostShuffleInputSize, + minNumPartitions = conf.minNumPostShufflePartitions) // This transformation adds new nodes, so we must use `transformUp` here. + val stageIds = shuffleStages.map(_.id).toSet plan.transformUp { // even for shuffle exchange whose input RDD has 0 partition, we should still update its // `partitionStartIndices`, so that all the leaf shuffles in a stage have the same // number of output partitions. 
- case stage: QueryStageExec if ShuffleQueryStageExec.isShuffleQueryStageExec(stage) => + case stage: ShuffleQueryStageExec if stageIds.contains(stage.id) => CoalescedShuffleReaderExec(stage, partitionStartIndices) } } else { @@ -102,90 +90,22 @@ case class ReduceNumShufflePartitions(conf: SQLConf) extends Rule[SparkPlan] { } } } - - /** - * Estimates partition start indices for post-shuffle partitions based on - * mapOutputStatistics provided by all pre-shuffle stages. - */ - // visible for testing. - private[sql] def estimatePartitionStartIndices( - mapOutputStatistics: Array[MapOutputStatistics]): Array[Int] = { - val minNumPostShufflePartitions = conf.minNumPostShufflePartitions - val advisoryTargetPostShuffleInputSize = conf.targetPostShuffleInputSize - // If minNumPostShufflePartitions is defined, it is possible that we need to use a - // value less than advisoryTargetPostShuffleInputSize as the target input size of - // a post shuffle task. - val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum - // The max at here is to make sure that when we have an empty table, we - // only have a single post-shuffle partition. - // There is no particular reason that we pick 16. We just need a number to - // prevent maxPostShuffleInputSize from being set to 0. - val maxPostShuffleInputSize = math.max( - math.ceil(totalPostShuffleInputSize / minNumPostShufflePartitions.toDouble).toLong, 16) - val targetPostShuffleInputSize = - math.min(maxPostShuffleInputSize, advisoryTargetPostShuffleInputSize) - - logInfo( - s"advisoryTargetPostShuffleInputSize: $advisoryTargetPostShuffleInputSize, " + - s"targetPostShuffleInputSize $targetPostShuffleInputSize.") - - // Make sure we do get the same number of pre-shuffle partitions for those stages. 
- val distinctNumPreShufflePartitions = - mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct - // The reason that we are expecting a single value of the number of pre-shuffle partitions - // is that when we add Exchanges, we set the number of pre-shuffle partitions - // (i.e. map output partitions) using a static setting, which is the value of - // spark.sql.shuffle.partitions. Even if two input RDDs are having different - // number of partitions, they will have the same number of pre-shuffle partitions - // (i.e. map output partitions). - assert( - distinctNumPreShufflePartitions.length == 1, - "There should be only one distinct value of the number pre-shuffle partitions " + - "among registered Exchange operator.") - val numPreShufflePartitions = distinctNumPreShufflePartitions.head - - val partitionStartIndices = ArrayBuffer[Int]() - // The first element of partitionStartIndices is always 0. - partitionStartIndices += 0 - - var postShuffleInputSize = 0L - - var i = 0 - while (i < numPreShufflePartitions) { - // We calculate the total size of ith pre-shuffle partitions from all pre-shuffle stages. - // Then, we add the total size to postShuffleInputSize. - var nextShuffleInputSize = 0L - var j = 0 - while (j < mapOutputStatistics.length) { - nextShuffleInputSize += mapOutputStatistics(j).bytesByPartitionId(i) - j += 1 - } - - // If including the nextShuffleInputSize would exceed the target partition size, then start a - // new partition. - if (i > 0 && postShuffleInputSize + nextShuffleInputSize > targetPostShuffleInputSize) { - partitionStartIndices += i - // reset postShuffleInputSize. - postShuffleInputSize = nextShuffleInputSize - } else { - postShuffleInputSize += nextShuffleInputSize - } - - i += 1 - } - - partitionStartIndices.toArray - } } +/** + * A wrapper of shuffle query stage, which submits fewer reduce task as one reduce task may read + * multiple shuffle partitions. 
This can avoid many small reduce tasks that hurt performance. + * + * @param child It's usually `ShuffleQueryStageExec`, but can be the shuffle exchange node during + * canonicalization. + * @param partitionStartIndices The start partition indices for the coalesced partitions. + */ case class CoalescedShuffleReaderExec( - child: QueryStageExec, + child: SparkPlan, partitionStartIndices: Array[Int]) extends UnaryExecNode { override def output: Seq[Attribute] = child.output - override def doCanonicalize(): SparkPlan = child.canonicalized - override def outputPartitioning: Partitioning = { UnknownPartitioning(partitionStartIndices.length) } @@ -196,9 +116,9 @@ case class CoalescedShuffleReaderExec( if (cachedShuffleRDD == null) { cachedShuffleRDD = child match { case stage: ShuffleQueryStageExec => - stage.plan.createShuffledRDD(Some(partitionStartIndices)) - case ReusedQueryStageExec(_, stage: ShuffleQueryStageExec, _) => - stage.plan.createShuffledRDD(Some(partitionStartIndices)) + stage.shuffle.createShuffledRDD(Some(partitionStartIndices)) + case _ => + throw new IllegalStateException("operating on canonicalization plan") } } cachedShuffleRDD diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala new file mode 100644 index 0000000000000..18f0585524aa2 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ShufflePartitionsCoalescer.scala @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.adaptive + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.MapOutputStatistics +import org.apache.spark.internal.Logging + +object ShufflePartitionsCoalescer extends Logging { + + /** + * Coalesce the same range of partitions (`firstPartitionIndex`` to `lastPartitionIndex`, the + * start is inclusive and the end is exclusive) from multiple shuffles. This method assumes that + * all the shuffles have the same number of partitions, and the partitions of same index will be + * read together by one task. + * + * The strategy used to determine the number of coalesced partitions is described as follows. + * To determine the number of coalesced partitions, we have a target size for a coalesced + * partition. Once we have size statistics of all shuffle partitions, we will do + * a pass of those statistics and pack shuffle partitions with continuous indices to a single + * coalesced partition until adding another shuffle partition would cause the size of a + * coalesced partition to be greater than the target size. 
+ * + * For example, we have two shuffles with the following partition size statistics: + * - shuffle 1 (5 partitions): [100 MiB, 20 MiB, 100 MiB, 10MiB, 30 MiB] + * - shuffle 2 (5 partitions): [10 MiB, 10 MiB, 70 MiB, 5 MiB, 5 MiB] + * Assuming the target size is 128 MiB, we will have 4 coalesced partitions, which are: + * - coalesced partition 0: shuffle partition 0 (size 110 MiB) + * - coalesced partition 1: shuffle partition 1 (size 30 MiB) + * - coalesced partition 2: shuffle partition 2 (size 170 MiB) + * - coalesced partition 3: shuffle partition 3 and 4 (size 50 MiB) + * + * @return An array of partition indices which represents the coalesced partitions. For example, + * [0, 2, 3] means 3 coalesced partitions: [0, 2), [2, 3), [3, lastPartitionIndex] + */ + def coalescePartitions( + mapOutputStatistics: Array[MapOutputStatistics], + firstPartitionIndex: Int, + lastPartitionIndex: Int, + advisoryTargetSize: Long, + minNumPartitions: Int = 1): Array[Int] = { + // If `minNumPartitions` is very large, it is possible that we need to use a value less than + // `advisoryTargetSize` as the target size of a coalesced task. + val totalPostShuffleInputSize = mapOutputStatistics.map(_.bytesByPartitionId.sum).sum + // The max at here is to make sure that when we have an empty table, we only have a single + // coalesced partition. + // There is no particular reason that we pick 16. We just need a number to prevent + // `maxTargetSize` from being set to 0. + val maxTargetSize = math.max( + math.ceil(totalPostShuffleInputSize / minNumPartitions.toDouble).toLong, 16) + val targetSize = math.min(maxTargetSize, advisoryTargetSize) + + logInfo(s"advisory target size: $advisoryTargetSize, actual target size $targetSize.") + + // Make sure these shuffles have the same number of partitions. 
+ val distinctNumShufflePartitions = + mapOutputStatistics.map(stats => stats.bytesByPartitionId.length).distinct + // The reason that we are expecting a single value of the number of shuffle partitions + // is that when we add Exchanges, we set the number of shuffle partitions + // (i.e. map output partitions) using a static setting, which is the value of + // `spark.sql.shuffle.partitions`. Even if two input RDDs are having different + // number of partitions, they will have the same number of shuffle partitions + // (i.e. map output partitions). + assert( + distinctNumShufflePartitions.length == 1, + "There should be only one distinct value of the number of shuffle partitions " + + "among registered Exchange operators.") + + val splitPoints = ArrayBuffer[Int]() + splitPoints += firstPartitionIndex + var coalescedSize = 0L + var i = firstPartitionIndex + while (i < lastPartitionIndex) { + // We calculate the total size of i-th shuffle partitions from all shuffles. + var totalSizeOfCurrentPartition = 0L + var j = 0 + while (j < mapOutputStatistics.length) { + totalSizeOfCurrentPartition += mapOutputStatistics(j).bytesByPartitionId(i) + j += 1 + } + + // If including the `totalSizeOfCurrentPartition` would exceed the target size, then start a + // new coalesced partition. + if (i > firstPartitionIndex && coalescedSize + totalSizeOfCurrentPartition > targetSize) { + splitPoints += i + // reset postShuffleInputSize. 
+ coalescedSize = totalSizeOfCurrentPartition + } else { + coalescedSize += totalSizeOfCurrentPartition + } + i += 1 + } + + splitPoints.toArray + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala index 4d762c5ea9f34..56a287d4d0279 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggUtils.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.optimizer.NormalizeFloatingNumbers import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.{StateStoreRestoreExec, StateStoreSaveExec} @@ -27,6 +26,22 @@ import org.apache.spark.sql.execution.streaming.{StateStoreRestoreExec, StateSto * Utility functions used by the query planner to convert our plan to new aggregation code path. */ object AggUtils { + + private def mayRemoveAggFilters(exprs: Seq[AggregateExpression]): Seq[AggregateExpression] = { + exprs.map { ae => + if (ae.filter.isDefined) { + ae.mode match { + // Aggregate filters are applicable only in partial/complete modes; + // this method filters out them, otherwise. 
+ case Partial | Complete => ae + case _ => ae.copy(filter = None) + } + } else { + ae + } + } + } + private def createAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]] = None, groupingExpressions: Seq[NamedExpression] = Nil, @@ -41,7 +56,7 @@ object AggUtils { HashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, - aggregateExpressions = aggregateExpressions, + aggregateExpressions = mayRemoveAggFilters(aggregateExpressions), aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, @@ -54,7 +69,7 @@ object AggUtils { ObjectHashAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, - aggregateExpressions = aggregateExpressions, + aggregateExpressions = mayRemoveAggFilters(aggregateExpressions), aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, @@ -63,7 +78,7 @@ object AggUtils { SortAggregateExec( requiredChildDistributionExpressions = requiredChildDistributionExpressions, groupingExpressions = groupingExpressions, - aggregateExpressions = aggregateExpressions, + aggregateExpressions = mayRemoveAggFilters(aggregateExpressions), aggregateAttributes = aggregateAttributes, initialInputBufferOffset = initialInputBufferOffset, resultExpressions = resultExpressions, @@ -174,7 +189,7 @@ object AggUtils { // Children of an AggregateFunction with DISTINCT keyword has already // been evaluated. At here, we need to replace original children // to AttributeReferences. 
- case agg @ AggregateExpression(aggregateFunction, mode, true, _) => + case agg @ AggregateExpression(aggregateFunction, mode, true, _, _) => aggregateFunction.transformDown(distinctColumnAttributeLookup) .asInstanceOf[AggregateFunction] case agg => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala index d03de1507fbbd..527a9eac9948e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala @@ -157,19 +157,44 @@ abstract class AggregationIterator( inputAttributes: Seq[Attribute]): (InternalRow, InternalRow) => Unit = { val joinedRow = new JoinedRow if (expressions.nonEmpty) { - val mergeExpressions = functions.zip(expressions).flatMap { - case (ae: DeclarativeAggregate, expression) => - expression.mode match { - case Partial | Complete => ae.updateExpressions - case PartialMerge | Final => ae.mergeExpressions + val mergeExpressions = + functions.zip(expressions.map(ae => (ae.mode, ae.isDistinct, ae.filter))).flatMap { + case (ae: DeclarativeAggregate, (mode, isDistinct, filter)) => + mode match { + case Partial | Complete => + if (filter.isDefined) { + ae.updateExpressions.zip(ae.aggBufferAttributes).map { + case (updateExpr, attr) => If(filter.get, updateExpr, attr) + } + } else { + ae.updateExpressions + } + case PartialMerge | Final => ae.mergeExpressions + } + case (agg: AggregateFunction, _) => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } + // Initialize predicates for aggregate functions if necessary + val predicateOptions = expressions.map { + case AggregateExpression(_, mode, _, Some(filter), _) => + mode match { + case Partial | Complete => + val predicate = Predicate.create(filter, inputAttributes) + predicate.initialize(partIndex) + Some(predicate) + case _ => 
None } - case (agg: AggregateFunction, _) => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + case _ => None } val updateFunctions = functions.zipWithIndex.collect { case (ae: ImperativeAggregate, i) => expressions(i).mode match { case Partial | Complete => - (buffer: InternalRow, row: InternalRow) => ae.update(buffer, row) + if (predicateOptions(i).isDefined) { + (buffer: InternalRow, row: InternalRow) => + if (predicateOptions(i).get.eval(row)) { ae.update(buffer, row) } + } else { + (buffer: InternalRow, row: InternalRow) => ae.update(buffer, row) + } case PartialMerge | Final => (buffer: InternalRow, row: InternalRow) => ae.merge(buffer, row) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala new file mode 100644 index 0000000000000..0eaa0f53fdacd --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/BaseAggregateExec.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.aggregate + +import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.execution.{ExplainUtils, UnaryExecNode} + +/** + * Holds common logic for aggregate operators + */ +trait BaseAggregateExec extends UnaryExecNode { + def groupingExpressions: Seq[NamedExpression] + def aggregateExpressions: Seq[AggregateExpression] + def aggregateAttributes: Seq[Attribute] + def resultExpressions: Seq[NamedExpression] + + override def verboseStringWithOperatorId(): String = { + val inputString = child.output.mkString("[", ", ", "]") + val keyString = groupingExpressions.mkString("[", ", ", "]") + val functionString = aggregateExpressions.mkString("[", ", ", "]") + val aggregateAttributeString = aggregateAttributes.mkString("[", ", ", "]") + val resultString = resultExpressions.mkString("[", ", ", "]") + s""" + |(${ExplainUtils.getOpId(this)}) $nodeName ${ExplainUtils.getCodegenId(this)} + |Input: $inputString + |Keys: $keyString + |Functions: $functionString + |Aggregate Attributes: $aggregateAttributeString + |Results: $resultString + """.stripMargin + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 4a95f76381339..7a26fd7a8541a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.aggregate import java.util.concurrent.TimeUnit._ +import scala.collection.mutable + import org.apache.spark.TaskContext import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager} import org.apache.spark.rdd.RDD @@ -30,13 +32,13 @@ import 
org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeConstants.NANOS_PER_MILLIS import org.apache.spark.sql.catalyst.util.truncatedString import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{DecimalType, StringType, StructType} +import org.apache.spark.sql.types.{CalendarIntervalType, DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils @@ -51,7 +53,7 @@ case class HashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with BlockingOperatorWithCodegen { + extends BaseAggregateExec with BlockingOperatorWithCodegen with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -73,7 +75,7 @@ case class HashAggregateExec( override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute) - override def outputPartitioning: Partitioning = child.outputPartitioning + override protected def outputExpressions: Seq[NamedExpression] = resultExpressions override def producedAttributes: AttributeSet = AttributeSet(aggregateAttributes) ++ @@ -124,7 +126,7 @@ case class HashAggregateExec( initialInputBufferOffset, resultExpressions, (expressions, inputSchema) => - newMutableProjection(expressions, inputSchema, subexpressionEliminationEnabled), + MutableProjection.create(expressions, inputSchema), child.output, iter, testFallbackStartsAt, @@ -150,8 +152,10 @@ 
case class HashAggregateExec( override def usedInputs: AttributeSet = inputSet override def supportCodegen: Boolean = { - // ImperativeAggregate is not supported right now - !aggregateExpressions.exists(_.aggregateFunction.isInstanceOf[ImperativeAggregate]) + // ImperativeAggregate and filter predicate are not supported right now + // TODO: SPARK-30027 Support codegen for filter exprs in HashAggregateExec + !(aggregateExpressions.exists(_.aggregateFunction.isInstanceOf[ImperativeAggregate]) || + aggregateExpressions.exists(_.filter.isDefined)) } override def inputRDDs(): Seq[RDD[InternalRow]] = { @@ -174,8 +178,9 @@ case class HashAggregateExec( } } - // The variables used as aggregation buffer. Only used for aggregation without keys. - private var bufVars: Seq[ExprCode] = _ + // The variables are used as aggregation buffers and each aggregate function has one or more + // ExprCode to initialize its buffer slots. Only used for aggregation without keys. + private var bufVars: Seq[Seq[ExprCode]] = _ private def doProduceWithoutKeys(ctx: CodegenContext): String = { val initAgg = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initAgg") @@ -184,27 +189,30 @@ case class HashAggregateExec( // generate variables for aggregation buffer val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) - val initExpr = functions.flatMap(f => f.initialValues) - bufVars = initExpr.map { e => - val isNull = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "bufIsNull") - val value = ctx.addMutableState(CodeGenerator.javaType(e.dataType), "bufValue") - // The initial expression should not access any column - val ev = e.genCode(ctx) - val initVars = code""" - | $isNull = ${ev.isNull}; - | $value = ${ev.value}; - """.stripMargin - ExprCode( - ev.code + initVars, - JavaCode.isNullGlobal(isNull), - JavaCode.global(value, e.dataType)) + val initExpr = functions.map(f => f.initialValues) + bufVars = initExpr.map { exprs => + exprs.map { e => + val isNull 
= ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "bufIsNull") + val value = ctx.addMutableState(CodeGenerator.javaType(e.dataType), "bufValue") + // The initial expression should not access any column + val ev = e.genCode(ctx) + val initVars = code""" + |$isNull = ${ev.isNull}; + |$value = ${ev.value}; + """.stripMargin + ExprCode( + ev.code + initVars, + JavaCode.isNullGlobal(isNull), + JavaCode.global(value, e.dataType)) + } } - val initBufVar = evaluateVariables(bufVars) + val flatBufVars = bufVars.flatten + val initBufVar = evaluateVariables(flatBufVars) // generate variables for output val (resultVars, genResult) = if (modes.contains(Final) || modes.contains(Complete)) { // evaluate aggregate results - ctx.currentVars = bufVars + ctx.currentVars = flatBufVars val aggResults = bindReferences( functions.map(_.evaluateExpression), aggregateBufferAttributes).map(_.genCode(ctx)) @@ -218,7 +226,7 @@ case class HashAggregateExec( """.stripMargin) } else if (modes.contains(Partial) || modes.contains(PartialMerge)) { // output the aggregate buffer directly - (bufVars, "") + (flatBufVars, "") } else { // no aggregate function, the result should be literals val resultVars = resultExpressions.map(_.genCode(ctx)) @@ -228,38 +236,106 @@ case class HashAggregateExec( val doAgg = ctx.freshName("doAggregateWithoutKey") val doAggFuncName = ctx.addNewFunction(doAgg, s""" - | private void $doAgg() throws java.io.IOException { - | // initialize aggregation buffer - | $initBufVar + |private void $doAgg() throws java.io.IOException { + | // initialize aggregation buffer + | $initBufVar | - | ${child.asInstanceOf[CodegenSupport].produce(ctx, this)} - | } + | ${child.asInstanceOf[CodegenSupport].produce(ctx, this)} + |} """.stripMargin) val numOutput = metricTerm(ctx, "numOutputRows") val aggTime = metricTerm(ctx, "aggTime") val beforeAgg = ctx.freshName("beforeAgg") s""" - | while (!$initAgg) { - | $initAgg = true; - | long $beforeAgg = System.nanoTime(); - | $doAggFuncName(); - | 
$aggTime.add((System.nanoTime() - $beforeAgg) / $NANOS_PER_MILLIS); + |while (!$initAgg) { + | $initAgg = true; + | long $beforeAgg = System.nanoTime(); + | $doAggFuncName(); + | $aggTime.add((System.nanoTime() - $beforeAgg) / $NANOS_PER_MILLIS); | - | // output the result - | ${genResult.trim} + | // output the result + | ${genResult.trim} | - | $numOutput.add(1); - | ${consume(ctx, resultVars).trim} - | } + | $numOutput.add(1); + | ${consume(ctx, resultVars).trim} + |} """.stripMargin } + // Splits aggregate code into small functions because the most of JVM implementations + // can not compile too long functions. Returns None if we are not able to split the given code. + // + // Note: The difference from `CodeGenerator.splitExpressions` is that we define an individual + // function for each aggregation function (e.g., SUM and AVG). For example, in a query + // `SELECT SUM(a), AVG(a) FROM VALUES(1) t(a)`, we define two functions + // for `SUM(a)` and `AVG(a)`. + private def splitAggregateExpressions( + ctx: CodegenContext, + aggNames: Seq[String], + aggBufferUpdatingExprs: Seq[Seq[Expression]], + aggCodeBlocks: Seq[Block], + subExprs: Map[Expression, SubExprEliminationState]): Option[String] = { + val exprValsInSubExprs = subExprs.flatMap { case (_, s) => s.value :: s.isNull :: Nil } + if (exprValsInSubExprs.exists(_.isInstanceOf[SimpleExprValue])) { + // `SimpleExprValue`s cannot be used as an input variable for split functions, so + // we give up splitting functions if it exists in `subExprs`. 
+ None + } else { + val inputVars = aggBufferUpdatingExprs.map { aggExprsForOneFunc => + val inputVarsForOneFunc = aggExprsForOneFunc.map( + CodeGenerator.getLocalInputVariableValues(ctx, _, subExprs)).reduce(_ ++ _).toSeq + val paramLength = CodeGenerator.calculateParamLengthFromExprValues(inputVarsForOneFunc) + + // Checks if a parameter length for the `aggExprsForOneFunc` does not go over the JVM limit + if (CodeGenerator.isValidParamLength(paramLength)) { + Some(inputVarsForOneFunc) + } else { + None + } + } + + // Checks if all the aggregate code can be split into pieces. + // If the parameter length of at lease one `aggExprsForOneFunc` goes over the limit, + // we totally give up splitting aggregate code. + if (inputVars.forall(_.isDefined)) { + val splitCodes = inputVars.flatten.zipWithIndex.map { case (args, i) => + val doAggFunc = ctx.freshName(s"doAggregate_${aggNames(i)}") + val argList = args.map { v => + s"${CodeGenerator.typeName(v.javaType)} ${v.variableName}" + }.mkString(", ") + val doAggFuncName = ctx.addNewFunction(doAggFunc, + s""" + |private void $doAggFunc($argList) throws java.io.IOException { + | ${aggCodeBlocks(i)} + |} + """.stripMargin) + + val inputVariables = args.map(_.variableName).mkString(", ") + s"$doAggFuncName($inputVariables);" + } + Some(splitCodes.mkString("\n").trim) + } else { + val errMsg = "Failed to split aggregate code into small functions because the parameter " + + "length of at least one split function went over the JVM limit: " + + CodeGenerator.MAX_JVM_METHOD_PARAMS_LENGTH + if (Utils.isTesting) { + throw new IllegalStateException(errMsg) + } else { + logInfo(errMsg) + None + } + } + } + } + private def doConsumeWithoutKeys(ctx: CodegenContext, input: Seq[ExprCode]): String = { // only have DeclarativeAggregate val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) val inputAttrs = functions.flatMap(_.aggBufferAttributes) ++ child.output - val updateExpr = 
aggregateExpressions.flatMap { e => + // To individually generate code for each aggregate function, an element in `updateExprs` holds + // all the expressions for the buffer of an aggregation function. + val updateExprs = aggregateExpressions.map { e => e.mode match { case Partial | Complete => e.aggregateFunction.asInstanceOf[DeclarativeAggregate].updateExpressions @@ -267,28 +343,56 @@ case class HashAggregateExec( e.aggregateFunction.asInstanceOf[DeclarativeAggregate].mergeExpressions } } - ctx.currentVars = bufVars ++ input - val boundUpdateExpr = bindReferences(updateExpr, inputAttrs) - val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) + ctx.currentVars = bufVars.flatten ++ input + val boundUpdateExprs = updateExprs.map { updateExprsForOneFunc => + bindReferences(updateExprsForOneFunc, inputAttrs) + } + val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExprs.flatten) val effectiveCodes = subExprs.codes.mkString("\n") - val aggVals = ctx.withSubExprEliminationExprs(subExprs.states) { - boundUpdateExpr.map(_.genCode(ctx)) + val bufferEvals = boundUpdateExprs.map { boundUpdateExprsForOneFunc => + ctx.withSubExprEliminationExprs(subExprs.states) { + boundUpdateExprsForOneFunc.map(_.genCode(ctx)) + } } - // aggregate buffer should be updated atomic - val updates = aggVals.zipWithIndex.map { case (ev, i) => - s""" - | ${bufVars(i).isNull} = ${ev.isNull}; - | ${bufVars(i).value} = ${ev.value}; + + val aggNames = functions.map(_.prettyName) + val aggCodeBlocks = bufferEvals.zipWithIndex.map { case (bufferEvalsForOneFunc, i) => + val bufVarsForOneFunc = bufVars(i) + // All the update code for aggregation buffers should be placed in the end + // of each aggregation function code. 
+ val updates = bufferEvalsForOneFunc.zip(bufVarsForOneFunc).map { case (ev, bufVar) => + s""" + |${bufVar.isNull} = ${ev.isNull}; + |${bufVar.value} = ${ev.value}; + """.stripMargin + } + code""" + |// do aggregate for ${aggNames(i)} + |// evaluate aggregate function + |${evaluateVariables(bufferEvalsForOneFunc)} + |// update aggregation buffers + |${updates.mkString("\n").trim} """.stripMargin } + + val codeToEvalAggFunc = if (conf.codegenSplitAggregateFunc && + aggCodeBlocks.map(_.length).sum > conf.methodSplitThreshold) { + val maybeSplitCode = splitAggregateExpressions( + ctx, aggNames, boundUpdateExprs, aggCodeBlocks, subExprs.states) + + maybeSplitCode.getOrElse { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code + } + } else { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code + } + s""" - | // do aggregate - | // common sub-expressions - | $effectiveCodes - | // evaluate aggregate function - | ${evaluateVariables(aggVals)} - | // update aggregation buffer - | ${updates.mkString("\n").trim} + |// do aggregate + |// common sub-expressions + |$effectiveCodes + |// evaluate aggregate functions and update aggregation buffers + |$codeToEvalAggFunc """.stripMargin } @@ -384,10 +488,9 @@ case class HashAggregateExec( // Create a MutableProjection to merge the rows of same key together val mergeExpr = declFunctions.flatMap(_.mergeExpressions) - val mergeProjection = newMutableProjection( + val mergeProjection = MutableProjection.create( mergeExpr, - aggregateBufferAttributes ++ declFunctions.flatMap(_.inputAggBufferAttributes), - subexpressionEliminationEnabled) + aggregateBufferAttributes ++ declFunctions.flatMap(_.inputAggBufferAttributes)) val joinedRow = new JoinedRow() var currentKey: UnsafeRow = null @@ -473,12 +576,12 @@ case class HashAggregateExec( val evaluateNondeterministicResults = evaluateNondeterministicVariables(output, resultVars, resultExpressions) s""" - $evaluateKeyVars - $evaluateBufferVars - $evaluateAggResults - $evaluateNondeterministicResults - 
${consume(ctx, resultVars)} - """ + |$evaluateKeyVars + |$evaluateBufferVars + |$evaluateAggResults + |$evaluateNondeterministicResults + |${consume(ctx, resultVars)} + """.stripMargin } else if (modes.contains(Partial) || modes.contains(PartialMerge)) { // resultExpressions are Attributes of groupingExpressions and aggregateBufferAttributes. assert(resultExpressions.forall(_.isInstanceOf[Attribute])) @@ -505,10 +608,10 @@ case class HashAggregateExec( resultExpressions, inputAttrs).map(_.genCode(ctx)) s""" - $evaluateKeyVars - $evaluateResultBufferVars - ${consume(ctx, resultVars)} - """ + |$evaluateKeyVars + |$evaluateResultBufferVars + |${consume(ctx, resultVars)} + """.stripMargin } else { // generate result based on grouping key ctx.INPUT_ROW = keyTerm @@ -519,18 +622,18 @@ case class HashAggregateExec( val evaluateNondeterministicResults = evaluateNondeterministicVariables(output, resultVars, resultExpressions) s""" - $evaluateNondeterministicResults - ${consume(ctx, resultVars)} - """ + |$evaluateNondeterministicResults + |${consume(ctx, resultVars)} + """.stripMargin } ctx.addNewFunction(funcName, s""" - private void $funcName(UnsafeRow $keyTerm, UnsafeRow $bufferTerm) - throws java.io.IOException { - $numOutput.add(1); - $body - } - """) + |private void $funcName(UnsafeRow $keyTerm, UnsafeRow $bufferTerm) + | throws java.io.IOException { + | $numOutput.add(1); + | $body + |} + """.stripMargin) } /** @@ -542,7 +645,8 @@ case class HashAggregateExec( private def checkIfFastHashMapSupported(ctx: CodegenContext): Boolean = { val isSupported = (groupingKeySchema ++ bufferSchema).forall(f => CodeGenerator.isPrimitiveType(f.dataType) || - f.dataType.isInstanceOf[DecimalType] || f.dataType.isInstanceOf[StringType]) && + f.dataType.isInstanceOf[DecimalType] || f.dataType.isInstanceOf[StringType] || + f.dataType.isInstanceOf[CalendarIntervalType]) && bufferSchema.nonEmpty && modes.forall(mode => mode == Partial || mode == PartialMerge) // For vectorized hash map, We 
do not support byte array based decimal type for aggregate values @@ -554,7 +658,7 @@ case class HashAggregateExec( val isNotByteArrayDecimalType = bufferSchema.map(_.dataType).filter(_.isInstanceOf[DecimalType]) .forall(!DecimalType.isByteArrayDecimalType(_)) - isSupported && isNotByteArrayDecimalType + isSupported && isNotByteArrayDecimalType } private def enableTwoLevelHashMap(ctx: CodegenContext): Unit = { @@ -583,9 +687,9 @@ case class HashAggregateExec( val thisPlan = ctx.addReferenceObj("plan", this) - // Create a name for the iterator from the fast hash map. - val iterTermForFastHashMap = if (isFastHashMapEnabled) { - // Generates the fast hash map class and creates the fash hash map term. + // Create a name for the iterator from the fast hash map, and the code to create fast hash map. + val (iterTermForFastHashMap, createFastHashMap) = if (isFastHashMapEnabled) { + // Generates the fast hash map class and creates the fast hash map term. val fastHashMapClassName = ctx.freshName("FastHashMap") if (isVectorizedHashMapEnabled) { val generatedMap = new VectorizedHashMapGenerator(ctx, aggregateExpressions, @@ -593,25 +697,30 @@ case class HashAggregateExec( ctx.addInnerClass(generatedMap) // Inline mutable state since not many aggregation operations in a task - fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "vectorizedHastHashMap", - v => s"$v = new $fastHashMapClassName();", forceInline = true) - ctx.addMutableState(s"java.util.Iterator", "vectorizedFastHashMapIter", + fastHashMapTerm = ctx.addMutableState( + fastHashMapClassName, "vectorizedFastHashMap", forceInline = true) + val iter = ctx.addMutableState( + "java.util.Iterator", + "vectorizedFastHashMapIter", forceInline = true) + val create = s"$fastHashMapTerm = new $fastHashMapClassName();" + (iter, create) } else { val generatedMap = new RowBasedHashMapGenerator(ctx, aggregateExpressions, fastHashMapClassName, groupingKeySchema, bufferSchema, bitMaxCapacity).generate() 
ctx.addInnerClass(generatedMap) // Inline mutable state since not many aggregation operations in a task - fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "fastHashMap", - v => s"$v = new $fastHashMapClassName(" + - s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());", - forceInline = true) - ctx.addMutableState( + fastHashMapTerm = ctx.addMutableState( + fastHashMapClassName, "fastHashMap", forceInline = true) + val iter = ctx.addMutableState( "org.apache.spark.unsafe.KVIterator", "fastHashMapIter", forceInline = true) + val create = s"$fastHashMapTerm = new $fastHashMapClassName(" + + s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());" + (iter, create) } - } + } else ("", "") // Create a name for the iterator from the regular hash map. // Inline mutable state since not many aggregation operations in a task @@ -619,8 +728,7 @@ case class HashAggregateExec( "mapIter", forceInline = true) // create hashMap val hashMapClassName = classOf[UnsafeFixedWidthAggregationMap].getName - hashMapTerm = ctx.addMutableState(hashMapClassName, "hashMap", - v => s"$v = $thisPlan.createHashMap();", forceInline = true) + hashMapTerm = ctx.addMutableState(hashMapClassName, "hashMap", forceInline = true) sorterTerm = ctx.addMutableState(classOf[UnsafeKVExternalSorter].getName, "sorter", forceInline = true) @@ -721,17 +829,18 @@ case class HashAggregateExec( val aggTime = metricTerm(ctx, "aggTime") val beforeAgg = ctx.freshName("beforeAgg") s""" - if (!$initAgg) { - $initAgg = true; - long $beforeAgg = System.nanoTime(); - $doAggFuncName(); - $aggTime.add((System.nanoTime() - $beforeAgg) / $NANOS_PER_MILLIS); - } - - // output the result - $outputFromFastHashMap - $outputFromRegularHashMap - """ + |if (!$initAgg) { + | $initAgg = true; + | $createFastHashMap + | $hashMapTerm = $thisPlan.createHashMap(); + | long $beforeAgg = System.nanoTime(); + | $doAggFuncName(); + | $aggTime.add((System.nanoTime() - $beforeAgg) / 
$NANOS_PER_MILLIS); + |} + |// output the result + |$outputFromFastHashMap + |$outputFromRegularHashMap + """.stripMargin } private def doConsumeWithKeys(ctx: CodegenContext, input: Seq[ExprCode]): String = { @@ -745,8 +854,10 @@ case class HashAggregateExec( val unsafeRowBuffer = ctx.freshName("unsafeRowAggBuffer") val fastRowBuffer = ctx.freshName("fastAggBuffer") - // only have DeclarativeAggregate - val updateExpr = aggregateExpressions.flatMap { e => + // To individually generate code for each aggregate function, an element in `updateExprs` holds + // all the expressions for the buffer of an aggregation function. + val updateExprs = aggregateExpressions.map { e => + // only have DeclarativeAggregate e.mode match { case Partial | Complete => e.aggregateFunction.asInstanceOf[DeclarativeAggregate].updateExpressions @@ -824,25 +935,70 @@ case class HashAggregateExec( // generating input columns, we use `currentVars`. ctx.currentVars = new Array[ExprCode](aggregateBufferAttributes.length) ++ input + val aggNames = aggregateExpressions.map(_.aggregateFunction.prettyName) + // Computes start offsets for each aggregation function code + // in the underlying buffer row. 
+ val bufferStartOffsets = { + val offsets = mutable.ArrayBuffer[Int]() + var curOffset = 0 + updateExprs.foreach { exprsForOneFunc => + offsets += curOffset + curOffset += exprsForOneFunc.length + } + offsets.toArray + } + val updateRowInRegularHashMap: String = { ctx.INPUT_ROW = unsafeRowBuffer - val boundUpdateExpr = bindReferences(updateExpr, inputAttr) - val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) + val boundUpdateExprs = updateExprs.map { updateExprsForOneFunc => + bindReferences(updateExprsForOneFunc, inputAttr) + } + val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExprs.flatten) val effectiveCodes = subExprs.codes.mkString("\n") - val unsafeRowBufferEvals = ctx.withSubExprEliminationExprs(subExprs.states) { - boundUpdateExpr.map(_.genCode(ctx)) + val unsafeRowBufferEvals = boundUpdateExprs.map { boundUpdateExprsForOneFunc => + ctx.withSubExprEliminationExprs(subExprs.states) { + boundUpdateExprsForOneFunc.map(_.genCode(ctx)) + } } - val updateUnsafeRowBuffer = unsafeRowBufferEvals.zipWithIndex.map { case (ev, i) => - val dt = updateExpr(i).dataType - CodeGenerator.updateColumn(unsafeRowBuffer, dt, i, ev, updateExpr(i).nullable) + + val aggCodeBlocks = updateExprs.indices.map { i => + val rowBufferEvalsForOneFunc = unsafeRowBufferEvals(i) + val boundUpdateExprsForOneFunc = boundUpdateExprs(i) + val bufferOffset = bufferStartOffsets(i) + + // All the update code for aggregation buffers should be placed in the end + // of each aggregation function code. 
+ val updateRowBuffers = rowBufferEvalsForOneFunc.zipWithIndex.map { case (ev, j) => + val updateExpr = boundUpdateExprsForOneFunc(j) + val dt = updateExpr.dataType + val nullable = updateExpr.nullable + CodeGenerator.updateColumn(unsafeRowBuffer, dt, bufferOffset + j, ev, nullable) + } + code""" + |// evaluate aggregate function for ${aggNames(i)} + |${evaluateVariables(rowBufferEvalsForOneFunc)} + |// update unsafe row buffer + |${updateRowBuffers.mkString("\n").trim} + """.stripMargin + } + + val codeToEvalAggFunc = if (conf.codegenSplitAggregateFunc && + aggCodeBlocks.map(_.length).sum > conf.methodSplitThreshold) { + val maybeSplitCode = splitAggregateExpressions( + ctx, aggNames, boundUpdateExprs, aggCodeBlocks, subExprs.states) + + maybeSplitCode.getOrElse { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code + } + } else { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code } + s""" |// common sub-expressions |$effectiveCodes - |// evaluate aggregate function - |${evaluateVariables(unsafeRowBufferEvals)} - |// update unsafe row buffer - |${updateUnsafeRowBuffer.mkString("\n").trim} + |// evaluate aggregate functions and update aggregation buffers + |$codeToEvalAggFunc """.stripMargin } @@ -850,16 +1006,48 @@ case class HashAggregateExec( if (isFastHashMapEnabled) { if (isVectorizedHashMapEnabled) { ctx.INPUT_ROW = fastRowBuffer - val boundUpdateExpr = bindReferences(updateExpr, inputAttr) - val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) + val boundUpdateExprs = updateExprs.map { updateExprsForOneFunc => + bindReferences(updateExprsForOneFunc, inputAttr) + } + val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExprs.flatten) val effectiveCodes = subExprs.codes.mkString("\n") - val fastRowEvals = ctx.withSubExprEliminationExprs(subExprs.states) { - boundUpdateExpr.map(_.genCode(ctx)) + val fastRowEvals = boundUpdateExprs.map { boundUpdateExprsForOneFunc => + ctx.withSubExprEliminationExprs(subExprs.states) { + 
boundUpdateExprsForOneFunc.map(_.genCode(ctx)) + } + } + + val aggCodeBlocks = fastRowEvals.zipWithIndex.map { case (fastRowEvalsForOneFunc, i) => + val boundUpdateExprsForOneFunc = boundUpdateExprs(i) + val bufferOffset = bufferStartOffsets(i) + // All the update code for aggregation buffers should be placed in the end + // of each aggregation function code. + val updateRowBuffer = fastRowEvalsForOneFunc.zipWithIndex.map { case (ev, j) => + val updateExpr = boundUpdateExprsForOneFunc(j) + val dt = updateExpr.dataType + val nullable = updateExpr.nullable + CodeGenerator.updateColumn(fastRowBuffer, dt, bufferOffset + j, ev, nullable, + isVectorized = true) + } + code""" + |// evaluate aggregate function for ${aggNames(i)} + |${evaluateVariables(fastRowEvalsForOneFunc)} + |// update fast row + |${updateRowBuffer.mkString("\n").trim} + """.stripMargin } - val updateFastRow = fastRowEvals.zipWithIndex.map { case (ev, i) => - val dt = updateExpr(i).dataType - CodeGenerator.updateColumn( - fastRowBuffer, dt, i, ev, updateExpr(i).nullable, isVectorized = true) + + + val codeToEvalAggFunc = if (conf.codegenSplitAggregateFunc && + aggCodeBlocks.map(_.length).sum > conf.methodSplitThreshold) { + val maybeSplitCode = splitAggregateExpressions( + ctx, aggNames, boundUpdateExprs, aggCodeBlocks, subExprs.states) + + maybeSplitCode.getOrElse { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code + } + } else { + aggCodeBlocks.fold(EmptyBlock)(_ + _).code } // If vectorized fast hash map is on, we first generate code to update row @@ -869,10 +1057,8 @@ case class HashAggregateExec( |if ($fastRowBuffer != null) { | // common sub-expressions | $effectiveCodes - | // evaluate aggregate function - | ${evaluateVariables(fastRowEvals)} - | // update fast row - | ${updateFastRow.mkString("\n").trim} + | // evaluate aggregate functions and update aggregation buffers + | $codeToEvalAggFunc |} else { | $updateRowInRegularHashMap |} @@ -913,14 +1099,11 @@ case class HashAggregateExec( // continue to 
do in-memory aggregation and spilling until all the rows had been processed. // Finally, sort the spilled aggregate buffers by key, and merge them together for same key. s""" - $declareRowBuffer - - $findOrInsertHashMap - - $incCounter - - $updateRowInHashMap - """ + |$declareRowBuffer + |$findOrInsertHashMap + |$incCounter + |$updateRowInHashMap + """.stripMargin } override def verboseString(maxFields: Int): String = toString(verbose = true, maxFields) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala index b88ddba8e48d3..75651500954cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectAggregationIterator.scala @@ -22,7 +22,7 @@ import org.apache.spark.internal.{config, Logging} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.expressions.codegen.{BaseOrdering, GenerateOrdering} +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf @@ -61,9 +61,9 @@ class ObjectAggregationIterator( // Hacking the aggregation mode to call AggregateFunction.merge to merge two aggregation buffers private val mergeAggregationBuffers: (InternalRow, InternalRow) => Unit = { val newExpressions = aggregateExpressions.map { - case agg @ AggregateExpression(_, Partial, _, _) => + case agg @ AggregateExpression(_, Partial, _, _, _) => agg.copy(mode = PartialMerge) - case agg @ AggregateExpression(_, Complete, _, _) => + case agg @ AggregateExpression(_, Complete, _, _, _) => 
agg.copy(mode = Final) case other => other } @@ -158,7 +158,7 @@ class ObjectAggregationIterator( val buffer: InternalRow = getAggregationBufferByKey(hashMap, groupingKey) processRow(buffer, newInput) - // The the hash map gets too large, makes a sorted spill and clear the map. + // The hash map gets too large, makes a sorted spill and clear the map. if (hashMap.size >= fallbackCountThreshold) { logInfo( s"Aggregation hash map size ${hashMap.size} reaches threshold " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala index 151da241144be..3fb58eb2cc8ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ObjectHashAggregateExec.scala @@ -67,7 +67,7 @@ case class ObjectHashAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode { + extends BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -97,7 +97,7 @@ case class ObjectHashAggregateExec( } } - override def outputPartitioning: Partitioning = child.outputPartitioning + override protected def outputExpressions: Seq[NamedExpression] = resultExpressions protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { val numOutputRows = longMetric("numOutputRows") @@ -122,7 +122,7 @@ case class ObjectHashAggregateExec( initialInputBufferOffset, resultExpressions, (expressions, inputSchema) => - newMutableProjection(expressions, inputSchema, subexpressionEliminationEnabled), + MutableProjection.create(expressions, inputSchema), child.output, iter, fallbackCountThreshold, diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala index 56cf78d8b7fc1..44d19ad60d49f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala @@ -127,7 +127,8 @@ class RowBasedHashMapGenerator( case t: DecimalType => s"agg_rowWriter.write(${ordinal}, ${key.name}, ${t.precision}, ${t.scale})" case t: DataType => - if (!t.isInstanceOf[StringType] && !CodeGenerator.isPrimitiveType(t)) { + if (!t.isInstanceOf[StringType] && !t.isInstanceOf[CalendarIntervalType] && + !CodeGenerator.isPrimitiveType(t)) { throw new IllegalArgumentException(s"cannot generate code for unsupported type: $t") } s"agg_rowWriter.write(${ordinal}, ${key.name})" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala index 7ab6ecc08a7bc..77ed469016fa3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortAggregateExec.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.util.truncatedString -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.{AliasAwareOutputPartitioning, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** @@ -38,7 +38,7 @@ case class SortAggregateExec( initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode { + extends 
BaseAggregateExec with AliasAwareOutputPartitioning { private[this] val aggregateBufferAttributes = { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) @@ -66,7 +66,7 @@ case class SortAggregateExec( groupingExpressions.map(SortOrder(_, Ascending)) :: Nil } - override def outputPartitioning: Partitioning = child.outputPartitioning + override protected def outputExpressions: Seq[NamedExpression] = resultExpressions override def outputOrdering: Seq[SortOrder] = { groupingExpressions.map(SortOrder(_, Ascending)) @@ -93,7 +93,7 @@ case class SortAggregateExec( initialInputBufferOffset, resultExpressions, (expressions, inputSchema) => - newMutableProjection(expressions, inputSchema, subexpressionEliminationEnabled), + MutableProjection.create(expressions, inputSchema), numOutputRows) if (!hasInput && groupingExpressions.isEmpty) { // There is no input and there is no grouping expressions. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index 6dc64657ebf1f..99358fbf4e94f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -249,9 +249,9 @@ class TungstenAggregationIterator( // Basically the value of the KVIterator returned by externalSorter // will be just aggregation buffer, so we rewrite the aggregateExpressions to reflect it. 
val newExpressions = aggregateExpressions.map { - case agg @ AggregateExpression(_, Partial, _, _) => + case agg @ AggregateExpression(_, Partial, _, _, _) => agg.copy(mode = PartialMerge) - case agg @ AggregateExpression(_, Complete, _, _) => + case agg @ AggregateExpression(_, Complete, _, _, _) => agg.copy(mode = Final) case other => other } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index 100486fa9850f..dfae5c07e0373 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -17,13 +17,17 @@ package org.apache.spark.sql.execution.aggregate +import scala.reflect.runtime.universe.TypeTag + import org.apache.spark.internal.Logging -import org.apache.spark.sql.Row +import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, _} -import org.apache.spark.sql.catalyst.expressions.aggregate.ImperativeAggregate -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} +import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} +import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateMutableProjection, GenerateSafeProjection} +import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types._ /** @@ -450,3 +454,63 @@ case class ScalaUDAF( override def nodeName: String = 
udaf.getClass.getSimpleName } + +case class ScalaAggregator[IN, BUF, OUT]( + children: Seq[Expression], + agg: Aggregator[IN, BUF, OUT], + inputEncoderNR: ExpressionEncoder[IN], + nullable: Boolean = true, + isDeterministic: Boolean = true, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0) + extends TypedImperativeAggregate[BUF] + with NonSQLExpression + with UserDefinedExpression + with ImplicitCastInputTypes + with Logging { + + private[this] lazy val inputEncoder = inputEncoderNR.resolveAndBind() + private[this] lazy val bufferEncoder = + agg.bufferEncoder.asInstanceOf[ExpressionEncoder[BUF]].resolveAndBind() + private[this] lazy val outputEncoder = agg.outputEncoder.asInstanceOf[ExpressionEncoder[OUT]] + + def dataType: DataType = outputEncoder.objSerializer.dataType + + def inputTypes: Seq[DataType] = inputEncoder.schema.map(_.dataType) + + override lazy val deterministic: Boolean = isDeterministic + + def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ScalaAggregator[IN, BUF, OUT] = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ScalaAggregator[IN, BUF, OUT] = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + private[this] lazy val inputProjection = UnsafeProjection.create(children) + + def createAggregationBuffer(): BUF = agg.zero + + def update(buffer: BUF, input: InternalRow): BUF = + agg.reduce(buffer, inputEncoder.fromRow(inputProjection(input))) + + def merge(buffer: BUF, input: BUF): BUF = agg.merge(buffer, input) + + def eval(buffer: BUF): Any = { + val row = outputEncoder.toRow(agg.finish(buffer)) + if (outputEncoder.isSerializedAsStruct) row else row.get(0, dataType) + } + + private[this] lazy val bufferRow = new UnsafeRow(bufferEncoder.namedExpressions.length) + + def serialize(agg: BUF): Array[Byte] = + bufferEncoder.toRow(agg).asInstanceOf[UnsafeRow].getBytes() + + def deserialize(storageFormat: Array[Byte]): BUF = { + 
bufferRow.pointTo(storageFormat, storageFormat.length) + bufferEncoder.fromRow(bufferRow) + } + + override def toString: String = s"""${nodeName}(${children.mkString(",")})""" + + override def nodeName: String = agg.getClass.getSimpleName +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala index 5c3c735f0346c..614d6c2846bfa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/analysis/DetectAmbiguousSelfJoin.scala @@ -71,7 +71,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] { } override def apply(plan: LogicalPlan): LogicalPlan = { - if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN)) return plan + if (!conf.getConf(SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED)) return plan // We always remove the special metadata from `AttributeReference` at the end of this rule, so // Dataset column reference only exists in the root node via Dataset transformations like @@ -149,7 +149,7 @@ class DetectAmbiguousSelfJoin(conf: SQLConf) extends Rule[LogicalPlan] { "to figure out which one. Please alias the Datasets with different names via " + "`Dataset.as` before joining them, and specify the column using qualified name, e.g. " + """`df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. 
You can also set """ + - s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key} to false to disable this check.") + s"${SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key} to false to disable this check.") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index 1a6f4acb63521..d1076d9d0156c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -26,7 +26,7 @@ import org.apache.arrow.flatbuf.MessageHeader import org.apache.arrow.memory.BufferAllocator import org.apache.arrow.vector._ import org.apache.arrow.vector.ipc.{ArrowStreamWriter, ReadChannel, WriteChannel} -import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer} +import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, IpcOption, MessageSerializer} import org.apache.spark.TaskContext import org.apache.spark.api.java.JavaRDD @@ -64,7 +64,7 @@ private[sql] class ArrowBatchStreamWriter( * End the Arrow stream, does not close output stream. 
*/ def end(): Unit = { - ArrowStreamWriter.writeEndOfStream(writeChannel) + ArrowStreamWriter.writeEndOfStream(writeChannel, new IpcOption) } } @@ -251,8 +251,8 @@ private[sql] object ArrowConverters { // Only care about RecordBatch messages, skip Schema and unsupported Dictionary messages if (msgMetadata.getMessage.headerType() == MessageHeader.RecordBatch) { - // Buffer backed output large enough to hold the complete serialized message - val bbout = new ByteBufferOutputStream(4 + msgMetadata.getMessageLength + bodyLength) + // Buffer backed output large enough to hold 8-byte length + complete serialized message + val bbout = new ByteBufferOutputStream(8 + msgMetadata.getMessageLength + bodyLength) // Write message metadata to ByteBuffer output stream MessageSerializer.writeMessageBuffer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index 6147d6fefd52a..501e1c460f9c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -62,6 +62,11 @@ object ArrowWriter { case (ArrayType(_, _), vector: ListVector) => val elementVector = createFieldWriter(vector.getDataVector()) new ArrayWriter(vector, elementVector) + case (MapType(_, _, _), vector: MapVector) => + val entryWriter = createFieldWriter(vector.getDataVector).asInstanceOf[StructWriter] + val keyWriter = createFieldWriter(entryWriter.valueVector.getChild(MapVector.KEY_NAME)) + val valueWriter = createFieldWriter(entryWriter.valueVector.getChild(MapVector.VALUE_NAME)) + new MapWriter(vector, keyWriter, valueWriter) case (StructType(_), vector: StructVector) => val children = (0 until vector.size()).map { ordinal => createFieldWriter(vector.getChildByOrdinal(ordinal)) @@ -343,3 +348,38 @@ private[arrow] class StructWriter( children.foreach(_.reset()) } } + 
+private[arrow] class MapWriter( + val valueVector: MapVector, + val keyWriter: ArrowFieldWriter, + val valueWriter: ArrowFieldWriter) extends ArrowFieldWriter { + + override def setNull(): Unit = {} + + override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { + val map = input.getMap(ordinal) + valueVector.startNewValue(count) + val keys = map.keyArray() + val values = map.valueArray() + var i = 0 + while (i < map.numElements()) { + keyWriter.write(keys, i) + valueWriter.write(values, i) + i += 1 + } + + valueVector.endValue(count, map.numElements()) + } + + override def finish(): Unit = { + super.finish() + keyWriter.finish() + valueWriter.finish() + } + + override def reset(): Unit = { + super.reset() + keyWriter.reset() + valueWriter.reset() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index b072a7f5d914c..c35c48496e1c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution import java.util.concurrent.TimeUnit._ +import scala.collection.mutable import scala.concurrent.{ExecutionContext, Future} import scala.concurrent.duration.Duration @@ -28,16 +29,16 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.types.{LongType, StructType} -import org.apache.spark.util.ThreadUtils +import 
org.apache.spark.util.{ThreadUtils, Utils} import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} /** Physical plan for Project. */ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) - extends UnaryExecNode with CodegenSupport { + extends UnaryExecNode with CodegenSupport with AliasAwareOutputPartitioning { override def output: Seq[Attribute] = projectList.map(_.toAttribute) @@ -80,7 +81,7 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) override def outputOrdering: Seq[SortOrder] = child.outputOrdering - override def outputPartitioning: Partitioning = child.outputPartitioning + override protected def outputExpressions: Seq[NamedExpression] = projectList override def verboseStringWithOperatorId(): String = { s""" @@ -91,7 +92,6 @@ case class ProjectExec(projectList: Seq[NamedExpression], child: SparkPlan) } } - /** Physical plan for Filter. */ case class FilterExec(condition: Expression, child: SparkPlan) extends UnaryExecNode with CodegenSupport with PredicateHelper { @@ -171,6 +171,7 @@ case class FilterExec(condition: Expression, child: SparkPlan) // This is very perf sensitive. // TODO: revisit this. We can consider reordering predicates as well. val generatedIsNotNullChecks = new Array[Boolean](notNullPreds.length) + val extraIsNotNullAttrs = mutable.Set[Attribute]() val generated = otherPreds.map { c => val nullChecks = c.references.map { r => val idx = notNullPreds.indexWhere { n => n.asInstanceOf[IsNotNull].child.semanticEquals(r)} @@ -178,6 +179,9 @@ case class FilterExec(condition: Expression, child: SparkPlan) generatedIsNotNullChecks(idx) = true // Use the child's output. The nullability is what the child produced. 
genPredicate(notNullPreds(idx), input, child.output) + } else if (notNullAttributes.contains(r.exprId) && !extraIsNotNullAttrs.contains(r)) { + extraIsNotNullAttrs += r + genPredicate(IsNotNull(r), input, child.output) } else { "" } @@ -222,7 +226,7 @@ case class FilterExec(condition: Expression, child: SparkPlan) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") child.execute().mapPartitionsWithIndexInternal { (index, iter) => - val predicate = newPredicate(condition, child.output) + val predicate = Predicate.create(condition, child.output) predicate.initialize(0) iter.filter { row => val r = predicate.eval(row) @@ -294,7 +298,9 @@ case class SampleExec( child.asInstanceOf[CodegenSupport].produce(ctx, this) } - override def needCopyResult: Boolean = withReplacement + override def needCopyResult: Boolean = { + child.asInstanceOf[CodegenSupport].needCopyResult || withReplacement + } override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { val numOutput = metricTerm(ctx, "numOutputRows") @@ -743,7 +749,9 @@ case class SubqueryExec(name: String, child: SparkPlan) private lazy val relationFuture: Future[Array[InternalRow]] = { // relationFuture is used in "doExecute". Therefore we can get the execution id correctly here. val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) - Future { + SQLExecution.withThreadLocalCaptured[Array[InternalRow]]( + sqlContext.sparkSession, + SubqueryExec.executionContext) { // This will run in another thread. Set the execution id so that we can connect these jobs // with the correct execution. 
SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) { @@ -758,7 +766,7 @@ case class SubqueryExec(name: String, child: SparkPlan) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toSeq) rows } - }(SubqueryExec.executionContext) + } } protected override def doCanonicalize(): SparkPlan = { @@ -782,7 +790,8 @@ case class SubqueryExec(name: String, child: SparkPlan) object SubqueryExec { private[execution] val executionContext = ExecutionContext.fromExecutorService( - ThreadUtils.newDaemonCachedThreadPool("subquery", 16)) + ThreadUtils.newDaemonCachedThreadPool("subquery", + SQLConf.get.getConf(StaticSQLConf.SUBQUERY_MAX_THREAD_THRESHOLD))) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala index 85c36b7da9498..06f411dec158d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnAccessor.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.expressions.{UnsafeArrayData, UnsafeMapData import org.apache.spark.sql.execution.columnar.compression.CompressibleColumnAccessor import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval /** * An `Iterator` like trait used to extract values from columnar byte buffer. 
When a value is @@ -36,7 +37,7 @@ import org.apache.spark.sql.types._ private[columnar] trait ColumnAccessor { initialize() - protected def initialize() + protected def initialize(): Unit def hasNext: Boolean @@ -50,7 +51,7 @@ private[columnar] abstract class BasicColumnAccessor[JvmType]( protected val columnType: ColumnType[JvmType]) extends ColumnAccessor { - protected def initialize() {} + protected def initialize(): Unit = {} override def hasNext: Boolean = buffer.hasRemaining @@ -104,6 +105,10 @@ private[columnar] class BinaryColumnAccessor(buffer: ByteBuffer) extends BasicColumnAccessor[Array[Byte]](buffer, BINARY) with NullableColumnAccessor +private[columnar] class IntervalColumnAccessor(buffer: ByteBuffer, dataType: CalendarIntervalType) + extends BasicColumnAccessor[CalendarInterval](buffer, CALENDAR_INTERVAL) + with NullableColumnAccessor + private[columnar] class CompactDecimalColumnAccessor(buffer: ByteBuffer, dataType: DecimalType) extends NativeColumnAccessor(buffer, COMPACT_DECIMAL(dataType)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala index d30655e0c4a20..3d94681a2fb31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnBuilder.scala @@ -125,6 +125,9 @@ class StringColumnBuilder extends NativeColumnBuilder(new StringColumnStats, STR private[columnar] class BinaryColumnBuilder extends ComplexColumnBuilder(new BinaryColumnStats, BINARY) +private[columnar] +class IntervalColumnBuilder extends ComplexColumnBuilder(new IntervalColumnStats, CALENDAR_INTERVAL) + private[columnar] class CompactDecimalColumnBuilder(dataType: DecimalType) extends NativeColumnBuilder(new DecimalColumnStats(dataType), COMPACT_DECIMAL(dataType)) @@ -176,6 +179,7 @@ private[columnar] object ColumnBuilder { case DoubleType 
=> new DoubleColumnBuilder case StringType => new StringColumnBuilder case BinaryType => new BinaryColumnBuilder + case CalendarIntervalType => new IntervalColumnBuilder case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => new CompactDecimalColumnBuilder(dt) case dt: DecimalType => new DecimalColumnBuilder(dt) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala index bc7e73ae1ba87..20ecc57c49e75 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference} import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} private[columnar] class ColumnStatisticsSchema(a: Attribute) extends Serializable { val upperBound = AttributeReference(a.name + ".upperBound", a.dataType, nullable = true)() @@ -295,6 +295,20 @@ private[columnar] final class BinaryColumnStats extends ColumnStats { Array[Any](null, null, nullCount, count, sizeInBytes) } +private[columnar] final class IntervalColumnStats extends ColumnStats { + override def gatherStats(row: InternalRow, ordinal: Int): Unit = { + if (!row.isNullAt(ordinal)) { + sizeInBytes += CALENDAR_INTERVAL.actualSize(row, ordinal) + count += 1 + } else { + gatherNullStats + } + } + + override def collectedStatistics: Array[Any] = + Array[Any](null, null, nullCount, count, sizeInBytes) +} + private[columnar] final class DecimalColumnStats(precision: Int, scale: Int) extends ColumnStats { def this(dt: DecimalType) = this(dt.precision, dt.scale) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala index 542a10fc175c0..d3c8e9251cefd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnType.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.Platform -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} /** @@ -221,7 +221,8 @@ private[columnar] object INT extends NativeColumnType(IntegerType, 4) { override def getField(row: InternalRow, ordinal: Int): Int = row.getInt(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setInt(toOrdinal, from.getInt(fromOrdinal)) } } @@ -249,7 +250,8 @@ private[columnar] object LONG extends NativeColumnType(LongType, 8) { override def getField(row: InternalRow, ordinal: Int): Long = row.getLong(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setLong(toOrdinal, from.getLong(fromOrdinal)) } } @@ -277,7 +279,8 @@ private[columnar] object FLOAT extends NativeColumnType(FloatType, 4) { override def getField(row: InternalRow, ordinal: Int): Float = row.getFloat(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setFloat(toOrdinal, from.getFloat(fromOrdinal)) } } @@ 
-305,7 +308,8 @@ private[columnar] object DOUBLE extends NativeColumnType(DoubleType, 8) { override def getField(row: InternalRow, ordinal: Int): Double = row.getDouble(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setDouble(toOrdinal, from.getDouble(fromOrdinal)) } } @@ -331,7 +335,8 @@ private[columnar] object BOOLEAN extends NativeColumnType(BooleanType, 1) { override def getField(row: InternalRow, ordinal: Int): Boolean = row.getBoolean(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setBoolean(toOrdinal, from.getBoolean(fromOrdinal)) } } @@ -359,7 +364,8 @@ private[columnar] object BYTE extends NativeColumnType(ByteType, 1) { override def getField(row: InternalRow, ordinal: Int): Byte = row.getByte(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setByte(toOrdinal, from.getByte(fromOrdinal)) } } @@ -387,7 +393,8 @@ private[columnar] object SHORT extends NativeColumnType(ShortType, 2) { override def getField(row: InternalRow, ordinal: Int): Short = row.getShort(ordinal) - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { to.setShort(toOrdinal, from.getShort(fromOrdinal)) } } @@ -452,7 +459,8 @@ private[columnar] object STRING row.getUTF8String(ordinal) } - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: 
InternalRow, toOrdinal: Int): Unit = { setField(to, toOrdinal, getField(from, fromOrdinal)) } @@ -496,7 +504,8 @@ private[columnar] case class COMPACT_DECIMAL(precision: Int, scale: Int) row.setDecimal(ordinal, value, precision) } - override def copyField(from: InternalRow, fromOrdinal: Int, to: InternalRow, toOrdinal: Int) { + override def copyField(from: InternalRow, fromOrdinal: Int, + to: InternalRow, toOrdinal: Int): Unit = { setField(to, toOrdinal, getField(from, fromOrdinal)) } } @@ -696,6 +705,44 @@ private[columnar] case class MAP(dataType: MapType) override def clone(v: UnsafeMapData): UnsafeMapData = v.copy() } +private[columnar] object CALENDAR_INTERVAL extends ColumnType[CalendarInterval] { + + override def dataType: DataType = CalendarIntervalType + + override def defaultSize: Int = 16 + + override def getField(row: InternalRow, ordinal: Int): CalendarInterval = row.getInterval(ordinal) + + override def setField(row: InternalRow, ordinal: Int, value: CalendarInterval): Unit = { + row.setInterval(ordinal, value) + } + + override def extract(buffer: ByteBuffer): CalendarInterval = { + val months = ByteBufferHelper.getInt(buffer) + val days = ByteBufferHelper.getInt(buffer) + val microseconds = ByteBufferHelper.getLong(buffer) + new CalendarInterval(months, days, microseconds) + } + + // copy the bytes from ByteBuffer to UnsafeRow + override def extract(buffer: ByteBuffer, row: InternalRow, ordinal: Int): Unit = { + if (row.isInstanceOf[MutableUnsafeRow]) { + val cursor = buffer.position() + buffer.position(cursor + defaultSize) + row.asInstanceOf[MutableUnsafeRow].writer.write(ordinal, buffer.array(), + buffer.arrayOffset() + cursor, defaultSize) + } else { + setField(row, ordinal, extract(buffer)) + } + } + + override def append(v: CalendarInterval, buffer: ByteBuffer): Unit = { + ByteBufferHelper.putInt(buffer, v.months) + ByteBufferHelper.putInt(buffer, v.days) + ByteBufferHelper.putLong(buffer, v.microseconds) + } +} + private[columnar] object 
ColumnType { @tailrec def apply(dataType: DataType): ColumnType[_] = { @@ -710,6 +757,7 @@ private[columnar] object ColumnType { case DoubleType => DOUBLE case StringType => STRING case BinaryType => BINARY + case i: CalendarIntervalType => CALENDAR_INTERVAL case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => COMPACT_DECIMAL(dt) case dt: DecimalType => LARGE_DECIMAL(dt) case arr: ArrayType => ARRAY(arr) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index 2d699e8a9d088..bd2d06665a910 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeFormatter, CodeGenerator, UnsafeRowWriter} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval /** * An Iterator to walk through the InternalRows from a CachedBatch @@ -51,6 +52,10 @@ class MutableUnsafeRow(val writer: UnsafeRowWriter) extends BaseGenericInternalR // the writer will be used directly to avoid creating wrapper objects override def setDecimal(i: Int, v: Decimal, precision: Int): Unit = throw new UnsupportedOperationException + + override def setInterval(i: Int, value: CalendarInterval): Unit = + throw new UnsupportedOperationException + override def update(i: Int, v: Any): Unit = throw new UnsupportedOperationException // all other methods inherited from GenericMutableRow are not need @@ -81,6 +86,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera case DoubleType => classOf[DoubleColumnAccessor].getName case StringType => 
classOf[StringColumnAccessor].getName case BinaryType => classOf[BinaryColumnAccessor].getName + case CalendarIntervalType => classOf[IntervalColumnAccessor].getName case dt: DecimalType if dt.precision <= Decimal.MAX_LONG_DIGITS => classOf[CompactDecimalColumnAccessor].getName case dt: DecimalType => classOf[DecimalColumnAccessor].getName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 8d13cfb93d270..f03c2586048bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -310,7 +310,7 @@ case class InMemoryTableScanExec( val buffers = relation.cacheBuilder.cachedColumnBuffers buffers.mapPartitionsWithIndexInternal { (index, cachedBatchIterator) => - val partitionFilter = newPredicate( + val partitionFilter = Predicate.create( partitionFilters.reduceOption(And).getOrElse(Literal(true)), schema) partitionFilter.initialize(index) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala index 18fefa0a6f19f..33b29bde93ee5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzePartitionCommand.scala @@ -106,11 +106,12 @@ case class AnalyzePartitionCommand( // Update the metastore if newly computed statistics are different from those // recorded in the metastore. 
- val newPartitions = partitions.flatMap { p => - val newTotalSize = CommandUtils.calculateLocationSize( - sessionState, tableMeta.identifier, p.storage.locationUri) + + val sizes = CommandUtils.calculateMultipleLocationSizes(sparkSession, tableMeta.identifier, + partitions.map(_.storage.locationUri)) + val newPartitions = partitions.zipWithIndex.flatMap { case (p, idx) => val newRowCount = rowCounts.get(p.spec) - val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount) + val newStats = CommandUtils.compareAndGetNewStats(p.stats, sizes(idx), newRowCount) newStats.map(_ => p.copy(stats = newStats)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index b644e6dc471d6..81157ca0bfe9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -31,12 +31,23 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatisti import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.{DataSourceUtils, InMemoryFileIndex} import org.apache.spark.sql.internal.{SessionState, SQLConf} import org.apache.spark.sql.types._ +/** + * For the purpose of calculating total directory sizes, use this filter to + * ignore some irrelevant files. 
+ * @param stagingDir hive staging dir + */ +class PathFilterIgnoreNonData(stagingDir: String) extends PathFilter with Serializable { + override def accept(path: Path): Boolean = { + val fileName = path.getName + !fileName.startsWith(stagingDir) && DataSourceUtils.isDataFile(fileName) + } +} object CommandUtils extends Logging { @@ -50,34 +61,31 @@ object CommandUtils extends Logging { catalog.alterTableStats(table.identifier, Some(newStats)) } else if (table.stats.nonEmpty) { catalog.alterTableStats(table.identifier, None) + } else { + // In other cases, we still need to invalidate the table relation cache. + catalog.refreshTable(table.identifier) } } def calculateTotalSize(spark: SparkSession, catalogTable: CatalogTable): BigInt = { val sessionState = spark.sessionState - if (catalogTable.partitionColumnNames.isEmpty) { - calculateLocationSize(sessionState, catalogTable.identifier, catalogTable.storage.locationUri) + val startTime = System.nanoTime() + val totalSize = if (catalogTable.partitionColumnNames.isEmpty) { + calculateSingleLocationSize(sessionState, catalogTable.identifier, + catalogTable.storage.locationUri) } else { // Calculate table size as a sum of the visible partitions. 
See SPARK-21079 val partitions = sessionState.catalog.listPartitions(catalogTable.identifier) - if (spark.sessionState.conf.parallelFileListingInStatsComputation) { - val paths = partitions.map(x => new Path(x.storage.locationUri.get)) - val stagingDir = sessionState.conf.getConfString("hive.exec.stagingdir", ".hive-staging") - val pathFilter = new PathFilter with Serializable { - override def accept(path: Path): Boolean = isDataPath(path, stagingDir) - } - val fileStatusSeq = InMemoryFileIndex.bulkListLeafFiles( - paths, sessionState.newHadoopConf(), pathFilter, spark, areRootPaths = true) - fileStatusSeq.flatMap(_._2.map(_.getLen)).sum - } else { - partitions.map { p => - calculateLocationSize(sessionState, catalogTable.identifier, p.storage.locationUri) - }.sum - } + logInfo(s"Starting to calculate sizes for ${partitions.length} partitions.") + val paths = partitions.map(_.storage.locationUri) + calculateMultipleLocationSizes(spark, catalogTable.identifier, paths).sum } + logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to calculate" + + s" the total size for table ${catalogTable.identifier}.") + totalSize } - def calculateLocationSize( + def calculateSingleLocationSize( sessionState: SessionState, identifier: TableIdentifier, locationUri: Option[URI]): Long = { @@ -110,7 +118,6 @@ object CommandUtils extends Logging { } val startTime = System.nanoTime() - logInfo(s"Starting to calculate the total file size under path $locationUri.") val size = locationUri.map { p => val path = new Path(p) try { @@ -125,11 +132,44 @@ object CommandUtils extends Logging { } }.getOrElse(0L) val durationInMs = (System.nanoTime() - startTime) / (1000 * 1000) - logInfo(s"It took $durationInMs ms to calculate the total file size under path $locationUri.") + logDebug(s"It took $durationInMs ms to calculate the total file size under path $locationUri.") size } + def calculateMultipleLocationSizes( + sparkSession: SparkSession, + tid: TableIdentifier, + paths: 
Seq[Option[URI]]): Seq[Long] = { + if (sparkSession.sessionState.conf.parallelFileListingInStatsComputation) { + calculateMultipleLocationSizesInParallel(sparkSession, paths.map(_.map(new Path(_)))) + } else { + paths.map(p => calculateSingleLocationSize(sparkSession.sessionState, tid, p)) + } + } + + /** + * Launch a Job to list all leaf files in `paths` and compute the total size + * for each path. + * @param sparkSession the [[SparkSession]] + * @param paths the Seq of [[Option[Path]]]s + * @return a Seq of same size as `paths` where i-th element is total size of `paths(i)` or 0 + * if `paths(i)` is None + */ + def calculateMultipleLocationSizesInParallel( + sparkSession: SparkSession, + paths: Seq[Option[Path]]): Seq[Long] = { + val stagingDir = sparkSession.sessionState.conf + .getConfString("hive.exec.stagingdir", ".hive-staging") + val filter = new PathFilterIgnoreNonData(stagingDir) + val sizes = InMemoryFileIndex.bulkListLeafFiles(paths.flatten, + sparkSession.sessionState.newHadoopConf(), filter, sparkSession, areRootPaths = true).map { + case (_, files) => files.map(_.getLen).sum + } + // the size is 0 where paths(i) is not defined and sizes(i) where it is defined + paths.zipWithIndex.map { case (p, idx) => p.map(_ => sizes(idx)).getOrElse(0L) } + } + def compareAndGetNewStats( oldStats: Option[CatalogStatistics], newTotalSize: BigInt, @@ -214,7 +254,9 @@ object CommandUtils extends Logging { val namedExprs = attrsToGenHistogram.map { attr => val aggFunc = - new ApproximatePercentile(attr, Literal(percentiles), Literal(conf.percentileAccuracy)) + new ApproximatePercentile(attr, + Literal(new GenericArrayData(percentiles), ArrayType(DoubleType, false)), + Literal(conf.percentileAccuracy)) val expr = aggFunc.toAggregateExpression() Alias(expr, expr.toString)() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala index 
f7d4fa4c4ffcb..18fd2a5ac2330 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.command -import java.util.UUID +import scala.collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} @@ -26,12 +26,12 @@ import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} -import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlan, UnaryExecNode} -import org.apache.spark.sql.execution.debug._ +import org.apache.spark.sql.connector.ExternalCommandRunner +import org.apache.spark.sql.execution.{ExplainMode, LeafExecNode, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.metric.SQLMetric -import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata} -import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * A logical command that is executed for its side-effects. 
`RunnableCommand`s are @@ -82,6 +82,10 @@ case class ExecutedCommandExec(cmd: RunnableCommand) extends LeafExecNode { override def executeTake(limit: Int): Array[InternalRow] = sideEffectResult.take(limit).toArray + override def executeTail(limit: Int): Array[InternalRow] = { + sideEffectResult.takeRight(limit).toArray + } + protected override def doExecute(): RDD[InternalRow] = { sqlContext.sparkContext.parallelize(sideEffectResult, 1) } @@ -119,6 +123,10 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan) override def executeTake(limit: Int): Array[InternalRow] = sideEffectResult.take(limit).toArray + override def executeTail(limit: Int): Array[InternalRow] = { + sideEffectResult.takeRight(limit).toArray + } + protected override def doExecute(): RDD[InternalRow] = { sqlContext.sparkContext.parallelize(sideEffectResult, 1) } @@ -131,20 +139,15 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan) * (but do NOT actually execute it). * * {{{ - * EXPLAIN (EXTENDED | CODEGEN) SELECT * FROM ... + * EXPLAIN (EXTENDED | CODEGEN | COST | FORMATTED) SELECT * FROM ... * }}} * * @param logicalPlan plan to explain - * @param extended whether to do extended explain or not - * @param codegen whether to output generated code from whole-stage codegen or not - * @param cost whether to show cost information for operators. + * @param mode explain mode */ case class ExplainCommand( logicalPlan: LogicalPlan, - extended: Boolean = false, - codegen: Boolean = false, - cost: Boolean = false, - formatted: Boolean = false) + mode: ExplainMode) extends RunnableCommand { override val output: Seq[Attribute] = @@ -152,44 +155,13 @@ case class ExplainCommand( // Run through the optimizer to generate the physical plan. 
override def run(sparkSession: SparkSession): Seq[Row] = try { - val queryExecution = ExplainCommandUtil.explainedQueryExecution(sparkSession, logicalPlan, - sparkSession.sessionState.executePlan(logicalPlan)) - val outputString = - if (codegen) { - codegenString(queryExecution.executedPlan) - } else if (extended) { - queryExecution.toString - } else if (cost) { - queryExecution.stringWithStats - } else if (formatted) { - queryExecution.simpleString(formatted = true) - } else { - queryExecution.simpleString - } + val outputString = sparkSession.sessionState.executePlan(logicalPlan).explainString(mode) Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } } -object ExplainCommandUtil { - // Returns `QueryExecution` which is used to explain a logical plan. - def explainedQueryExecution( - sparkSession: SparkSession, - logicalPlan: LogicalPlan, - queryExecution: => QueryExecution): QueryExecution = { - if (logicalPlan.isStreaming) { - // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the - // output mode does not matter since there is no `Sink`. - new IncrementalExecution( - sparkSession, logicalPlan, OutputMode.Append(), "", - UUID.randomUUID, UUID.randomUUID, 0, OffsetSeqMetadata(0, 0)) - } else { - queryExecution - } - } -} - /** An explain command for users to see how a streaming batch is executed. */ case class StreamingExplainCommand( queryExecution: IncrementalExecution, @@ -211,3 +183,21 @@ case class StreamingExplainCommand( ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } } + +/** + * Used to execute an arbitrary string command inside an external execution engine + * rather than Spark. Please check [[ExternalCommandRunner]] for more details. 
+ */ +case class ExternalCommandExecutor( + runner: ExternalCommandRunner, + command: String, + options: Map[String, String]) extends RunnableCommand { + + override def output: Seq[Attribute] = + Seq(AttributeReference("command_output", StringType)()) + + override def run(sparkSession: SparkSession): Seq[Row] = { + val output = runner.executeCommand(command, new CaseInsensitiveStringMap(options.asJava)) + output.map(Row(_)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/databases.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/databases.scala deleted file mode 100644 index 470c736da98b7..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/databases.scala +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.command - -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} -import org.apache.spark.sql.types.StringType - - -/** - * A command for users to list the databases/schemas. - * If a databasePattern is supplied then the databases that only match the - * pattern would be listed. 
- * The syntax of using this command in SQL is: - * {{{ - * SHOW (DATABASES|SCHEMAS) [LIKE 'identifier_with_wildcards']; - * }}} - */ -case class ShowDatabasesCommand(databasePattern: Option[String]) extends RunnableCommand { - - // The result of SHOW DATABASES has one column called 'databaseName' - override val output: Seq[Attribute] = { - AttributeReference("databaseName", StringType, nullable = false)() :: Nil - } - - override def run(sparkSession: SparkSession): Seq[Row] = { - val catalog = sparkSession.sessionState.catalog - val databases = - databasePattern.map(catalog.listDatabases).getOrElse(catalog.listDatabases()) - databases.map { d => Row(d) } - } -} - - -/** - * Command for setting the current database. - * {{{ - * USE database_name; - * }}} - */ -case class SetDatabaseCommand(databaseName: String) extends RunnableCommand { - - override def run(sparkSession: SparkSession): Seq[Row] = { - sparkSession.sessionState.catalog.setCurrentDatabase(databaseName) - Seq.empty[Row] - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index ee5d37cebf2f3..47b213fc2d83b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -22,6 +22,7 @@ import java.util.concurrent.TimeUnit._ import scala.collection.{GenMap, GenSeq} import scala.collection.parallel.ForkJoinTaskSupport +import scala.collection.parallel.immutable.ParVector import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration @@ -36,10 +37,12 @@ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import 
org.apache.spark.sql.connector.catalog.{CatalogV2Util, TableCatalog} +import org.apache.spark.sql.connector.catalog.SupportsNamespaces._ import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitioningUtils} import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter -import org.apache.spark.sql.internal.HiveSerDe +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types._ import org.apache.spark.util.{SerializableConfiguration, ThreadUtils} @@ -132,6 +135,27 @@ case class AlterDatabasePropertiesCommand( } } +/** + * A command for users to set new location path for a database + * If the database does not exist, an error message will be issued to indicate the database + * does not exist. + * The syntax of using this command in SQL is: + * {{{ + * ALTER (DATABASE|SCHEMA) database_name SET LOCATION path + * }}} + */ +case class AlterDatabaseSetLocationCommand(databaseName: String, location: String) + extends RunnableCommand { + + override def run(sparkSession: SparkSession): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + val oldDb = catalog.getDatabaseMetadata(databaseName) + catalog.alterDatabase(oldDb.copy(locationUri = CatalogUtils.stringToURI(location))) + + Seq.empty[Row] + } +} + /** * A command for users to show the name of the database, its comment (if one has been set), and its * root location on the filesystem. 
When extended is true, it also shows the database's properties @@ -150,19 +174,22 @@ case class DescribeDatabaseCommand( override def run(sparkSession: SparkSession): Seq[Row] = { val dbMetadata: CatalogDatabase = sparkSession.sessionState.catalog.getDatabaseMetadata(databaseName) + val allDbProperties = dbMetadata.properties val result = Row("Database Name", dbMetadata.name) :: - Row("Description", dbMetadata.description) :: - Row("Location", CatalogUtils.URIToString(dbMetadata.locationUri)) :: Nil + Row("Comment", dbMetadata.description) :: + Row("Location", CatalogUtils.URIToString(dbMetadata.locationUri)):: + Row("Owner", allDbProperties.getOrElse(PROP_OWNER, "")) :: Nil if (extended) { - val properties = - if (dbMetadata.properties.isEmpty) { + val properties = allDbProperties -- CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES + val propertiesStr = + if (properties.isEmpty) { "" } else { - dbMetadata.properties.toSeq.mkString("(", ", ", ")") + properties.toSeq.mkString("(", ", ", ")") } - result :+ Row("Properties", properties) + result :+ Row("Properties", propertiesStr) } else { result } @@ -249,7 +276,7 @@ case class AlterTableSetPropertiesCommand( // direct property. 
val newTable = table.copy( properties = table.properties ++ properties, - comment = properties.get("comment").orElse(table.comment)) + comment = properties.get(TableCatalog.PROP_COMMENT).orElse(table.comment)) catalog.alterTable(newTable) Seq.empty[Row] } @@ -278,14 +305,14 @@ case class AlterTableUnsetPropertiesCommand( DDLUtils.verifyAlterTableType(catalog, table, isView) if (!ifExists) { propKeys.foreach { k => - if (!table.properties.contains(k) && k != "comment") { + if (!table.properties.contains(k) && k != TableCatalog.PROP_COMMENT) { throw new AnalysisException( s"Attempted to unset non-existent property '$k' in table '${table.identifier}'") } } } // If comment is in the table property, we reset it to None - val tableComment = if (propKeys.contains("comment")) None else table.comment + val tableComment = if (propKeys.contains(TableCatalog.PROP_COMMENT)) None else table.comment val newProperties = table.properties.filter { case (k, _) => !propKeys.contains(k) } val newTable = table.copy(properties = newProperties, comment = tableComment) catalog.alterTable(newTable) @@ -448,14 +475,19 @@ case class AlterTableAddPartitionCommand( CatalogTablePartition(normalizedSpec, table.storage.copy( locationUri = location.map(CatalogUtils.stringToURI))) } - catalog.createPartitions(table.identifier, parts, ignoreIfExists = ifNotExists) + + // Hive metastore may not have enough memory to handle millions of partitions in single RPC. + // Also the request to metastore times out when adding lot of partitions in one shot. 
+ // we should split them into smaller batches + val batchSize = conf.getConf(SQLConf.ADD_PARTITION_BATCH_SIZE) + parts.toIterator.grouped(batchSize).foreach { batch => + catalog.createPartitions(table.identifier, batch, ignoreIfExists = ifNotExists) + } if (table.stats.nonEmpty) { if (sparkSession.sessionState.conf.autoSizeUpdateEnabled) { - val addedSize = parts.map { part => - CommandUtils.calculateLocationSize(sparkSession.sessionState, table.identifier, - part.storage.locationUri) - }.sum + val addedSize = CommandUtils.calculateMultipleLocationSizes(sparkSession, table.identifier, + parts.map(_.storage.locationUri)).sum if (addedSize > 0) { val newStats = CatalogStatistics(sizeInBytes = table.stats.get.sizeInBytes + addedSize) catalog.alterTableStats(table.identifier, Some(newStats)) @@ -663,7 +695,7 @@ case class AlterTableRecoverPartitionsCommand( val statusPar: GenSeq[FileStatus] = if (partitionNames.length > 1 && statuses.length > threshold || partitionNames.length > 2) { // parallelize the list of partitions here, then we can have better parallelism later. 
- val parArray = statuses.par + val parArray = new ParVector(statuses.toVector) parArray.tasksupport = evalTaskSupport parArray } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala index d3b2491cd7056..6fdc7f4a58195 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/functions.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchFunctionException} import org.apache.spark.sql.catalyst.catalog.{CatalogFunction, FunctionResource} import org.apache.spark.sql.catalyst.expressions.{Attribute, ExpressionInfo} +import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.types.{StringType, StructField, StructType} @@ -222,6 +223,21 @@ case class ShowFunctionsCommand( case (f, "USER") if showUserFunctions => f.unquotedString case (f, "SYSTEM") if showSystemFunctions => f.unquotedString } - functionNames.sorted.map(Row(_)) + // Hard code "<>", "!=", "between", and "case" for now as there is no corresponding functions. + // "<>", "!=", "between", and "case" is SystemFunctions, only show when showSystemFunctions=true + if (showSystemFunctions) { + (functionNames ++ + StringUtils.filterPattern(FunctionsCommand.virtualOperators, pattern.getOrElse("*"))) + .sorted.map(Row(_)) + } else { + functionNames.sorted.map(Row(_)) + } + } } + +object FunctionsCommand { + // operators that do not have corresponding functions. 
+ // They should be handled `DescribeFunctionCommand`, `ShowFunctionsCommand` + val virtualOperators = Seq("!=", "<>", "between", "case") +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala index 8fee02a8f6c82..1119e5cb1d288 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/resources.scala @@ -47,7 +47,8 @@ case class AddJarCommand(path: String) extends RunnableCommand { */ case class AddFileCommand(path: String) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { - sparkSession.sparkContext.addFile(path) + val recursive = sparkSession.sessionState.conf.addDirectoryRecursiveEnabled + sparkSession.sparkContext.addFile(path, recursive) Seq.empty[Row] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 9377cb0174673..61500b773cd7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -17,26 +17,26 @@ package org.apache.spark.sql.execution.command -import java.io.File import java.net.{URI, URISyntaxException} -import java.nio.file.FileSystems +import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.util.Try import scala.util.control.NonFatal import org.apache.hadoop.fs.{FileContext, FsConstants, Path} +import org.apache.hadoop.fs.permission.{AclEntry, AclEntryScope, AclEntryType, FsAction, FsPermission} import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute, 
UnresolvedRelation} +import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, UnresolvedAttribute} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTableType._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.DescribeTableSchema import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.util.{escapeSingleQuotedString, quoteIdentifier} +import org.apache.spark.sql.catalyst.util.{escapeSingleQuotedString, quoteIdentifier, CaseInsensitiveMap} import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils} import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.json.JsonFileFormat @@ -45,7 +45,7 @@ import org.apache.spark.sql.execution.datasources.v2.csv.CSVDataSourceV2 import org.apache.spark.sql.execution.datasources.v2.json.JsonDataSourceV2 import org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2 -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils @@ -55,44 +55,76 @@ import org.apache.spark.sql.util.SchemaUtils * are identical to the ones defined in the source table. * * The CatalogTable attributes copied from the source table are storage(inputFormat, outputFormat, - * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec. + * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec by default. + * + * Use "CREATE TABLE t1 LIKE t2 USING file_format" to specify new provider for t1. 
+ * For Hive compatibility, use "CREATE TABLE t1 LIKE t2 STORED AS hiveFormat" + * to specify new file storage format (inputFormat, outputFormat, serde) for t1. * * The syntax of using this command in SQL is: * {{{ * CREATE TABLE [IF NOT EXISTS] [db_name.]table_name - * LIKE [other_db_name.]existing_table_name [locationSpec] + * LIKE [other_db_name.]existing_table_name + * [USING provider | + * [ + * [ROW FORMAT row_format] + * [STORED AS file_format] [WITH SERDEPROPERTIES (...)] + * ] + * ] + * [locationSpec] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ case class CreateTableLikeCommand( targetTable: TableIdentifier, sourceTable: TableIdentifier, - location: Option[String], + fileFormat: CatalogStorageFormat, + provider: Option[String], + properties: Map[String, String] = Map.empty, ifNotExists: Boolean) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog val sourceTableDesc = catalog.getTempViewOrPermanentTableMetadata(sourceTable) - - val newProvider = if (sourceTableDesc.tableType == CatalogTableType.VIEW) { + val newProvider = if (provider.isDefined) { + if (!DDLUtils.isHiveTable(provider)) { + // check the validation of provider input, invalid provider will throw + // AnalysisException, ClassNotFoundException, or NoClassDefFoundError + DataSource.lookupDataSource(provider.get, sparkSession.sessionState.conf) + } + provider + } else if (sourceTableDesc.tableType == CatalogTableType.VIEW) { Some(sparkSession.sessionState.conf.defaultDataSourceName) + } else if (fileFormat.inputFormat.isDefined) { + Some(DDLUtils.HIVE_PROVIDER) } else { sourceTableDesc.provider } + val newStorage = if (fileFormat.inputFormat.isDefined) { + fileFormat + } else { + sourceTableDesc.storage.copy(locationUri = fileFormat.locationUri) + } + // If the location is specified, we create an external table internally. // Otherwise create a managed table. 
- val tblType = if (location.isEmpty) CatalogTableType.MANAGED else CatalogTableType.EXTERNAL + val tblType = if (newStorage.locationUri.isEmpty) { + CatalogTableType.MANAGED + } else { + CatalogTableType.EXTERNAL + } val newTableDesc = CatalogTable( identifier = targetTable, tableType = tblType, - storage = sourceTableDesc.storage.copy( - locationUri = location.map(CatalogUtils.stringToURI(_))), + storage = newStorage, schema = sourceTableDesc.schema, provider = newProvider, partitionColumnNames = sourceTableDesc.partitionColumnNames, - bucketSpec = sourceTableDesc.bucketSpec) + bucketSpec = sourceTableDesc.bucketSpec, + properties = properties) catalog.createTable(newTableDesc, ifNotExists) Seq.empty[Row] @@ -278,6 +310,13 @@ case class LoadDataCommand( val catalog = sparkSession.sessionState.catalog val targetTable = catalog.getTableMetadata(table) val tableIdentwithDB = targetTable.identifier.quotedString + val normalizedSpec = partition.map { spec => + PartitioningUtils.normalizePartitionSpec( + spec, + targetTable.partitionColumnNames, + tableIdentwithDB, + sparkSession.sessionState.conf.resolver) + } if (targetTable.tableType == CatalogTableType.VIEW) { throw new AnalysisException(s"Target table in LOAD DATA cannot be a view: $tableIdentwithDB") @@ -297,13 +336,6 @@ case class LoadDataCommand( s"do not match number of partitioned columns in table " + s"(${targetTable.partitionColumnNames.size})") } - partition.get.keys.foreach { colName => - if (!targetTable.partitionColumnNames.contains(colName)) { - throw new AnalysisException(s"LOAD DATA target table $tableIdentwithDB is partitioned, " + - s"but the specified partition spec refers to a column that is not partitioned: " + - s"'$colName'") - } - } } else { if (partition.nonEmpty) { throw new AnalysisException(s"LOAD DATA target table $tableIdentwithDB is not " + @@ -353,7 +385,7 @@ case class LoadDataCommand( catalog.loadPartition( targetTable.identifier, loadPath.toString, - partition.get, + 
normalizedSpec.get, isOverwrite, inheritTableSpecs = true, isSrcLocal = isLocal) @@ -464,13 +496,74 @@ case class TruncateTableCommand( partLocations } val hadoopConf = spark.sessionState.newHadoopConf() + val ignorePermissionAcl = SQLConf.get.truncateTableIgnorePermissionAcl locations.foreach { location => if (location.isDefined) { val path = new Path(location.get) try { val fs = path.getFileSystem(hadoopConf) + + // Not all fs impl. support these APIs. + var optPermission: Option[FsPermission] = None + var optAcls: Option[java.util.List[AclEntry]] = None + if (!ignorePermissionAcl) { + val fileStatus = fs.getFileStatus(path) + try { + optPermission = Some(fileStatus.getPermission()) + } catch { + case NonFatal(_) => // do nothing + } + + try { + optAcls = Some(fs.getAclStatus(path).getEntries) + } catch { + case NonFatal(_) => // do nothing + } + } + fs.delete(path, true) + + // We should keep original permission/acl of the path. + // For owner/group, only super-user can set it, for example on HDFS. Because + // current user can delete the path, we assume the user/group is correct or not an issue. fs.mkdirs(path) + if (!ignorePermissionAcl) { + optPermission.foreach { permission => + try { + fs.setPermission(path, permission) + } catch { + case NonFatal(e) => + throw new SecurityException( + s"Failed to set original permission $permission back to " + + s"the created path: $path. Exception: ${e.getMessage}") + } + } + optAcls.foreach { acls => + val aclEntries = acls.asScala.filter(_.getName != null).asJava + + // If the path doesn't have default ACLs, `setAcl` API will throw an error + // as it expects user/group/other permissions must be in ACL entries. + // So we need to add tradition user/group/other permission + // in the form of ACL. 
+ optPermission.map { permission => + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.USER, permission.getUserAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.GROUP, permission.getGroupAction())) + aclEntries.add(newAclEntry(AclEntryScope.ACCESS, + AclEntryType.OTHER, permission.getOtherAction())) + } + + try { + fs.setAcl(path, aclEntries) + } catch { + case NonFatal(e) => + throw new SecurityException( + s"Failed to set original ACL $aclEntries back to " + + s"the created path: $path. Exception: ${e.getMessage}") + } + } + } } catch { case NonFatal(e) => throw new AnalysisException( @@ -497,6 +590,16 @@ case class TruncateTableCommand( } Seq.empty[Row] } + + private def newAclEntry( + scope: AclEntryScope, + aclType: AclEntryType, + permission: FsAction): AclEntry = { + new AclEntry.Builder() + .setScope(scope) + .setType(aclType) + .setPermission(permission).build() + } } abstract class DescribeCommandBase extends RunnableCommand { @@ -690,7 +793,8 @@ case class DescribeColumnCommand( } val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table) - val colStats = catalogTable.stats.map(_.colStats).getOrElse(Map.empty) + val colStatsMap = catalogTable.stats.map(_.colStats).getOrElse(Map.empty) + val colStats = if (conf.caseSensitiveAnalysis) colStatsMap else CaseInsensitiveMap(colStatsMap) val cs = colStats.get(field.name) val comment = if (field.metadata.contains("comment")) { @@ -822,22 +926,15 @@ case class ShowTablePropertiesCommand(table: TableIdentifier, propertyKey: Optio } override def run(sparkSession: SparkSession): Seq[Row] = { - val catalog = sparkSession.sessionState.catalog - - if (catalog.isTemporaryTable(table)) { - Seq.empty[Row] - } else { - val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(table) - - propertyKey match { - case Some(p) => - val propValue = catalogTable - .properties - .getOrElse(p, s"Table ${catalogTable.qualifiedName} does not have property: $p") - 
Seq(Row(propValue)) - case None => - catalogTable.properties.map(p => Row(p._1, p._2)).toSeq - } + val catalogTable = sparkSession.sessionState.catalog.getTableMetadata(table) + propertyKey match { + case Some(p) => + val propValue = catalogTable + .properties + .getOrElse(p, s"Table ${catalogTable.qualifiedName} does not have property: $p") + Seq(Row(propValue)) + case None => + catalogTable.properties.map(p => Row(p._1, p._2)).toSeq } } } @@ -859,12 +956,8 @@ case class ShowColumnsCommand( override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog - val resolver = sparkSession.sessionState.conf.resolver val lookupTable = databaseName match { case None => tableName - case Some(db) if tableName.database.exists(!resolver(_, db)) => - throw new AnalysisException( - s"SHOW COLUMNS with conflicting databases: '$db' != '${tableName.database.get}'") case Some(db) => TableIdentifier(tableName.identifier, Some(db)) } val table = catalog.getTempViewOrPermanentTableMetadata(lookupTable) @@ -935,7 +1028,57 @@ case class ShowPartitionsCommand( } } -case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand { +/** + * Provides common utilities between `ShowCreateTableCommand` and `ShowCreateTableAsSparkCommand`. 
+ */ +trait ShowCreateTableCommandBase { + + protected val table: TableIdentifier + + protected def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = { + if (metadata.tableType == EXTERNAL) { + metadata.storage.locationUri.foreach { location => + builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n" + } + } + } + + protected def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = { + metadata + .comment + .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n") + .foreach(builder.append) + } + + protected def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { + if (metadata.properties.nonEmpty) { + val props = metadata.properties.map { case (key, value) => + s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" + } + + builder ++= "TBLPROPERTIES " + builder ++= concatByMultiLines(props) + } + } + + + protected def concatByMultiLines(iter: Iterable[String]): String = { + iter.mkString("(\n ", ",\n ", ")\n") + } +} + +/** + * A command that shows the Spark DDL syntax that can be used to create a given table. + * For Hive serde table, this command will generate Spark DDL that can be used to + * create corresponding Spark table. + * + * The syntax of using this command in SQL is: + * {{{ + * SHOW CREATE TABLE [db_name.]table_name + * }}} + */ +case class ShowCreateTableCommand(table: TableIdentifier) + extends RunnableCommand with ShowCreateTableCommandBase { override val output: Seq[Attribute] = Seq( AttributeReference("createtab_stmt", StringType, nullable = false)() ) @@ -950,16 +1093,158 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman // TODO: [SPARK-28692] unify this after we unify the // CREATE TABLE syntax for hive serde and data source table. 
- val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { - showCreateDataSourceTable(tableMetadata) + val metadata = if (DDLUtils.isDatasourceTable(tableMetadata)) { + tableMetadata } else { - showCreateHiveTable(tableMetadata) + // For a Hive serde table, we try to convert it to Spark DDL. + if (tableMetadata.unsupportedFeatures.nonEmpty) { + throw new AnalysisException( + "Failed to execute SHOW CREATE TABLE against table " + + s"${tableMetadata.identifier}, which is created by Hive and uses the " + + "following unsupported feature(s)\n" + + tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n") + ". " + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead." + ) + } + + if (tableMetadata.tableType == VIEW) { + throw new AnalysisException("Hive view isn't supported by SHOW CREATE TABLE") + } + + if ("true".equalsIgnoreCase(tableMetadata.properties.getOrElse("transactional", "false"))) { + throw new AnalysisException( + "SHOW CREATE TABLE doesn't support transactional Hive table. " + + s"Please use `SHOW CREATE TABLE ${tableMetadata.identifier} AS SERDE` " + + "to show Hive DDL instead.") + } + + convertTableMetadata(tableMetadata) } + val stmt = showCreateDataSourceTable(metadata) + Seq(Row(stmt)) } } + private def convertTableMetadata(tableMetadata: CatalogTable): CatalogTable = { + val hiveSerde = HiveSerDe( + serde = tableMetadata.storage.serde, + inputFormat = tableMetadata.storage.inputFormat, + outputFormat = tableMetadata.storage.outputFormat) + + // Looking for Spark data source that maps to to the Hive serde. + // TODO: some Hive fileformat + row serde might be mapped to Spark data source, e.g. CSV. 
+ val source = HiveSerDe.serdeToSource(hiveSerde) + if (source.isEmpty) { + val builder = StringBuilder.newBuilder + hiveSerde.serde.foreach { serde => + builder ++= s" SERDE: $serde" + } + hiveSerde.inputFormat.foreach { format => + builder ++= s" INPUTFORMAT: $format" + } + hiveSerde.outputFormat.foreach { format => + builder ++= s" OUTPUTFORMAT: $format" + } + throw new AnalysisException( + "Failed to execute SHOW CREATE TABLE against table " + + s"${tableMetadata.identifier}, which is created by Hive and uses the " + + "following unsupported serde configuration\n" + + builder.toString() + ) + } else { + // TODO: should we keep Hive serde properties? + val newStorage = tableMetadata.storage.copy(properties = Map.empty) + tableMetadata.copy(provider = source, storage = newStorage) + } + } + + private def showDataSourceTableDataColumns( + metadata: CatalogTable, builder: StringBuilder): Unit = { + val columns = metadata.schema.fields.map(_.toDDL) + builder ++= concatByMultiLines(columns) + } + + private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = { + // For datasource table, there is a provider there in the metadata. + // If it is a Hive table, we already convert its metadata and fill in a provider. 
+ builder ++= s"USING ${metadata.provider.get}\n" + + val dataSourceOptions = SQLConf.get.redactOptions(metadata.storage.properties).map { + case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'" + } + + if (dataSourceOptions.nonEmpty) { + builder ++= "OPTIONS " + builder ++= concatByMultiLines(dataSourceOptions) + } + } + + private def showDataSourceTableNonDataColumns( + metadata: CatalogTable, builder: StringBuilder): Unit = { + val partCols = metadata.partitionColumnNames + if (partCols.nonEmpty) { + builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n" + } + + metadata.bucketSpec.foreach { spec => + if (spec.bucketColumnNames.nonEmpty) { + builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n" + + if (spec.sortColumnNames.nonEmpty) { + builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n" + } + + builder ++= s"INTO ${spec.numBuckets} BUCKETS\n" + } + } + } + + private def showCreateDataSourceTable(metadata: CatalogTable): String = { + val builder = StringBuilder.newBuilder + + builder ++= s"CREATE TABLE ${table.quotedString} " + showDataSourceTableDataColumns(metadata, builder) + showDataSourceTableOptions(metadata, builder) + showDataSourceTableNonDataColumns(metadata, builder) + showTableComment(metadata, builder) + showTableLocation(metadata, builder) + showTableProperties(metadata, builder) + + builder.toString() + } +} + +/** + * This commands generates the DDL for Hive serde table. 
+ * + * The syntax of using this command in SQL is: + * {{{ + * SHOW CREATE TABLE table_identifier AS SERDE; + * }}} + */ +case class ShowCreateTableAsSerdeCommand(table: TableIdentifier) + extends RunnableCommand with ShowCreateTableCommandBase { + override val output: Seq[Attribute] = Seq( + AttributeReference("createtab_stmt", StringType, nullable = false)() + ) + + override def run(sparkSession: SparkSession): Seq[Row] = { + val catalog = sparkSession.sessionState.catalog + val tableMetadata = catalog.getTableMetadata(table) + + val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) { + throw new AnalysisException( + s"$table is a Spark data source table. Use `SHOW CREATE TABLE` without `AS SERDE` instead.") + } else { + showCreateHiveTable(tableMetadata) + } + + Seq(Row(stmt)) + } + private def showCreateHiveTable(metadata: CatalogTable): String = { def reportUnsupportedError(features: Seq[String]): Unit = { throw new AnalysisException( @@ -987,10 +1272,10 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman builder ++= s"CREATE$tableTypeString ${table.quotedString}" if (metadata.tableType == VIEW) { - if (metadata.schema.nonEmpty) { - builder ++= metadata.schema.map(_.name).mkString("(", ", ", ")") - } - builder ++= metadata.viewText.mkString(" AS\n", "", "\n") + showViewDataColumns(metadata, builder) + showTableComment(metadata, builder) + showViewProperties(metadata, builder) + showViewText(metadata, builder) } else { showHiveTableHeader(metadata, builder) showTableComment(metadata, builder) @@ -1003,13 +1288,42 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman builder.toString() } + private def showViewDataColumns(metadata: CatalogTable, builder: StringBuilder): Unit = { + if (metadata.schema.nonEmpty) { + val viewColumns = metadata.schema.map { f => + val comment = f.getComment() + .map(escapeSingleQuotedString) + .map(" COMMENT '" + _ + "'") + + // view columns shouldn't have data type 
info + s"${quoteIdentifier(f.name)}${comment.getOrElse("")}" + } + builder ++= concatByMultiLines(viewColumns) + } + } + + private def showViewProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { + val viewProps = metadata.properties.filterKeys(!_.startsWith(CatalogTable.VIEW_PREFIX)) + if (viewProps.nonEmpty) { + val props = viewProps.map { case (key, value) => + s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" + } + + builder ++= s"TBLPROPERTIES ${concatByMultiLines(props)}" + } + } + + private def showViewText(metadata: CatalogTable, builder: StringBuilder): Unit = { + builder ++= metadata.viewText.mkString("AS ", "", "\n") + } + private def showHiveTableHeader(metadata: CatalogTable, builder: StringBuilder): Unit = { val columns = metadata.schema.filterNot { column => metadata.partitionColumnNames.contains(column.name) }.map(_.toDDL) if (columns.nonEmpty) { - builder ++= columns.mkString("(", ", ", ")\n") + builder ++= concatByMultiLines(columns) } } @@ -1021,7 +1335,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman if (metadata.bucketSpec.isDefined) { val bucketSpec = metadata.bucketSpec.get - builder ++= s"CLUSTERED BY (${bucketSpec.bucketColumnNames.mkString(",")})\n" + builder ++= s"CLUSTERED BY (${bucketSpec.bucketColumnNames.mkString(", ")})\n" if (bucketSpec.sortColumnNames.nonEmpty) { builder ++= s"SORTED BY (${bucketSpec.sortColumnNames.map(_ + " ASC").mkString(", ")})\n" @@ -1036,12 +1350,12 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman storage.serde.foreach { serde => builder ++= s"ROW FORMAT SERDE '$serde'\n" - val serdeProps = metadata.storage.properties.map { + val serdeProps = SQLConf.get.redactOptions(metadata.storage.properties).map { case (key, value) => s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (\n ", ",\n ", "\n)\n") + builder ++= 
s"WITH SERDEPROPERTIES ${concatByMultiLines(serdeProps)}" } if (storage.inputFormat.isDefined || storage.outputFormat.isDefined) { @@ -1056,83 +1370,4 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman } } } - - private def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = { - if (metadata.tableType == EXTERNAL) { - metadata.storage.locationUri.foreach { location => - builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n" - } - } - } - - private def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = { - metadata - .comment - .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n") - .foreach(builder.append) - } - - private def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = { - if (metadata.properties.nonEmpty) { - val props = metadata.properties.map { case (key, value) => - s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" - } - - builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") - } - } - - private def showCreateDataSourceTable(metadata: CatalogTable): String = { - val builder = StringBuilder.newBuilder - - builder ++= s"CREATE TABLE ${table.quotedString} " - showDataSourceTableDataColumns(metadata, builder) - showDataSourceTableOptions(metadata, builder) - showDataSourceTableNonDataColumns(metadata, builder) - showTableComment(metadata, builder) - showTableLocation(metadata, builder) - showTableProperties(metadata, builder) - - builder.toString() - } - - private def showDataSourceTableDataColumns( - metadata: CatalogTable, builder: StringBuilder): Unit = { - val columns = metadata.schema.fields.map(_.toDDL) - builder ++= columns.mkString("(", ", ", ")\n") - } - - private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = { - builder ++= s"USING ${metadata.provider.get}\n" - - val dataSourceOptions = 
SQLConf.get.redactOptions(metadata.storage.properties).map { - case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'" - } - - if (dataSourceOptions.nonEmpty) { - builder ++= "OPTIONS (\n" - builder ++= dataSourceOptions.mkString(" ", ",\n ", "\n") - builder ++= ")\n" - } - } - - private def showDataSourceTableNonDataColumns( - metadata: CatalogTable, builder: StringBuilder): Unit = { - val partCols = metadata.partitionColumnNames - if (partCols.nonEmpty) { - builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n" - } - - metadata.bucketSpec.foreach { spec => - if (spec.bucketColumnNames.nonEmpty) { - builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n" - - if (spec.sortColumnNames.nonEmpty) { - builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n" - } - - builder ++= s"INTO ${spec.numBuckets} BUCKETS\n" - } - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala index b31514827220e..38481dda428a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala @@ -21,49 +21,14 @@ import scala.collection.mutable import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, UnresolvedRelation} -import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, PersistedView, UnresolvedFunction, UnresolvedRelation, ViewType} +import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType, SessionCatalog} import org.apache.spark.sql.catalyst.expressions.{Alias, SubqueryExpression} import 
org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View} -import org.apache.spark.sql.types.{MetadataBuilder, StructType} +import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.sql.util.SchemaUtils - -/** - * ViewType is used to specify the expected view type when we want to create or replace a view in - * [[CreateViewCommand]]. - */ -sealed trait ViewType { - override def toString: String = getClass.getSimpleName.stripSuffix("$") -} - -/** - * LocalTempView means session-scoped local temporary views. Its lifetime is the lifetime of the - * session that created it, i.e. it will be automatically dropped when the session terminates. It's - * not tied to any databases, i.e. we can't use `db1.view1` to reference a local temporary view. - */ -object LocalTempView extends ViewType - -/** - * GlobalTempView means cross-session global temporary views. Its lifetime is the lifetime of the - * Spark application, i.e. it will be automatically dropped when the application terminates. It's - * tied to a system preserved database `global_temp`, and we must use the qualified name to refer a - * global temp view, e.g. SELECT * FROM global_temp.view1. - */ -object GlobalTempView extends ViewType - -/** - * PersistedView means cross-session persisted views. Persisted views stay until they are - * explicitly dropped by user command. It's always tied to a database, default to the current - * database if not specified. - * - * Note that, Existing persisted view with the same name are not visible to the current session - * while the local temporary view exists, unless the view name is qualified by database. - */ -object PersistedView extends ViewType - - /** * Create or replace a view with given query plan. This command will generate some view-specific * properties(e.g. 
view default database, view query output column names) and store them as @@ -136,11 +101,12 @@ case class CreateViewCommand( s"specified by CREATE VIEW (num: `${userSpecifiedColumns.length}`).") } + val catalog = sparkSession.sessionState.catalog + // When creating a permanent view, not allowed to reference temporary objects. // This should be called after `qe.assertAnalyzed()` (i.e., `child` can be resolved) - verifyTemporaryObjectsNotExists(sparkSession) + verifyTemporaryObjectsNotExists(catalog) - val catalog = sparkSession.sessionState.catalog if (viewType == LocalTempView) { val aliasedPlan = aliasPlan(sparkSession, analyzedPlan) catalog.createTempView(name.table, aliasedPlan, overrideIfExists = replace) @@ -180,9 +146,8 @@ case class CreateViewCommand( /** * Permanent views are not allowed to reference temp objects, including temp function and views */ - private def verifyTemporaryObjectsNotExists(sparkSession: SparkSession): Unit = { - import sparkSession.sessionState.analyzer.AsTableIdentifier - + private def verifyTemporaryObjectsNotExists(catalog: SessionCatalog): Unit = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ if (!isTemporary) { // This func traverses the unresolved plan `child`. Below are the reasons: // 1) Analyzer replaces unresolved temporary views by a SubqueryAlias with the corresponding @@ -190,21 +155,24 @@ case class CreateViewCommand( // added/generated from a temporary view. // 2) The temp functions are represented by multiple classes. Most are inaccessible from this // package (e.g., HiveGenericUDF). - child.collect { - // Disallow creating permanent views based on temporary views. 
- case UnresolvedRelation(AsTableIdentifier(ident)) - if sparkSession.sessionState.catalog.isTemporaryTable(ident) => - // temporary views are only stored in the session catalog - throw new AnalysisException(s"Not allowed to create a permanent view $name by " + - s"referencing a temporary view $ident") - case other if !other.resolved => other.expressions.flatMap(_.collect { - // Disallow creating permanent views based on temporary UDFs. - case e: UnresolvedFunction - if sparkSession.sessionState.catalog.isTemporaryFunction(e.name) => + def verify(child: LogicalPlan) { + child.collect { + // Disallow creating permanent views based on temporary views. + case UnresolvedRelation(nameParts) if catalog.isTempView(nameParts) => throw new AnalysisException(s"Not allowed to create a permanent view $name by " + - s"referencing a temporary function `${e.name}`") - }) + s"referencing a temporary view ${nameParts.quoted}. " + + "Please create a temp view instead by CREATE TEMP VIEW") + case other if !other.resolved => other.expressions.flatMap(_.collect { + // Traverse subquery plan for any unresolved relations. + case e: SubqueryExpression => verify(e.plan) + // Disallow creating permanent views based on temporary UDFs. + case e: UnresolvedFunction if catalog.isTemporaryFunction(e.name) => + throw new AnalysisException(s"Not allowed to create a permanent view $name by " + + s"referencing a temporary function `${e.name}`") + }) + } } + verify(child) } } @@ -316,13 +284,6 @@ object ViewHelper { import CatalogTable._ - /** - * Generate the view default database in `properties`. - */ - private def generateViewDefaultDatabase(databaseName: String): Map[String, String] = { - Map(VIEW_DEFAULT_DATABASE -> databaseName) - } - /** * Generate the view query output column names in `properties`. 
*/ @@ -372,10 +333,10 @@ object ViewHelper { SchemaUtils.checkColumnNameDuplication( fieldNames, "in the view definition", session.sessionState.conf.resolver) - // Generate the view default database name. - val viewDefaultDatabase = session.sessionState.catalog.getCurrentDatabase + // Generate the view default catalog and namespace. + val manager = session.sessionState.catalogManager removeQueryColumnNames(properties) ++ - generateViewDefaultDatabase(viewDefaultDatabase) ++ + catalogAndNamespaceToProps(manager.currentCatalog.name, manager.currentNamespace) ++ generateQueryColumnNames(queryOutput) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala index 8736d0713e0b3..91313f33a78e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/CatalogFileIndex.scala @@ -81,10 +81,15 @@ class CatalogFileIndex( } val partitionSpec = PartitionSpec(partitionSchema, partitions) val timeNs = System.nanoTime() - startTime - new PrunedInMemoryFileIndex( - sparkSession, new Path(baseLocation.get), fileStatusCache, partitionSpec, Option(timeNs)) + new InMemoryFileIndex(sparkSession, + rootPathsSpecified = partitionSpec.partitions.map(_.path), + parameters = Map.empty, + userSpecifiedSchema = Some(partitionSpec.partitionColumns), + fileStatusCache = fileStatusCache, + userSpecifiedPartitionSpec = Some(partitionSpec), + metadataOpsTimeNs = Some(timeNs)) } else { - new InMemoryFileIndex(sparkSession, rootPaths, table.storage.properties, + new InMemoryFileIndex(sparkSession, rootPaths, parameters = table.storage.properties, userSpecifiedSchema = None, fileStatusCache = fileStatusCache) } } @@ -101,23 +106,3 @@ class CatalogFileIndex( override def hashCode(): Int = table.identifier.hashCode() } - -/** - * An override of the 
standard HDFS listing based catalog, that overrides the partition spec with - * the information from the metastore. - * - * @param tableBasePath The default base path of the Hive metastore table - * @param partitionSpec The partition specifications from Hive metastore - */ -private class PrunedInMemoryFileIndex( - sparkSession: SparkSession, - tableBasePath: Path, - fileStatusCache: FileStatusCache, - override val partitionSpec: PartitionSpec, - override val metadataOpsTimeNs: Option[Long]) - extends InMemoryFileIndex( - sparkSession, - partitionSpec.partitions.map(_.path), - Map.empty, - Some(partitionSpec.partitionColumns), - fileStatusCache) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 9376f08351791..3615afcf86c7a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogUtils} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.TableProvider import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.DataWritingCommand import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat @@ -46,7 +47,6 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamProvider, TextSocketSourceProvider} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.TableProvider import org.apache.spark.sql.streaming.OutputMode import 
org.apache.spark.sql.types.{CalendarIntervalType, StructField, StructType} import org.apache.spark.sql.util.SchemaUtils @@ -343,7 +343,12 @@ case class DataSource( val baseRelation = dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions) if (baseRelation.schema != schema) { - throw new AnalysisException(s"$className does not allow user-specified schemas.") + throw new AnalysisException( + "The user-specified schema doesn't match the actual schema: " + + s"user-specified: ${schema.toDDL}, actual: ${baseRelation.schema.toDDL}. If " + + "you're using DataFrameReader.schema API or creating a table, please do not " + + "specify the schema. Or if you're scanning an existed table, please drop " + + "it and re-create it.") } baseRelation @@ -378,8 +383,6 @@ case class DataSource( // This is a non-streaming file based datasource. case (format: FileFormat, _) => - val globbedPaths = - checkAndGlobPathIfNecessary(checkEmptyGlobPath = true, checkFilesExist = checkFilesExist) val useCatalogFileIndex = sparkSession.sqlContext.conf.manageFilesourcePartitions && catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog && catalogTable.get.partitionColumnNames.nonEmpty @@ -391,6 +394,8 @@ case class DataSource( catalogTable.get.stats.map(_.sizeInBytes.toLong).getOrElse(defaultTableSize)) (index, catalogTable.get.dataSchema, catalogTable.get.partitionSchema) } else { + val globbedPaths = checkAndGlobPathIfNecessary( + checkEmptyGlobPath = true, checkFilesExist = checkFilesExist) val index = createInMemoryFileIndex(globbedPaths) val (resultDataSchema, resultPartitionSchema) = getOrInferFileFormatSchema(format, () => index) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolution.scala deleted file mode 100644 index a37a2cf7f0369..0000000000000 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceResolution.scala +++ /dev/null @@ -1,384 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources - -import scala.collection.mutable - -import org.apache.spark.sql.{AnalysisException, SaveMode} -import org.apache.spark.sql.catalog.v2.{CatalogManager, CatalogPlugin, Identifier, LookupCatalog, TableCatalog} -import org.apache.spark.sql.catalog.v2.expressions.Transform -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{CastSupport, UnresolvedRelation} -import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTableType, CatalogUtils, UnresolvedCatalogRelation} -import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, CreateV2Table, DeleteFromTable, DropTable, Filter, LogicalPlan, ReplaceTable, ReplaceTableAsSelect, ShowTables, SubqueryAlias} -import org.apache.spark.sql.catalyst.plans.logical.sql.{AlterTableAddColumnsStatement, AlterTableSetLocationStatement, AlterTableSetPropertiesStatement, AlterTableUnsetPropertiesStatement, AlterViewSetPropertiesStatement, AlterViewUnsetPropertiesStatement, 
CreateTableAsSelectStatement, CreateTableStatement, DeleteFromStatement, DescribeColumnStatement, DescribeTableStatement, DropTableStatement, DropViewStatement, QualifiedColType, ReplaceTableAsSelectStatement, ReplaceTableStatement, ShowTablesStatement} -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.command.{AlterTableAddColumnsCommand, AlterTableSetLocationCommand, AlterTableSetPropertiesCommand, AlterTableUnsetPropertiesCommand, DescribeColumnCommand, DescribeTableCommand, DropTableCommand, ShowTablesCommand} -import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.types.{HIVE_TYPE_STRING, HiveStringType, MetadataBuilder, StructField, StructType} - -case class DataSourceResolution( - conf: SQLConf, - catalogManager: CatalogManager) - extends Rule[LogicalPlan] with CastSupport with LookupCatalog { - - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ - - def v2SessionCatalog: CatalogPlugin = sessionCatalog.getOrElse( - throw new AnalysisException("No v2 session catalog implementation is available")) - - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case CreateTableStatement( - AsTableIdentifier(table), schema, partitionCols, bucketSpec, properties, - V1Provider(provider), options, location, comment, ifNotExists) => - // the source is v1, the identifier has no catalog, and there is no default v2 catalog - val tableDesc = buildCatalogTable(table, schema, partitionCols, bucketSpec, properties, - provider, options, location, comment, ifNotExists) - val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists - - CreateTable(tableDesc, mode, None) - - case create: CreateTableStatement => - // the provider was not a v1 source or a v2 catalog is the default, convert to a v2 plan - val CatalogObjectIdentifier(maybeCatalog, identifier) = create.tableName - maybeCatalog match { - case 
Some(catalog) => - // the identifier had a catalog, or there is a default v2 catalog - convertCreateTable(catalog.asTableCatalog, identifier, create) - case _ => - // the identifier had no catalog and no default catalog is set, but the source is v2. - // use the v2 session catalog, which delegates to the global v1 session catalog - convertCreateTable(v2SessionCatalog.asTableCatalog, identifier, create) - } - - case CreateTableAsSelectStatement( - AsTableIdentifier(table), query, partitionCols, bucketSpec, properties, - V1Provider(provider), options, location, comment, ifNotExists) => - // the source is v1, the identifier has no catalog, and there is no default v2 catalog - val tableDesc = buildCatalogTable(table, new StructType, partitionCols, bucketSpec, - properties, provider, options, location, comment, ifNotExists) - val mode = if (ifNotExists) SaveMode.Ignore else SaveMode.ErrorIfExists - - CreateTable(tableDesc, mode, Some(query)) - - case create: CreateTableAsSelectStatement => - // the provider was not a v1 source or a v2 catalog is the default, convert to a v2 plan - val CatalogObjectIdentifier(maybeCatalog, identifier) = create.tableName - maybeCatalog match { - case Some(catalog) => - // the identifier had a catalog, or there is a default v2 catalog - convertCTAS(catalog.asTableCatalog, identifier, create) - case _ => - // the identifier had no catalog and no default catalog is set, but the source is v2. 
- // use the v2 session catalog, which delegates to the global v1 session catalog - convertCTAS(v2SessionCatalog.asTableCatalog, identifier, create) - } - - case DescribeColumnStatement( - AsTableIdentifier(tableName), colName, isExtended) => - DescribeColumnCommand(tableName, colName, isExtended) - - case DescribeColumnStatement( - CatalogObjectIdentifier(Some(catalog), ident), colName, isExtended) => - throw new AnalysisException("Describing columns is not supported for v2 tables.") - - case DescribeTableStatement( - AsTableIdentifier(tableName), partitionSpec, isExtended) => - DescribeTableCommand(tableName, partitionSpec, isExtended) - - case ReplaceTableStatement( - AsTableIdentifier(table), schema, partitionCols, bucketSpec, properties, - V1Provider(provider), options, location, comment, orCreate) => - throw new AnalysisException( - s"Replacing tables is not supported using the legacy / v1 Spark external catalog" + - s" API. Write provider name: $provider, identifier: $table.") - - case ReplaceTableAsSelectStatement( - AsTableIdentifier(table), query, partitionCols, bucketSpec, properties, - V1Provider(provider), options, location, comment, orCreate) => - throw new AnalysisException( - s"Replacing tables is not supported using the legacy / v1 Spark external catalog" + - s" API. 
Write provider name: $provider, identifier: $table.") - - case replace: ReplaceTableStatement => - // the provider was not a v1 source, convert to a v2 plan - val CatalogObjectIdentifier(maybeCatalog, identifier) = replace.tableName - val catalog = maybeCatalog.orElse(sessionCatalog) - .getOrElse(throw new AnalysisException( - s"No catalog specified for table ${identifier.quoted} and no default catalog is set")) - .asTableCatalog - convertReplaceTable(catalog, identifier, replace) - - case rtas: ReplaceTableAsSelectStatement => - // the provider was not a v1 source, convert to a v2 plan - val CatalogObjectIdentifier(maybeCatalog, identifier) = rtas.tableName - val catalog = maybeCatalog.orElse(sessionCatalog) - .getOrElse(throw new AnalysisException( - s"No catalog specified for table ${identifier.quoted} and no default catalog is set")) - .asTableCatalog - convertRTAS(catalog, identifier, rtas) - - case DropTableStatement(CatalogObjectIdentifier(Some(catalog), ident), ifExists, _) => - DropTable(catalog.asTableCatalog, ident, ifExists) - - case DropTableStatement(AsTableIdentifier(tableName), ifExists, purge) => - DropTableCommand(tableName, ifExists, isView = false, purge) - - case DropViewStatement(CatalogObjectIdentifier(Some(catalog), ident), _) => - throw new AnalysisException( - s"Can not specify catalog `${catalog.name}` for view $ident " + - s"because view support in catalog has not been implemented yet") - - case DropViewStatement(AsTableIdentifier(tableName), ifExists) => - DropTableCommand(tableName, ifExists, isView = true, purge = false) - - case AlterTableSetPropertiesStatement(AsTableIdentifier(table), properties) => - AlterTableSetPropertiesCommand(table, properties, isView = false) - - case AlterViewSetPropertiesStatement(AsTableIdentifier(table), properties) => - AlterTableSetPropertiesCommand(table, properties, isView = true) - - case AlterTableUnsetPropertiesStatement(AsTableIdentifier(table), propertyKeys, ifExists) => - 
AlterTableUnsetPropertiesCommand(table, propertyKeys, ifExists, isView = false) - - case AlterViewUnsetPropertiesStatement(AsTableIdentifier(table), propertyKeys, ifExists) => - AlterTableUnsetPropertiesCommand(table, propertyKeys, ifExists, isView = true) - - case AlterTableSetLocationStatement(AsTableIdentifier(table), newLocation) => - AlterTableSetLocationCommand(table, None, newLocation) - - case AlterTableAddColumnsStatement(AsTableIdentifier(table), newColumns) - if newColumns.forall(_.name.size == 1) => - // only top-level adds are supported using AlterTableAddColumnsCommand - AlterTableAddColumnsCommand(table, newColumns.map(convertToStructField)) - - case DeleteFromStatement(AsTableIdentifier(table), tableAlias, condition) => - throw new AnalysisException( - s"Delete from tables is not supported using the legacy / v1 Spark external catalog" + - s" API. Identifier: $table.") - - case delete: DeleteFromStatement => - val relation = UnresolvedRelation(delete.tableName) - val aliased = delete.tableAlias.map(SubqueryAlias(_, relation)).getOrElse(relation) - DeleteFromTable(aliased, delete.condition) - - case ShowTablesStatement(None, pattern) => - defaultCatalog match { - case Some(catalog) => - ShowTables( - catalog.asTableCatalog, - catalogManager.currentNamespace, - pattern) - case None => - ShowTablesCommand(None, pattern) - } - - case ShowTablesStatement(Some(namespace), pattern) => - val CatalogNamespace(maybeCatalog, ns) = namespace - maybeCatalog match { - case Some(catalog) => - ShowTables(catalog.asTableCatalog, ns, pattern) - case None => - if (namespace.length != 1) { - throw new AnalysisException( - s"The database name is not valid: ${namespace.quoted}") - } - ShowTablesCommand(Some(namespace.quoted), pattern) - } - } - - object V1Provider { - def unapply(provider: String): Option[String] = { - DataSource.lookupDataSourceV2(provider, conf) match { - // TODO(SPARK-28396): Currently file source v2 can't work with tables. 
- case Some(_: FileDataSourceV2) => Some(provider) - case Some(_) => None - case _ => Some(provider) - } - } - } - - private def buildCatalogTable( - table: TableIdentifier, - schema: StructType, - partitioning: Seq[Transform], - bucketSpec: Option[BucketSpec], - properties: Map[String, String], - provider: String, - options: Map[String, String], - location: Option[String], - comment: Option[String], - ifNotExists: Boolean): CatalogTable = { - - val storage = DataSource.buildStorageFormatFromOptions(options) - if (location.isDefined && storage.locationUri.isDefined) { - throw new AnalysisException( - "LOCATION and 'path' in OPTIONS are both used to indicate the custom table path, " + - "you can only specify one of them.") - } - val customLocation = storage.locationUri.orElse(location.map(CatalogUtils.stringToURI)) - - val tableType = if (customLocation.isDefined) { - CatalogTableType.EXTERNAL - } else { - CatalogTableType.MANAGED - } - - CatalogTable( - identifier = table, - tableType = tableType, - storage = storage.copy(locationUri = customLocation), - schema = schema, - provider = Some(provider), - partitionColumnNames = partitioning.asPartitionColumns, - bucketSpec = bucketSpec, - properties = properties, - comment = comment) - } - - private def convertCTAS( - catalog: TableCatalog, - identifier: Identifier, - ctas: CreateTableAsSelectStatement): CreateTableAsSelect = { - // convert the bucket spec and add it as a transform - val partitioning = ctas.partitioning ++ ctas.bucketSpec.map(_.asTransform) - val properties = convertTableProperties( - ctas.properties, ctas.options, ctas.location, ctas.comment, ctas.provider) - - CreateTableAsSelect( - catalog, - identifier, - partitioning, - ctas.asSelect, - properties, - writeOptions = ctas.options.filterKeys(_ != "path"), - ignoreIfExists = ctas.ifNotExists) - } - - private def convertCreateTable( - catalog: TableCatalog, - identifier: Identifier, - create: CreateTableStatement): CreateV2Table = { - // convert the 
bucket spec and add it as a transform - val partitioning = create.partitioning ++ create.bucketSpec.map(_.asTransform) - val properties = convertTableProperties( - create.properties, create.options, create.location, create.comment, create.provider) - - CreateV2Table( - catalog, - identifier, - create.tableSchema, - partitioning, - properties, - ignoreIfExists = create.ifNotExists) - } - - private def convertRTAS( - catalog: TableCatalog, - identifier: Identifier, - rtas: ReplaceTableAsSelectStatement): ReplaceTableAsSelect = { - // convert the bucket spec and add it as a transform - val partitioning = rtas.partitioning ++ rtas.bucketSpec.map(_.asTransform) - val properties = convertTableProperties( - rtas.properties, rtas.options, rtas.location, rtas.comment, rtas.provider) - - ReplaceTableAsSelect( - catalog, - identifier, - partitioning, - rtas.asSelect, - properties, - writeOptions = rtas.options.filterKeys(_ != "path"), - orCreate = rtas.orCreate) - } - - private def convertReplaceTable( - catalog: TableCatalog, - identifier: Identifier, - replace: ReplaceTableStatement): ReplaceTable = { - // convert the bucket spec and add it as a transform - val partitioning = replace.partitioning ++ replace.bucketSpec.map(_.asTransform) - val properties = convertTableProperties( - replace.properties, replace.options, replace.location, replace.comment, replace.provider) - - ReplaceTable( - catalog, - identifier, - replace.tableSchema, - partitioning, - properties, - orCreate = replace.orCreate) - } - - private def convertTableProperties( - properties: Map[String, String], - options: Map[String, String], - location: Option[String], - comment: Option[String], - provider: String): Map[String, String] = { - if (options.contains("path") && location.isDefined) { - throw new AnalysisException( - "LOCATION and 'path' in OPTIONS are both used to indicate the custom table path, " + - "you can only specify one of them.") - } - - if ((options.contains("comment") || 
properties.contains("comment")) - && comment.isDefined) { - throw new AnalysisException( - "COMMENT and option/property 'comment' are both used to set the table comment, you can " + - "only specify one of them.") - } - - if (options.contains("provider") || properties.contains("provider")) { - throw new AnalysisException( - "USING and option/property 'provider' are both used to set the provider implementation, " + - "you can only specify one of them.") - } - - val filteredOptions = options.filterKeys(_ != "path") - - // create table properties from TBLPROPERTIES and OPTIONS clauses - val tableProperties = new mutable.HashMap[String, String]() - tableProperties ++= properties - tableProperties ++= filteredOptions - - // convert USING, LOCATION, and COMMENT clauses to table properties - tableProperties += ("provider" -> provider) - comment.map(text => tableProperties += ("comment" -> text)) - location.orElse(options.get("path")).map(loc => tableProperties += ("location" -> loc)) - - tableProperties.toMap - } - - private def convertToStructField(col: QualifiedColType): StructField = { - val builder = new MetadataBuilder - col.comment.foreach(builder.putString("comment", _)) - - val cleanedDataType = HiveStringType.replaceCharType(col.dataType) - if (col.dataType != cleanedDataType) { - builder.putString(HIVE_TYPE_STRING, col.dataType.catalogString) - } - - StructField( - col.name.head, - cleanedDataType, - nullable = true, - builder.build()) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 4dcf5c52ce83d..e3a0a0a6c34e5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.catalyst.catalog._ import 
org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTable, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.planning.ScanOperation +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoStatement, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} import org.apache.spark.sql.execution.command._ @@ -140,7 +140,7 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast if query.resolved && DDLUtils.isDatasourceTable(tableDesc) => CreateDataSourceTableAsSelectCommand(tableDesc, mode, query, query.output.map(_.name)) - case InsertIntoTable(l @ LogicalRelation(_: InsertableRelation, _, _, _), + case InsertIntoStatement(l @ LogicalRelation(_: InsertableRelation, _, _, _), parts, query, overwrite, false) if parts.isEmpty => InsertIntoDataSourceCommand(l, query, overwrite) @@ -152,7 +152,7 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast InsertIntoDataSourceDirCommand(storage, provider.get, query, overwrite) - case i @ InsertIntoTable( + case i @ InsertIntoStatement( l @ LogicalRelation(t: HadoopFsRelation, _, table, _), parts, query, overwrite, _) => // If the InsertIntoTable command is for a partitioned HadoopFsRelation and // the user has specified static partitions, we add a Project operator on top of the query @@ -188,15 +188,13 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast } val outputPath = t.location.rootPaths.head - if (overwrite) DDLUtils.verifyNotReadPath(actualQuery, outputPath) - val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append val partitionSchema = actualQuery.resolve( 
t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) val staticPartitions = parts.filter(_._2.nonEmpty).map { case (k, v) => k -> v.get } - InsertIntoHadoopFsRelationCommand( + val insertCommand = InsertIntoHadoopFsRelationCommand( outputPath, staticPartitions, i.ifPartitionNotExists, @@ -209,6 +207,14 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast table, Some(t.location), actualQuery.output.map(_.name)) + + // For dynamic partition overwrite, we do not delete partition directories ahead. + // We write to staging directories and move to final partition directories after writing + // job is done. So it is ok to have outputPath try to overwrite inputpath. + if (overwrite && !insertCommand.dynamicPartitionOverwrite) { + DDLUtils.verifyNotReadPath(actualQuery, outputPath) + } + insertCommand } } @@ -241,11 +247,11 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] } override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoTable(UnresolvedCatalogRelation(tableMeta), _, _, _, _) + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta), _, _, _, _) if DDLUtils.isDatasourceTable(tableMeta) => i.copy(table = readDataSourceTable(tableMeta)) - case i @ InsertIntoTable(UnresolvedCatalogRelation(tableMeta), _, _, _, _) => + case i @ InsertIntoStatement(UnresolvedCatalogRelation(tableMeta), _, _, _, _) => i.copy(table = DDLUtils.readHiveTable(tableMeta)) case UnresolvedCatalogRelation(tableMeta) if DDLUtils.isDatasourceTable(tableMeta) => @@ -264,7 +270,7 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with import DataSourceStrategy._ def apply(plan: LogicalPlan): Seq[execution.SparkPlan] = plan match { - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: CatalystScan, _, _, _)) => + case ScanOperation(projects, filters, l @ LogicalRelation(t: CatalystScan, _, _, _)) => pruneFilterProjectRaw( l, 
projects, @@ -272,7 +278,7 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with (requestedColumns, allPredicates, _) => toCatalystRDD(l, requestedColumns, t.buildScan(requestedColumns, allPredicates))) :: Nil - case PhysicalOperation(projects, filters, + case ScanOperation(projects, filters, l @ LogicalRelation(t: PrunedFilteredScan, _, _, _)) => pruneFilterProject( l, @@ -280,7 +286,7 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with filters, (a, f) => toCatalystRDD(l, a, t.buildScan(a.map(_.name).toArray, f))) :: Nil - case PhysicalOperation(projects, filters, l @ LogicalRelation(t: PrunedScan, _, _, _)) => + case ScanOperation(projects, filters, l @ LogicalRelation(t: PrunedScan, _, _, _)) => pruneFilterProject( l, projects, @@ -403,14 +409,7 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with relation: LogicalRelation, output: Seq[Attribute], rdd: RDD[Row]): RDD[InternalRow] = { - if (relation.relation.needConversion) { - val converters = RowEncoder(StructType.fromAttributes(output)) - rdd.mapPartitions { iterator => - iterator.map(converters.toRow) - } - } else { - rdd.asInstanceOf[RDD[InternalRow]] - } + DataSourceStrategy.toCatalystRDD(relation.relation, output, rdd) } /** @@ -423,14 +422,14 @@ case class DataSourceStrategy(conf: SQLConf) extends Strategy with Logging with object DataSourceStrategy { /** - * The attribute name of predicate could be different than the one in schema in case of - * case insensitive, we should change them to match the one in schema, so we do not need to - * worry about case sensitivity anymore. + * The attribute name may differ from the one in the schema if the query analyzer + * is case insensitive. We should change attribute names to match the ones in the schema, + * so we do not need to worry about case sensitivity anymore. 
*/ - protected[sql] def normalizeFilters( - filters: Seq[Expression], + protected[sql] def normalizeExprs( + exprs: Seq[Expression], attributes: Seq[AttributeReference]): Seq[Expression] = { - filters.map { e => + exprs.map { e => e transform { case a: AttributeReference => a.withName(attributes.find(_.semanticEquals(a)).getOrElse(a).name) @@ -618,4 +617,21 @@ object DataSourceStrategy { (nonconvertiblePredicates ++ unhandledPredicates, pushedFilters, handledFilters) } + + /** + * Convert RDD of Row into RDD of InternalRow with objects in catalyst types + */ + private[sql] def toCatalystRDD( + relation: BaseRelation, + output: Seq[Attribute], + rdd: RDD[Row]): RDD[InternalRow] = { + if (relation.needConversion) { + val converters = RowEncoder(StructType.fromAttributes(output)) + rdd.mapPartitions { iterator => + iterator.map(converters.toRow) + } + } else { + rdd.asInstanceOf[RDD[InternalRow]] + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala index 813af8203c2c5..28a63c26604ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FallBackFileSourceV2.scala @@ -20,12 +20,12 @@ package org.apache.spark.sql.execution.datasources import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoStatement, LogicalPlan} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileDataSourceV2, FileTable} /** - * Replace the File source V2 table in [[InsertIntoTable]] to V1 [[FileFormat]]. 
+ * Replace the File source V2 table in [[InsertIntoStatement]] to V1 [[FileFormat]]. * E.g, with temporary view `t` using [[FileDataSourceV2]], inserting into view `t` fails * since there is no corresponding physical plan. * This is a temporary hack for making current data source V2 work. It should be @@ -33,7 +33,8 @@ import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, File */ class FallBackFileSourceV2(sparkSession: SparkSession) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoTable(d @ DataSourceV2Relation(table: FileTable, _, _), _, _, _, _) => + case i @ + InsertIntoStatement(d @ DataSourceV2Relation(table: FileTable, _, _, _, _), _, _, _, _) => val v1FileFormat = table.fallbackFileFormat.newInstance() val relation = HadoopFsRelation( table.fileIndex, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala index 2595cc6371bc2..50c4f6cd57a96 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatDataWriter.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} +import org.apache.spark.sql.connector.write.{DataWriter, WriterCommitMessage} import org.apache.spark.sql.types.StringType import org.apache.spark.util.SerializableConfiguration @@ -86,6 +86,8 @@ abstract class FileFormatDataWriter( committer.abortTask(taskAttemptContext) } } + + override def close(): Unit = {} } /** FileFormatWriteTask 
for empty partitions */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala index f1fc5d762ad56..219c778b9164a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormatWriter.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import java.util.{Date, UUID} import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{FileAlreadyExistsException, Path} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl @@ -277,10 +277,16 @@ object FileFormatWriter extends Logging { // If there is an error, abort the task dataWriter.abort() logError(s"Job $jobId aborted.") + }, finallyBlock = { + dataWriter.close() }) } catch { case e: FetchFailedException => throw e + case f: FileAlreadyExistsException => + // If any output file to write already exists, it does not make sense to re-run this task. + // We throw the exception and let Executor throw ExceptionFailure to abort the job. 
+ throw new TaskOutputFileAlreadyExistException(f) case t: Throwable => throw new SparkException("Task failed while writing rows.", t) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala index 14bee173cc116..b4fc94e097aa8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FilePartition.scala @@ -22,7 +22,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.Partition import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.sources.v2.reader.InputPartition +import org.apache.spark.sql.connector.read.InputPartition /** * A collection of file blocks that should be read as a single task diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 9e98b0bbfabc9..542c996a5342d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -19,17 +19,14 @@ package org.apache.spark.sql.execution.datasources import java.io.{FileNotFoundException, IOException} -import scala.collection.mutable - import org.apache.parquet.io.ParquetDecodingException -import org.apache.spark.{Partition => RDDPartition, TaskContext, TaskKilledException} +import org.apache.spark.{Partition => RDDPartition, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.QueryExecutionException -import 
org.apache.spark.sql.sources.v2.reader.InputPartition import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.NextIterator diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index c8a42f043f15f..f45495121a980 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.planning.ScanOperation import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan} import org.apache.spark.util.collection.BitSet @@ -137,7 +137,7 @@ object FileSourceStrategy extends Strategy with Logging { } def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case PhysicalOperation(projects, filters, + case ScanOperation(projects, filters, l @ LogicalRelation(fsRelation: HadoopFsRelation, _, table, _)) => // Filters on this relation fall into four categories based on where we can use them to avoid // reading unneeded data: @@ -147,7 +147,8 @@ object FileSourceStrategy extends Strategy with Logging { // - filters that need to be evaluated again after the scan val filterSet = ExpressionSet(filters) - val normalizedFilters = DataSourceStrategy.normalizeFilters(filters, l.output) + val normalizedFilters = DataSourceStrategy.normalizeExprs( + filters.filter(_.deterministic), l.output) val partitionColumns = l.resolve( @@ -177,6 +178,8 @@ object FileSourceStrategy extends Strategy with Logging { // Partition 
keys are not available in the statistics of the files. val dataFilters = normalizedFiltersWithoutSubqueries.filter(_.references.intersect(partitionSet).isEmpty) + logInfo(s"Pushed Filters: " + + s"${dataFilters.flatMap(DataSourceStrategy.translateFilter).mkString(",")}") // Predicates with both partition keys and attributes need to be evaluated after the scan. val afterScanFilters = filterSet -- partitionKeyFilters.filter(_.references.nonEmpty) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala index cf7a13050f66c..cac2d6e626120 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala @@ -50,7 +50,9 @@ class InMemoryFileIndex( rootPathsSpecified: Seq[Path], parameters: Map[String, String], userSpecifiedSchema: Option[StructType], - fileStatusCache: FileStatusCache = NoopCache) + fileStatusCache: FileStatusCache = NoopCache, + userSpecifiedPartitionSpec: Option[PartitionSpec] = None, + override val metadataOpsTimeNs: Option[Long] = None) extends PartitioningAwareFileIndex( sparkSession, parameters, userSpecifiedSchema, fileStatusCache) { @@ -69,7 +71,11 @@ class InMemoryFileIndex( override def partitionSpec(): PartitionSpec = { if (cachedPartitionSpec == null) { - cachedPartitionSpec = inferPartitioning() + if (userSpecifiedPartitionSpec.isDefined) { + cachedPartitionSpec = userSpecifiedPartitionSpec.get + } else { + cachedPartitionSpec = inferPartitioning() + } } logTrace(s"Partition spec: $cachedPartitionSpec") cachedPartitionSpec @@ -111,6 +117,7 @@ class InMemoryFileIndex( * This is publicly visible for testing. 
*/ def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + val startTime = System.nanoTime() val output = mutable.LinkedHashSet[FileStatus]() val pathsToFetch = mutable.ArrayBuffer[Path]() for (path <- paths) { @@ -121,7 +128,7 @@ class InMemoryFileIndex( case None => pathsToFetch += path } - Unit // for some reasons scalac 2.12 needs this; return type doesn't matter + () // for some reasons scalac 2.12 needs this; return type doesn't matter } val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass)) val discovered = InMemoryFileIndex.bulkListLeafFiles( @@ -131,6 +138,8 @@ class InMemoryFileIndex( fileStatusCache.putLeafFiles(path, leafFiles.toArray) output ++= leafFiles } + logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to list leaf files" + + s" for ${paths.length} paths.") output } } @@ -171,6 +180,7 @@ object InMemoryFileIndex extends Logging { areRootPaths: Boolean): Seq[(Path, Seq[FileStatus])] = { val ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles + val ignoreLocality = sparkSession.sessionState.conf.ignoreDataLocality // Short-circuits parallel listing when serial listing is likely to be faster. if (paths.size <= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { @@ -181,12 +191,14 @@ object InMemoryFileIndex extends Logging { filter, Some(sparkSession), ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, isRootPath = areRootPaths) (path, leafFiles) } } - logInfo(s"Listing leaf files and directories in parallel under: ${paths.mkString(", ")}") + logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." 
+ + s" The first several paths are: ${paths.take(10).mkString(", ")}.") HiveCatalogMetrics.incrementParallelListingJobCount(1) val sparkContext = sparkSession.sparkContext @@ -221,6 +233,7 @@ object InMemoryFileIndex extends Logging { filter, None, ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, isRootPath = areRootPaths) (path, leafFiles) }.iterator @@ -287,6 +300,7 @@ object InMemoryFileIndex extends Logging { filter: PathFilter, sessionOpt: Option[SparkSession], ignoreMissingFiles: Boolean, + ignoreLocality: Boolean, isRootPath: Boolean): Seq[FileStatus] = { logTrace(s"Listing $path") val fs = path.getFileSystem(hadoopConf) @@ -299,7 +313,7 @@ object InMemoryFileIndex extends Logging { // to retrieve the file status with the file block location. The reason to still fallback // to listStatus is because the default implementation would potentially throw a // FileNotFoundException which is better handled by doing the lookups manually below. - case _: DistributedFileSystem => + case _: DistributedFileSystem if !ignoreLocality => val remoteIter = fs.listLocatedStatus(path) new Iterator[LocatedFileStatus]() { def next(): LocatedFileStatus = remoteIter.next @@ -353,6 +367,7 @@ object InMemoryFileIndex extends Logging { filter, sessionOpt, ignoreMissingFiles = ignoreMissingFiles, + ignoreLocality = ignoreLocality, isRootPath = false) } } @@ -376,7 +391,7 @@ object InMemoryFileIndex extends Logging { // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not // be a big deal since we always use to `bulkListLeafFiles` when the number of // paths exceeds threshold. - case f => + case f if !ignoreLocality => // The other constructor of LocatedFileStatus will call FileStatus.getPermission(), // which is very slow on some file system (RawLocalFileSystem, which is launch a // subprocess and parse the stdout). 
@@ -400,6 +415,8 @@ object InMemoryFileIndex extends Logging { missingFiles += f.getPath.toString None } + + case f => Some(f) } if (missingFiles.nonEmpty) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index d43fa3893df1d..f11972115e09f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.util.SchemaUtils @@ -60,6 +61,21 @@ case class InsertIntoHadoopFsRelationCommand( extends DataWritingCommand { import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName + private lazy val parameters = CaseInsensitiveMap(options) + + private[sql] lazy val dynamicPartitionOverwrite: Boolean = { + val partitionOverwriteMode = parameters.get("partitionOverwriteMode") + // scalastyle:off caselocale + .map(mode => PartitionOverwriteMode.withName(mode.toUpperCase)) + // scalastyle:on caselocale + .getOrElse(SQLConf.get.partitionOverwriteMode) + val enableDynamicOverwrite = partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC + // This config only makes sense when we are overwriting a partitioned dataset with dynamic + // partition columns. 
+ enableDynamicOverwrite && mode == SaveMode.Overwrite && + staticPartitions.size < partitionColumns.length + } + override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { // Most formats don't do well with duplicate columns, so lets not allow that SchemaUtils.checkColumnNameDuplication( @@ -90,46 +106,36 @@ case class InsertIntoHadoopFsRelationCommand( fs, catalogTable.get, qualifiedOutputPath, matchingPartitions) } - val pathExists = fs.exists(qualifiedOutputPath) - - val parameters = CaseInsensitiveMap(options) - - val partitionOverwriteMode = parameters.get("partitionOverwriteMode") - // scalastyle:off caselocale - .map(mode => PartitionOverwriteMode.withName(mode.toUpperCase)) - // scalastyle:on caselocale - .getOrElse(sparkSession.sessionState.conf.partitionOverwriteMode) - val enableDynamicOverwrite = partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC - // This config only makes sense when we are overwriting a partitioned dataset with dynamic - // partition columns. - val dynamicPartitionOverwrite = enableDynamicOverwrite && mode == SaveMode.Overwrite && - staticPartitions.size < partitionColumns.length - val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, outputPath = outputPath.toString, dynamicPartitionOverwrite = dynamicPartitionOverwrite) - val doInsertion = (mode, pathExists) match { - case (SaveMode.ErrorIfExists, true) => - throw new AnalysisException(s"path $qualifiedOutputPath already exists.") - case (SaveMode.Overwrite, true) => - if (ifPartitionNotExists && matchingPartitions.nonEmpty) { - false - } else if (dynamicPartitionOverwrite) { - // For dynamic partition overwrite, do not delete partition directories ahead. 
- true - } else { - deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) + val doInsertion = if (mode == SaveMode.Append) { + true + } else { + val pathExists = fs.exists(qualifiedOutputPath) + (mode, pathExists) match { + case (SaveMode.ErrorIfExists, true) => + throw new AnalysisException(s"path $qualifiedOutputPath already exists.") + case (SaveMode.Overwrite, true) => + if (ifPartitionNotExists && matchingPartitions.nonEmpty) { + false + } else if (dynamicPartitionOverwrite) { + // For dynamic partition overwrite, do not delete partition directories ahead. + true + } else { + deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) + true + } + case (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => true - } - case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) => - true - case (SaveMode.Ignore, exists) => - !exists - case (s, exists) => - throw new IllegalStateException(s"unsupported save mode $s ($exists)") + case (SaveMode.Ignore, exists) => + !exists + case (s, exists) => + throw new IllegalStateException(s"unsupported save mode $s ($exists)") + } } if (doInsertion) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index 35bda5682fda1..33a3486bf6f67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -41,7 +41,7 @@ case class LogicalRelation( override def computeStats(): Statistics = { catalogTable - .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled))) + .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled || conf.planStatsEnabled))) .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes)) } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala index 3adec2f790730..2e09c729529a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala @@ -171,7 +171,7 @@ abstract class PartitioningAwareFileIndex( if (partitionPruningPredicates.nonEmpty) { val predicate = partitionPruningPredicates.reduce(expressions.And) - val boundPredicate = InterpretedPredicate.create(predicate.transform { + val boundPredicate = Predicate.createInterpreted(predicate.transform { case a: AttributeReference => val index = partitionColumns.indexWhere(a.name == _.name) BoundReference(index, partitionColumns(index).dataType, nullable = true) @@ -221,7 +221,15 @@ abstract class PartitioningAwareFileIndex( if (!fs.isDirectory(userDefinedBasePath)) { throw new IllegalArgumentException(s"Option '$BASE_PATH_PARAM' must be a directory") } - Set(fs.makeQualified(userDefinedBasePath)) + val qualifiedBasePath = fs.makeQualified(userDefinedBasePath) + val qualifiedBasePathStr = qualifiedBasePath.toString + rootPaths + .find(!fs.makeQualified(_).toString.startsWith(qualifiedBasePathStr)) + .foreach { rp => + throw new IllegalArgumentException( + s"Wrong basePath $userDefinedBasePath for the root path: $rp") + } + Set(qualifiedBasePath) case None => rootPaths.map { path => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 1e47d53b7e976..fdad43b23c5aa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -130,7 +130,7 @@ object PartitioningUtils { Map.empty[String, String] } - val dateFormatter = DateFormatter() + val dateFormatter = DateFormatter(zoneId) val timestampFormatter = TimestampFormatter(timestampPartitionPattern, zoneId) // First, we need to parse every partition's path and see if we can find partition values. val (partitionValues, optDiscoveredBasePaths) = paths.map { path => @@ -492,7 +492,7 @@ object PartitioningUtils { // We need to check that we can cast the raw string since we later can use Cast to get // the partition values with the right DataType (see // org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex.inferPartitioning) - val dateValue = Cast(Literal(raw), DateType).eval() + val dateValue = Cast(Literal(raw), DateType, Some(zoneId.getId)).eval() // Disallow DateType if the cast returned null require(dateValue != null) Literal.create(dateValue, DateType) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala index 927e77a53bf47..a7129fb14d1a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala @@ -17,13 +17,61 @@ package org.apache.spark.sql.execution.datasources +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.CatalogStatistics import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule +import 
org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, FileScan} +import org.apache.spark.sql.types.StructType +/** + * Prune the partitions of file source based table using partition filters. Currently, this rule + * is applied to [[HadoopFsRelation]] with [[CatalogFileIndex]] and [[DataSourceV2ScanRelation]] + * with [[FileScan]]. + * + * For [[HadoopFsRelation]], the location will be replaced by pruned file index, and corresponding + * statistics will be updated. And the partition filters will be kept in the filters of returned + * logical plan. + * + * For [[DataSourceV2ScanRelation]], both partition filters and data filters will be added to + * its underlying [[FileScan]]. And the partition filters will be removed in the filters of + * returned logical plan. + */ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { + + private def getPartitionKeyFiltersAndDataFilters( + sparkSession: SparkSession, + relation: LeafNode, + partitionSchema: StructType, + filters: Seq[Expression], + output: Seq[AttributeReference]): (ExpressionSet, Seq[Expression]) = { + val normalizedFilters = DataSourceStrategy.normalizeExprs( + filters.filter(f => f.deterministic && !SubqueryExpression.hasSubquery(f)), output) + val partitionColumns = + relation.resolve(partitionSchema, sparkSession.sessionState.analyzer.resolver) + val partitionSet = AttributeSet(partitionColumns) + val (partitionFilters, dataFilters) = normalizedFilters.partition(f => + f.references.subsetOf(partitionSet) + ) + + (ExpressionSet(partitionFilters), dataFilters) + } + + private def rebuildPhysicalOperation( + projects: Seq[NamedExpression], + filters: Seq[Expression], + relation: LeafNode): Project = { + val withFilter = if (filters.nonEmpty) { + val filterExpression = filters.reduceLeft(And) + Filter(filterExpression, relation) + } else { + relation + } + Project(projects, withFilter) + } + override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { case op @ 
PhysicalOperation(projects, filters, logicalRelation @ @@ -39,31 +87,37 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] { _, _)) if filters.nonEmpty && fsRelation.partitionSchemaOption.isDefined => - val normalizedFilters = DataSourceStrategy.normalizeFilters( - filters.filterNot(SubqueryExpression.hasSubquery), logicalRelation.output) - - val sparkSession = fsRelation.sparkSession - val partitionColumns = - logicalRelation.resolve( - partitionSchema, sparkSession.sessionState.analyzer.resolver) - val partitionSet = AttributeSet(partitionColumns) - val partitionKeyFilters = ExpressionSet(normalizedFilters.filter { f => - f.references.subsetOf(partitionSet) && f.find(_.isInstanceOf[SubqueryExpression]).isEmpty - }) - + val (partitionKeyFilters, _) = getPartitionKeyFiltersAndDataFilters( + fsRelation.sparkSession, logicalRelation, partitionSchema, filters, logicalRelation.output) if (partitionKeyFilters.nonEmpty) { val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq) val prunedFsRelation = - fsRelation.copy(location = prunedFileIndex)(sparkSession) + fsRelation.copy(location = prunedFileIndex)(fsRelation.sparkSession) // Change table stats based on the sizeInBytes of pruned files val withStats = logicalRelation.catalogTable.map(_.copy( stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes))))) val prunedLogicalRelation = logicalRelation.copy( relation = prunedFsRelation, catalogTable = withStats) // Keep partition-pruning predicates so that they are visible in physical planning - val filterExpression = filters.reduceLeft(And) - val filter = Filter(filterExpression, prunedLogicalRelation) - Project(projects, filter) + rebuildPhysicalOperation(projects, filters, prunedLogicalRelation) + } else { + op + } + + case op @ PhysicalOperation(projects, filters, + v2Relation @ DataSourceV2ScanRelation(_, scan: FileScan, output)) + if filters.nonEmpty && scan.readDataSchema.nonEmpty => + val 
(partitionKeyFilters, dataFilters) = + getPartitionKeyFiltersAndDataFilters(scan.sparkSession, v2Relation, + scan.readPartitionSchema, filters, output) + // The dataFilters are pushed down only once + if (partitionKeyFilters.nonEmpty || (dataFilters.nonEmpty && scan.dataFilters.isEmpty)) { + val prunedV2Relation = + v2Relation.copy(scan = scan.withFilters(partitionKeyFilters.toSeq, dataFilters)) + // The pushed down partition filters don't need to be reevaluated. + val afterScanFilters = + ExpressionSet(filters) -- partitionKeyFilters.filter(_.references.nonEmpty) + rebuildPhysicalOperation(projects, afterScanFilters.toSeq, prunedV2Relation) } else { op } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala index c2211cccb501c..61e0154a0ffe8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SchemaPruning.scala @@ -17,16 +17,12 @@ package org.apache.spark.sql.execution.datasources -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical.{Filter, LeafNode, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, FileTable} -import org.apache.spark.sql.execution.datasources.v2.orc.OrcTable -import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructField, StructType} @@ -58,21 +54,6 @@ object 
SchemaPruning extends Rule[LogicalPlan] { hadoopFsRelation.copy(dataSchema = prunedDataSchema)(hadoopFsRelation.sparkSession) buildPrunedRelation(l, prunedHadoopRelation) }).getOrElse(op) - - case op @ PhysicalOperation(projects, filters, - d @ DataSourceV2Relation(table: FileTable, output, _)) if canPruneTable(table) => - - prunePhysicalColumns(output, projects, filters, table.dataSchema, - prunedDataSchema => { - val prunedFileTable = table match { - case o: OrcTable => o.copy(userSpecifiedSchema = Some(prunedDataSchema)) - case p: ParquetTable => p.copy(userSpecifiedSchema = Some(prunedDataSchema)) - case _ => - val message = s"${table.formatName} data source doesn't support schema pruning." - throw new AnalysisException(message) - } - buildPrunedRelationV2(d, prunedFileTable) - }).getOrElse(op) } /** @@ -119,12 +100,6 @@ object SchemaPruning extends Rule[LogicalPlan] { fsRelation.fileFormat.isInstanceOf[ParquetFileFormat] || fsRelation.fileFormat.isInstanceOf[OrcFileFormat] - /** - * Checks to see if the given [[FileTable]] can be pruned. Currently we support ORC v2. - */ - private def canPruneTable(table: FileTable) = - table.isInstanceOf[OrcTable] || table.isInstanceOf[ParquetTable] - /** * Normalizes the names of the attribute references in the given projects and filters to reflect * the names in the given logical relation. This makes it possible to compare attributes and @@ -191,17 +166,6 @@ object SchemaPruning extends Rule[LogicalPlan] { outputRelation.copy(relation = prunedBaseRelation, output = prunedOutput) } - /** - * Builds a pruned data source V2 relation from the output of the relation and the schema - * of the pruned [[FileTable]]. 
- */ - private def buildPrunedRelationV2( - outputRelation: DataSourceV2Relation, - prunedFileTable: FileTable) = { - val prunedOutput = getPrunedOutput(outputRelation.output, prunedFileTable.schema) - outputRelation.copy(table = prunedFileTable, output = prunedOutput) - } - // Prune the given output to make it consistent with `requiredSchema`. private def getPrunedOutput( output: Seq[AttributeReference], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 8abc6fcacd4c5..cbf9d2bac7ceb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -17,16 +17,13 @@ package org.apache.spark.sql.execution.datasources.csv -import java.nio.charset.Charset - import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce._ -import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.csv.{CSVHeaderChecker, CSVOptions, UnivocityGenerator, UnivocityParser} +import org.apache.spark.sql.catalyst.csv.{CSVHeaderChecker, CSVOptions, UnivocityParser} import org.apache.spark.sql.catalyst.expressions.ExprUtils import org.apache.spark.sql.catalyst.util.CompressionCodecs import org.apache.spark.sql.execution.datasources._ @@ -134,7 +131,11 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val actualRequiredSchema = StructType( requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) - val parser = new UnivocityParser(actualDataSchema, actualRequiredSchema, parsedOptions) + val parser = new 
UnivocityParser( + actualDataSchema, + actualRequiredSchema, + parsedOptions, + filters) val schema = if (columnPruning) actualRequiredSchema else actualDataSchema val isStartOfFile = file.start == 0 val headerChecker = new CSVHeaderChecker( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala index 21fabac472f4b..d8b52c503ad34 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala @@ -33,11 +33,12 @@ object CSVUtils { // with the one below, `filterCommentAndEmpty` but execution path is different. One of them // might have to be removed in the near future if possible. import lines.sqlContext.implicits._ - val nonEmptyLines = lines.filter(length(trim($"value")) > 0) + val aliased = lines.toDF("value") + val nonEmptyLines = aliased.filter(length(trim($"value")) > 0) if (options.isCommentSet) { - nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)) + nonEmptyLines.filter(!$"value".startsWith(options.comment.toString)).as[String] } else { - nonEmptyLines + nonEmptyLines.as[String] } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala index 3ff36bfde3cca..2b549536ae486 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala @@ -33,25 +33,17 @@ class CsvOutputWriter( context: TaskAttemptContext, params: CSVOptions) extends OutputWriter with Logging { - private var univocityGenerator: Option[UnivocityGenerator] = None + private val charset = Charset.forName(params.charset) + + private val writer = 
CodecStreams.createOutputStreamWriter(context, new Path(path), charset) + + private val gen = new UnivocityGenerator(dataSchema, writer, params) if (params.headerFlag) { - val gen = getGen() gen.writeHeaders() } - private def getGen(): UnivocityGenerator = univocityGenerator.getOrElse { - val charset = Charset.forName(params.charset) - val os = CodecStreams.createOutputStreamWriter(context, new Path(path), charset) - val newGen = new UnivocityGenerator(dataSchema, os, params) - univocityGenerator = Some(newGen) - newGen - } - - override def write(row: InternalRow): Unit = { - val gen = getGen() - gen.write(row) - } + override def write(row: InternalRow): Unit = gen.write(row) - override def close(): Unit = univocityGenerator.foreach(_.close()) + override def close(): Unit = gen.close() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala index d184f3cb71b1a..222ef1145b922 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCOptions.scala @@ -147,14 +147,7 @@ class JDBCOptions( """.stripMargin ) - val fetchSize = { - val size = parameters.getOrElse(JDBC_BATCH_FETCH_SIZE, "0").toInt - require(size >= 0, - s"Invalid value `${size.toString}` for parameter " + - s"`$JDBC_BATCH_FETCH_SIZE`. The minimum value is 0. 
When the value is 0, " + - "the JDBC driver ignores the value and does the estimates.") - size - } + val fetchSize = parameters.getOrElse(JDBC_BATCH_FETCH_SIZE, "0").toInt // ------------------------------------------------------------ // Optional parameters only for writing @@ -184,6 +177,10 @@ class JDBCOptions( case "READ_COMMITTED" => Connection.TRANSACTION_READ_COMMITTED case "REPEATABLE_READ" => Connection.TRANSACTION_REPEATABLE_READ case "SERIALIZABLE" => Connection.TRANSACTION_SERIALIZABLE + case other => throw new IllegalArgumentException( + s"Invalid value `$other` for parameter `$JDBC_TXN_ISOLATION_LEVEL`. This can be " + + "`NONE`, `READ_UNCOMMITTED`, `READ_COMMITTED`, `REPEATABLE_READ` or `SERIALIZABLE`." + ) } // An option to execute custom SQL before fetching data from the remote DB val sessionInitStatement = parameters.get(JDBC_SESSION_INIT_STATEMENT) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala index 16b493892e3be..e25ce53941ff6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRDD.scala @@ -231,7 +231,7 @@ private[jdbc] class JDBCRDD( var stmt: PreparedStatement = null var conn: Connection = null - def close() { + def close(): Unit = { if (closed) return try { if (null != rs) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala index 3cd5cb1647923..f5a474ddf3904 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JDBCRelation.scala @@ -186,7 +186,7 @@ private[sql] object JDBCRelation extends Logging { } 
columnType match { case _: NumericType => value.toLong - case DateType => parse(stringToDate).toLong + case DateType => parse(stringToDate(_, getZoneId(timeZoneId))).toLong case TimestampType => parse(stringToTimestamp(_, getZoneId(timeZoneId))) } } @@ -197,7 +197,9 @@ private[sql] object JDBCRelation extends Logging { timeZoneId: String): String = { def dateTimeToString(): String = { val dateTimeStr = columnType match { - case DateType => DateFormatter().format(value.toInt) + case DateType => + val dateFormatter = DateFormatter(DateTimeUtils.getZoneId(timeZoneId)) + dateFormatter.format(value.toInt) case TimestampType => val timestampFormatter = TimestampFormatter.getFractionFormatter( DateTimeUtils.getZoneId(timeZoneId)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 86a27b5afc250..c1e1aed83bae5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -445,7 +445,7 @@ object JdbcUtils extends Logging { case ByteType => (rs: ResultSet, row: InternalRow, pos: Int) => - row.update(pos, rs.getByte(pos + 1)) + row.setByte(pos, rs.getByte(pos + 1)) case StringType => (rs: ResultSet, row: InternalRow, pos: Int) => @@ -605,6 +605,13 @@ object JdbcUtils extends Logging { * implementation changes elsewhere might easily render such a closure * non-Serializable. Instead, we explicitly close over all variables that * are used. + * + * Note that this method records task output metrics. It assumes the method is + * running in a task. For now, we only records the number of rows being written + * because there's no good way to measure the total bytes being written. 
Only + * effective outputs are taken into account: for example, metric will not be updated + * if it supports transaction and transaction is rolled back, but metric will be + * updated even with error if it doesn't support transaction, as there're dirty outputs. */ def savePartition( getConnection: () => Connection, @@ -615,7 +622,9 @@ object JdbcUtils extends Logging { batchSize: Int, dialect: JdbcDialect, isolationLevel: Int, - options: JDBCOptions): Iterator[Byte] = { + options: JDBCOptions): Unit = { + val outMetrics = TaskContext.get().taskMetrics().outputMetrics + val conn = getConnection() var committed = false @@ -643,7 +652,7 @@ object JdbcUtils extends Logging { } } val supportsTransactions = finalIsolationLevel != Connection.TRANSACTION_NONE - + var totalRowCount = 0L try { if (supportsTransactions) { conn.setAutoCommit(false) // Everything in the same db transaction. @@ -672,6 +681,7 @@ object JdbcUtils extends Logging { } stmt.addBatch() rowCount += 1 + totalRowCount += 1 if (rowCount % batchSize == 0) { stmt.executeBatch() rowCount = 0 @@ -687,7 +697,6 @@ object JdbcUtils extends Logging { conn.commit() } committed = true - Iterator.empty } catch { case e: SQLException => val cause = e.getNextException @@ -715,9 +724,13 @@ object JdbcUtils extends Logging { // tell the user about another problem. if (supportsTransactions) { conn.rollback() + } else { + outMetrics.setRecordsWritten(totalRowCount) } conn.close() } else { + outMetrics.setRecordsWritten(totalRowCount) + // The stage must succeed. We cannot propagate any exception close() might throw. 
try { conn.close() @@ -840,10 +853,10 @@ object JdbcUtils extends Logging { case Some(n) if n < df.rdd.getNumPartitions => df.coalesce(n) case _ => df } - repartitionedDF.rdd.foreachPartition(iterator => savePartition( + repartitionedDF.rdd.foreachPartition { iterator => savePartition( getConnection, table, iterator, rddSchema, insertStmt, batchSize, dialect, isolationLevel, options) - ) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala index b3cd570cfb1cf..dfd84e344eb2a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala @@ -44,20 +44,18 @@ class JsonOutputWriter( " which can be read back by Spark only if multiLine is enabled.") } - private var jacksonGenerator: Option[JacksonGenerator] = None + private val writer = CodecStreams.createOutputStreamWriter(context, new Path(path), encoding) - override def write(row: InternalRow): Unit = { - val gen = jacksonGenerator.getOrElse { - val os = CodecStreams.createOutputStreamWriter(context, new Path(path), encoding) - // create the Generator without separator inserted between 2 records - val newGen = new JacksonGenerator(dataSchema, os, options) - jacksonGenerator = Some(newGen) - newGen - } + // create the Generator without separator inserted between 2 records + private[this] val gen = new JacksonGenerator(dataSchema, writer, options) + override def write(row: InternalRow): Unit = { gen.write(row) gen.writeLineEnding() } - override def close(): Unit = jacksonGenerator.foreach(_.close()) + override def close(): Unit = { + gen.close() + writer.close() + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala index e4f9e49c4dd28..4fad0a2484cde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/noop/NoopDataSource.scala @@ -22,10 +22,11 @@ import java.util import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -33,13 +34,13 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap * This is no-op datasource. It does not do anything besides consuming its input. * This can be useful for benchmarking or to cache data without any additional overhead. 
*/ -class NoopDataSource extends TableProvider with DataSourceRegister { +class NoopDataSource extends SimpleTableProvider with DataSourceRegister { override def shortName(): String = "noop" override def getTable(options: CaseInsensitiveStringMap): Table = NoopTable } private[noop] object NoopTable extends Table with SupportsWrite { - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = NoopWriteBuilder + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = NoopWriteBuilder override def name(): String = "noop-table" override def schema(): StructType = new StructType() override def capabilities(): util.Set[TableCapability] = { @@ -58,7 +59,8 @@ private[noop] object NoopWriteBuilder extends WriteBuilder with SupportsTruncate } private[noop] object NoopBatchWrite extends BatchWrite { - override def createBatchWriterFactory(): DataWriterFactory = NoopWriterFactory + override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = + NoopWriterFactory override def commit(messages: Array[WriterCommitMessage]): Unit = {} override def abort(messages: Array[WriterCommitMessage]): Unit = {} } @@ -71,11 +73,12 @@ private[noop] object NoopWriter extends DataWriter[InternalRow] { override def write(record: InternalRow): Unit = {} override def commit(): WriterCommitMessage = null override def abort(): Unit = {} + override def close(): Unit = {} } private[noop] object NoopStreamingWrite extends StreamingWrite { - override def createStreamingWriterFactory(): StreamingDataWriterFactory = - NoopStreamingDataWriterFactory + override def createStreamingWriterFactory( + info: PhysicalWriteInfo): StreamingDataWriterFactory = NoopStreamingDataWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index f7c12598da209..fd791ce7c5e19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl -import org.apache.orc._ +import org.apache.orc.{OrcUtils => _, _} import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA} import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce._ @@ -38,10 +38,9 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ -import org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.{SerializableConfiguration, Utils} private[sql] object OrcFileFormat { private def checkFieldName(name: String): Unit = { @@ -180,10 +179,11 @@ class OrcFileFormat val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(filePath, readerOptions) - - val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, requiredSchema, reader, conf) + val requestedColIdsOrEmptyFile = + Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => + OrcUtils.requestedColumnIds( + isCaseSensitive, dataSchema, requiredSchema, reader, conf) + } if (requestedColIdsOrEmptyFile.isEmpty) { Iterator.empty diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index 12d4244e19812..eea9b2a8f9613 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.quoteIdentifier import org.apache.spark.sql.execution.datasources.SchemaMergeUtils import org.apache.spark.sql.types._ -import org.apache.spark.util.{SerializableConfiguration, ThreadUtils} +import org.apache.spark.util.{ThreadUtils, Utils} object OrcUtils extends Logging { @@ -62,8 +62,9 @@ object OrcUtils extends Logging { val fs = file.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) try { - val reader = OrcFile.createReader(file, readerOptions) - val schema = reader.getSchema + val schema = Utils.tryWithResource(OrcFile.createReader(file, readerOptions)) { reader => + reader.getSchema + } if (schema.getFieldNames.size == 0) { None } else { @@ -162,6 +163,7 @@ object OrcUtils extends Logging { if (matchedOrcFields.size > 1) { // Need to fail if there is ambiguity, i.e. more than one field is matched. 
val matchedOrcFieldsString = matchedOrcFields.mkString("[", ", ", "]") + reader.close() throw new RuntimeException(s"""Found duplicate field(s) "$requiredFieldName": """ + s"$matchedOrcFieldsString in case-insensitive mode") } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 815b62dfbf898..29dbd8dfbca8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -27,7 +27,6 @@ import scala.util.{Failure, Try} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.mapreduce._ -import org.apache.hadoop.mapreduce.lib.input.FileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.filter2.predicate.FilterApi @@ -296,7 +295,7 @@ class ParquetFileFormat val convertTz = if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getTimeZone(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + Some(DateTimeUtils.getZoneId(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None } @@ -328,32 +327,28 @@ class ParquetFileFormat iter.asInstanceOf[Iterator[InternalRow]] } else { logDebug(s"Falling back to parquet-mr") - // ParquetRecordReader returns UnsafeRow + // ParquetRecordReader returns InternalRow val readSupport = new ParquetReadSupport(convertTz, enableVectorizedReader = false) val reader = if (pushed.isDefined && enableRecordFilter) { val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[UnsafeRow](readSupport, parquetFilter) + new ParquetRecordReader[InternalRow](readSupport, parquetFilter) } else { - new 
ParquetRecordReader[UnsafeRow](readSupport) + new ParquetRecordReader[InternalRow](readSupport) } - val iter = new RecordReaderIterator(reader) + val iter = new RecordReaderIterator[InternalRow](reader) // SPARK-23457 Register a task completion listener before `initialization`. taskContext.foreach(_.addTaskCompletionListener[Unit](_ => iter.close())) reader.initialize(split, hadoopAttemptContext) val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val joinedRow = new JoinedRow() - val appendPartitionColumns = GenerateUnsafeProjection.generate(fullSchema, fullSchema) + val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) - // This is a horrible erasure hack... if we type the iterator above, then it actually check - // the type in next() and we get a class cast exception. If we make that function return - // Object, then we can defer the cast until later! if (partitionSchema.length == 0) { // There is no partition columns - iter.asInstanceOf[Iterator[InternalRow]] + iter.map(unsafeProjection) } else { - iter.asInstanceOf[Iterator[InternalRow]] - .map(d => appendPartitionColumns(joinedRow(d, file.partitionValues))) + val joinedRow = new JoinedRow() + iter.map(d => unsafeProjection(joinedRow(d, file.partitionValues))) } } } @@ -403,7 +398,7 @@ object ParquetFileFormat extends Logging { logInfo( "Serialized Spark schema in Parquet key-value metadata is not in JSON format, " + "falling back to the deprecated DataType.fromCaseClassString parser.") - LegacyTypeStringParser.parse(serializedSchema.get) + LegacyTypeStringParser.parseString(serializedSchema.get) } .recover { case cause: Throwable => logWarning( @@ -514,7 +509,7 @@ object ParquetFileFormat extends Logging { logInfo( "Serialized Spark schema in Parquet key-value metadata is not in JSON format, " + "falling back to the deprecated DataType.fromCaseClassString parser.") - LegacyTypeStringParser.parse(schemaString).asInstanceOf[StructType] + 
LegacyTypeStringParser.parseString(schemaString).asInstanceOf[StructType] }.recoverWith { case cause: Throwable => logWarning( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index b9b86adb438e6..948a120e0d6e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -591,7 +591,7 @@ class ParquetFilters( case sources.StringStartsWith(name, prefix) if pushDownStartWith && canMakeFilterOn(name, prefix) => Option(prefix).map { v => - FilterApi.userDefined(binaryColumn(name), + FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldName), new UserDefinedPredicate[Binary] with Serializable { private val strToBinary = Binary.fromReusedByteArray(v.getBytes) private val size = strToBinary.length diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 2c7231d2c3e0a..c05ecf16311ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.util.{Locale, Map => JMap, TimeZone} +import java.time.ZoneId +import java.util.{Locale, Map => JMap} import scala.collection.JavaConverters._ @@ -29,13 +30,13 @@ import org.apache.parquet.schema._ import org.apache.parquet.schema.Type.Repetition import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.InternalRow import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** * A Parquet [[ReadSupport]] implementation for reading Parquet records as Catalyst - * [[UnsafeRow]]s. + * [[InternalRow]]s. * * The API interface of [[ReadSupport]] is a little bit over complicated because of historical * reasons. In older versions of parquet-mr (say 1.6.0rc3 and prior), [[ReadSupport]] need to be @@ -49,9 +50,9 @@ import org.apache.spark.sql.types._ * Due to this reason, we no longer rely on [[ReadContext]] to pass requested schema from [[init()]] * to [[prepareForRead()]], but use a private `var` for simplicity. */ -class ParquetReadSupport(val convertTz: Option[TimeZone], +class ParquetReadSupport(val convertTz: Option[ZoneId], enableVectorizedReader: Boolean) - extends ReadSupport[UnsafeRow] with Logging { + extends ReadSupport[InternalRow] with Logging { private var catalystRequestedSchema: StructType = _ def this() { @@ -114,13 +115,13 @@ class ParquetReadSupport(val convertTz: Option[TimeZone], /** * Called on executor side after [[init()]], before instantiating actual Parquet record readers. * Responsible for instantiating [[RecordMaterializer]], which is used for converting Parquet - * records to Catalyst [[UnsafeRow]]s. + * records to Catalyst [[InternalRow]]s. 
*/ override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, - readContext: ReadContext): RecordMaterializer[UnsafeRow] = { + readContext: ReadContext): RecordMaterializer[InternalRow] = { val parquetRequestedSchema = readContext.getRequestedSchema new ParquetRecordMaterializer( parquetRequestedSchema, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala index b2459dd0e8bba..5622169df1281 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala @@ -17,12 +17,12 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.util.TimeZone +import java.time.ZoneId import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer} import org.apache.parquet.schema.MessageType -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType /** @@ -36,13 +36,13 @@ private[parquet] class ParquetRecordMaterializer( parquetSchema: MessageType, catalystSchema: StructType, schemaConverter: ParquetToSparkSchemaConverter, - convertTz: Option[TimeZone]) - extends RecordMaterializer[UnsafeRow] { + convertTz: Option[ZoneId]) + extends RecordMaterializer[InternalRow] { private val rootConverter = new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, convertTz, NoopUpdater) - override def getCurrentRecord: UnsafeRow = rootConverter.currentRecord + override def getCurrentRecord: InternalRow = rootConverter.currentRecord override def getRootConverter: GroupConverter = rootConverter } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index b772b6b77d1ce..850adae8a6b95 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal, BigInteger} import java.nio.ByteOrder -import java.util.TimeZone +import java.time.{ZoneId, ZoneOffset} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -125,7 +125,7 @@ private[parquet] class ParquetRowConverter( schemaConverter: ParquetToSparkSchemaConverter, parquetType: GroupType, catalystType: StructType, - convertTz: Option[TimeZone], + convertTz: Option[ZoneId], updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) with Logging { @@ -154,8 +154,6 @@ private[parquet] class ParquetRowConverter( |${catalystType.prettyJson} """.stripMargin) - private val UTC = DateTimeUtils.TimeZoneUTC - /** * Updater used together with field converters within a [[ParquetRowConverter]]. It propagates * converted filed values to the `ordinal`-th cell in `currentRow`. @@ -171,17 +169,15 @@ private[parquet] class ParquetRowConverter( override def setFloat(value: Float): Unit = row.setFloat(ordinal, value) } - private val currentRow = new SpecificInternalRow(catalystType.map(_.dataType)) - - private val unsafeProjection = UnsafeProjection.create(catalystType) + private[this] val currentRow = new SpecificInternalRow(catalystType.map(_.dataType)) /** - * The [[UnsafeRow]] converted from an entire Parquet record. + * The [[InternalRow]] converted from an entire Parquet record. 
*/ - def currentRecord: UnsafeRow = unsafeProjection(currentRow) + def currentRecord: InternalRow = currentRow // Converters for each field. - private val fieldConverters: Array[Converter with HasParentContainerUpdater] = { + private[this] val fieldConverters: Array[Converter with HasParentContainerUpdater] = { parquetType.getFields.asScala.map { parquetField => val fieldIndex = catalystType.fieldIndex(parquetField.getName) val catalystField = catalystType(fieldIndex) @@ -190,12 +186,15 @@ private[parquet] class ParquetRowConverter( }.toArray } + // Updaters for each field. + private[this] val fieldUpdaters: Array[ParentContainerUpdater] = fieldConverters.map(_.updater) + override def getConverter(fieldIndex: Int): Converter = fieldConverters(fieldIndex) override def end(): Unit = { var i = 0 - while (i < fieldConverters.length) { - fieldConverters(i).updater.end() + while (i < fieldUpdaters.length) { + fieldUpdaters(i).end() i += 1 } updater.set(currentRow) @@ -203,13 +202,14 @@ private[parquet] class ParquetRowConverter( override def start(): Unit = { var i = 0 - while (i < currentRow.numFields) { + val numFields = currentRow.numFields + while (i < numFields) { currentRow.setNullAt(i) i += 1 } i = 0 - while (i < fieldConverters.length) { - fieldConverters(i).updater.start() + while (i < fieldUpdaters.length) { + fieldUpdaters(i).start() i += 1 } } @@ -290,7 +290,8 @@ private[parquet] class ParquetRowConverter( val timeOfDayNanos = buf.getLong val julianDay = buf.getInt val rawTime = DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos) - val adjTime = convertTz.map(DateTimeUtils.convertTz(rawTime, _, UTC)).getOrElse(rawTime) + val adjTime = convertTz.map(DateTimeUtils.convertTz(rawTime, _, ZoneOffset.UTC)) + .getOrElse(rawTime) updater.setLong(adjTime) } } @@ -320,10 +321,34 @@ private[parquet] class ParquetRowConverter( new ParquetMapConverter(parquetType.asGroupType(), t, updater) case t: StructType => + val wrappedUpdater = { + // SPARK-30338: avoid 
unnecessary InternalRow copying for nested structs: + // There are two cases to handle here: + // + // 1. Parent container is a map or array: we must make a deep copy of the mutable row + // because this converter may be invoked multiple times per Parquet input record + // (if the map or array contains multiple elements). + // + // 2. Parent container is a struct: we don't need to copy the row here because either: + // + // (a) all ancestors are structs and therefore no copying is required because this + // converter will only be invoked once per Parquet input record, or + // (b) some ancestor is struct that is nested in a map or array and that ancestor's + // converter will perform deep-copying (which will recursively copy this row). + if (updater.isInstanceOf[RowUpdater]) { + // `updater` is a RowUpdater, implying that the parent container is a struct. + updater + } else { + // `updater` is NOT a RowUpdater, implying that the parent container a map or array. + new ParentContainerUpdater { + override def set(value: Any): Unit = { + updater.set(value.asInstanceOf[SpecificInternalRow].copy()) // deep copy + } + } + } + } new ParquetRowConverter( - schemaConverter, parquetType.asGroupType(), t, convertTz, new ParentContainerUpdater { - override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) - }) + schemaConverter, parquetType.asGroupType(), t, convertTz, wrappedUpdater) case t => throw new RuntimeException( @@ -466,9 +491,9 @@ private[parquet] class ParquetRowConverter( updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) { - private var currentArray: ArrayBuffer[Any] = _ + private[this] val currentArray = ArrayBuffer.empty[Any] - private val elementConverter: Converter = { + private[this] val elementConverter: Converter = { val repeatedType = parquetSchema.getType(0) val elementType = catalystSchema.elementType @@ -519,10 +544,7 @@ private[parquet] class ParquetRowConverter( override def end(): Unit = 
updater.set(new GenericArrayData(currentArray.toArray)) - // NOTE: We can't reuse the mutable `ArrayBuffer` here and must instantiate a new buffer for the - // next value. `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored - // in row cells. - override def start(): Unit = currentArray = ArrayBuffer.empty[Any] + override def start(): Unit = currentArray.clear() /** Array element converter */ private final class ElementConverter(parquetType: Type, catalystType: DataType) @@ -530,9 +552,10 @@ private[parquet] class ParquetRowConverter( private var currentElement: Any = _ - private val converter = newConverter(parquetType, catalystType, new ParentContainerUpdater { - override def set(value: Any): Unit = currentElement = value - }) + private[this] val converter = + newConverter(parquetType, catalystType, new ParentContainerUpdater { + override def set(value: Any): Unit = currentElement = value + }) override def getConverter(fieldIndex: Int): Converter = converter @@ -549,10 +572,10 @@ private[parquet] class ParquetRowConverter( updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) { - private var currentKeys: ArrayBuffer[Any] = _ - private var currentValues: ArrayBuffer[Any] = _ + private[this] val currentKeys = ArrayBuffer.empty[Any] + private[this] val currentValues = ArrayBuffer.empty[Any] - private val keyValueConverter = { + private[this] val keyValueConverter = { val repeatedType = parquetType.getType(0).asGroupType() new KeyValueConverter( repeatedType.getType(0), @@ -567,15 +590,15 @@ private[parquet] class ParquetRowConverter( // The parquet map may contains null or duplicated map keys. When it happens, the behavior is // undefined. // TODO (SPARK-26174): disallow it with a config. 
- updater.set(ArrayBasedMapData(currentKeys.toArray, currentValues.toArray)) + updater.set( + new ArrayBasedMapData( + new GenericArrayData(currentKeys.toArray), + new GenericArrayData(currentValues.toArray))) } - // NOTE: We can't reuse the mutable Map here and must instantiate a new `Map` for the next - // value. `Row.copy()` only copies row cells, it doesn't do deep copy to objects stored in row - // cells. override def start(): Unit = { - currentKeys = ArrayBuffer.empty[Any] - currentValues = ArrayBuffer.empty[Any] + currentKeys.clear() + currentValues.clear() } /** Parquet converter for key-value pairs within the map. */ @@ -590,7 +613,7 @@ private[parquet] class ParquetRowConverter( private var currentValue: Any = _ - private val converters = Array( + private[this] val converters = Array( // Converter for keys newConverter(parquetKeyType, catalystKeyType, new ParentContainerUpdater { override def set(value: Any): Unit = currentKey = value @@ -616,10 +639,10 @@ private[parquet] class ParquetRowConverter( } private trait RepeatedConverter { - private var currentArray: ArrayBuffer[Any] = _ + private[this] val currentArray = ArrayBuffer.empty[Any] protected def newArrayUpdater(updater: ParentContainerUpdater) = new ParentContainerUpdater { - override def start(): Unit = currentArray = ArrayBuffer.empty[Any] + override def start(): Unit = currentArray.clear() override def end(): Unit = updater.set(new GenericArrayData(currentArray.toArray)) override def set(value: Any): Unit = currentArray += value } @@ -637,7 +660,7 @@ private[parquet] class ParquetRowConverter( val updater: ParentContainerUpdater = newArrayUpdater(parentUpdater) - private val elementConverter: PrimitiveConverter = + private[this] val elementConverter: PrimitiveConverter = newConverter(parquetType, catalystType, updater).asPrimitiveConverter() override def addBoolean(value: Boolean): Unit = elementConverter.addBoolean(value) @@ -664,7 +687,7 @@ private[parquet] class ParquetRowConverter( val 
updater: ParentContainerUpdater = newArrayUpdater(parentUpdater) - private val elementConverter: GroupConverter = + private[this] val elementConverter: GroupConverter = newConverter(parquetType, catalystType, updater).asGroupConverter() override def getConverter(field: Int): Converter = elementConverter.getConverter(field) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index b507ef1c509dd..95343e2872def 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -20,18 +20,18 @@ package org.apache.spark.sql.execution.datasources import java.util.Locale import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} -import org.apache.spark.sql.catalog.v2.expressions.{FieldReference, RewritableTransform} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast, Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} +import org.apache.spark.sql.catalyst.expressions.{Expression, InputFileBlockLength, InputFileBlockStart, InputFileName, RowOrdering} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.expressions.{FieldReference, RewritableTransform} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.sources.InsertableRelation -import org.apache.spark.sql.types.{ArrayType, AtomicType, StructField, StructType} +import org.apache.spark.sql.types.{AtomicType, StructType} import 
org.apache.spark.sql.util.SchemaUtils /** @@ -189,14 +189,11 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi query } - // SPARK-28730: for V1 data source, we use the "LEGACY" as default store assignment policy. - // TODO: use ANSI store assignment policy by default in SPARK-28495. - val storeAssignmentPolicy = conf.storeAssignmentPolicy.getOrElse(StoreAssignmentPolicy.LEGACY) c.copy( tableDesc = existingTable, query = Some(TableOutputResolver.resolveOutputColumns( tableDesc.qualifiedName, existingTable.schema.toAttributes, newQuery, - byName = true, conf, storeAssignmentPolicy))) + byName = true, conf))) // Here we normalize partition, bucket and sort column names, w.r.t. the case sensitivity // config, and do various checks: @@ -377,19 +374,19 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi } /** - * Preprocess the [[InsertIntoTable]] plan. Throws exception if the number of columns mismatch, or - * specified partition columns are different from the existing partition columns in the target + * Preprocess the [[InsertIntoStatement]] plan. Throws exception if the number of columns mismatch, + * or specified partition columns are different from the existing partition columns in the target * table. It also does data type casting and field renaming, to make sure that the columns to be * inserted have the correct data type and fields have the correct names. 
*/ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { private def preprocess( - insert: InsertIntoTable, + insert: InsertIntoStatement, tblName: String, - partColNames: Seq[String]): InsertIntoTable = { + partColNames: Seq[String]): InsertIntoStatement = { val normalizedPartSpec = PartitioningUtils.normalizePartitionSpec( - insert.partition, partColNames, tblName, conf.resolver) + insert.partitionSpec, partColNames, tblName, conf.resolver) val staticPartCols = normalizedPartSpec.filter(_._2.isDefined).keySet val expectedColumns = insert.table.output.filterNot(a => staticPartCols.contains(a.name)) @@ -402,11 +399,8 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { s"including ${staticPartCols.size} partition column(s) having constant value(s).") } - // SPARK-28730: for V1 data source, we use the "LEGACY" as default store assignment policy. - // TODO: use ANSI store assignment policy by default in SPARK-28495. - val storeAssignmentPolicy = conf.storeAssignmentPolicy.getOrElse(StoreAssignmentPolicy.LEGACY) val newQuery = TableOutputResolver.resolveOutputColumns( - tblName, expectedColumns, insert.query, byName = false, conf, storeAssignmentPolicy) + tblName, expectedColumns, insert.query, byName = false, conf) if (normalizedPartSpec.nonEmpty) { if (normalizedPartSpec.size != partColNames.length) { throw new AnalysisException( @@ -417,16 +411,16 @@ case class PreprocessTableInsertion(conf: SQLConf) extends Rule[LogicalPlan] { """.stripMargin) } - insert.copy(query = newQuery, partition = normalizedPartSpec) + insert.copy(query = newQuery, partitionSpec = normalizedPartSpec) } else { // All partition columns are dynamic because the InsertIntoTable command does // not explicitly specify partitioning columns. 
- insert.copy(query = newQuery, partition = partColNames.map(_ -> None).toMap) + insert.copy(query = newQuery, partitionSpec = partColNames.map(_ -> None).toMap) } } def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case i @ InsertIntoTable(table, _, query, _, _) if table.resolved && query.resolved => + case i @ InsertIntoStatement(table, _, query, _, _) if table.resolved && query.resolved => table match { case relation: HiveTableRelation => val metadata = relation.tableMeta @@ -503,7 +497,7 @@ object PreWriteCheck extends (LogicalPlan => Unit) { def apply(plan: LogicalPlan): Unit = { plan.foreach { - case InsertIntoTable(l @ LogicalRelation(relation, _, _, _), partition, query, _, _) => + case InsertIntoStatement(l @ LogicalRelation(relation, _, _, _), partition, query, _, _) => // Get all input data source relations of the query. val srcRelations = query.collect { case LogicalRelation(src, _, _, _) => src @@ -525,7 +519,7 @@ object PreWriteCheck extends (LogicalPlan => Unit) { case _ => failAnalysis(s"$relation does not allow insertion.") } - case InsertIntoTable(t, _, _, _, _) + case InsertIntoStatement(t, _, _, _, _) if !t.isInstanceOf[LeafNode] || t.isInstanceOf[Range] || t.isInstanceOf[OneRowRelation] || diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOutputWriter.scala index faf6e573105f2..2b1b81f60ceb4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOutputWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOutputWriter.scala @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.execution.datasources.text -import java.io.OutputStream - import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext @@ -32,23 +30,17 @@ class TextOutputWriter( context: TaskAttemptContext) extends OutputWriter { - private 
var outputStream: Option[OutputStream] = None + private val writer = CodecStreams.createOutputStream(context, new Path(path)) override def write(row: InternalRow): Unit = { - val os = outputStream.getOrElse { - val newStream = CodecStreams.createOutputStream(context, new Path(path)) - outputStream = Some(newStream) - newStream - } - if (!row.isNullAt(0)) { val utf8string = row.getUTF8String(0) - utf8string.writeTo(os) + utf8string.writeTo(writer) } - os.write(lineSeparator) + writer.write(lineSeparator) } override def close(): Unit = { - outputStream.foreach(_.close()) + writer.close() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterNamespaceSetPropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterNamespaceSetPropertiesExec.scala new file mode 100644 index 0000000000000..1eebe4cdb6a86 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterNamespaceSetPropertiesExec.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.catalog.{NamespaceChange, SupportsNamespaces} + +/** + * Physical plan node for setting properties of namespace. + */ +case class AlterNamespaceSetPropertiesExec( + catalog: SupportsNamespaces, + namespace: Seq[String], + props: Map[String, String]) extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + val changes = props.map{ case (k, v) => + NamespaceChange.setProperty(k, v) + }.toSeq + catalog.alterNamespace(namespace.toArray, changes: _*) + Seq.empty + } + + override def output: Seq[Attribute] = Seq.empty +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableExec.scala index a3fa82b12e938..8b2930cca841d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/AlterTableExec.scala @@ -18,11 +18,9 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.SparkException -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog, TableChange} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog, TableChange} /** * Physical plan node for altering a table. 
@@ -30,11 +28,11 @@ import org.apache.spark.sql.execution.LeafExecNode case class AlterTableExec( catalog: TableCatalog, ident: Identifier, - changes: Seq[TableChange]) extends LeafExecNode { + changes: Seq[TableChange]) extends V2CommandExec { override def output: Seq[Attribute] = Seq.empty - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { try { catalog.alterTable(ident, changes: _*) } catch { @@ -42,6 +40,6 @@ case class AlterTableExec( throw new SparkException(s"Unsupported table change: ${e.getMessage}", e) } - sqlContext.sparkContext.parallelize(Seq.empty, 1) + Seq.empty } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala index 0f98d9486bbbf..e4e7887017a1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/BatchScanExec.scala @@ -21,7 +21,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.QueryPlan -import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan} /** * Physical plan node for scanning a batch of data from a data source v2. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ContinuousScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ContinuousScanExec.scala index f54ff608a53e3..dc95d157e40fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ContinuousScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ContinuousScanExec.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.connector.read.{InputPartition, Scan} +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReaderFactory, ContinuousStream, Offset} import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousPartitionReaderFactory, ContinuousStream, Offset} /** * Physical plan node for scanning data from a streaming data source with continuous mode. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala new file mode 100644 index 0000000000000..d5b81d13a7cc4 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateNamespaceExec.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.JavaConverters.mapAsJavaMapConverter + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.NamespaceAlreadyExistsException +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.catalog.SupportsNamespaces +import org.apache.spark.util.Utils + +/** + * Physical plan node for creating a namespace. + */ +case class CreateNamespaceExec( + catalog: SupportsNamespaces, + namespace: Seq[String], + ifNotExists: Boolean, + private var properties: Map[String, String]) + extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.SupportsNamespaces._ + + val ns = namespace.toArray + if (!catalog.namespaceExists(ns)) { + try { + val ownership = + Map(PROP_OWNER -> Utils.getCurrentUserName()) + catalog.createNamespace(ns, (properties ++ ownership).asJava) + } catch { + case _: NamespaceAlreadyExistsException if ifNotExists => + logWarning(s"Namespace ${namespace.quoted} was created concurrently. 
Ignoring.") + } + } else if (!ifNotExists) { + throw new NamespaceAlreadyExistsException(ns) + } + + Seq.empty + } + + override def output: Seq[Attribute] = Seq.empty +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala index f35758bf08c67..511cd8a9a438f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateTableExec.scala @@ -19,13 +19,11 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog} -import org.apache.spark.sql.catalog.v2.expressions.Transform import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType case class CreateTableExec( @@ -34,10 +32,10 @@ case class CreateTableExec( tableSchema: StructType, partitioning: Seq[Transform], tableProperties: Map[String, String], - ignoreIfExists: Boolean) extends LeafExecNode { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + ignoreIfExists: Boolean) extends V2CommandExec { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { if (!catalog.tableExists(identifier)) { try { catalog.createTable(identifier, tableSchema, partitioning.toArray, tableProperties.asJava) @@ -49,7 +47,7 @@ case class CreateTableExec( throw 
new TableAlreadyExistsException(identifier) } - sqlContext.sparkContext.parallelize(Seq.empty, 1) + Seq.empty } override def output: Seq[Attribute] = Seq.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala index 33079d5912506..9211ec25525fa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourcePartitioning.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression} import org.apache.spark.sql.catalyst.plans.physical -import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Partitioning} +import org.apache.spark.sql.connector.read.partitioning.{ClusteredDistribution, Partitioning} /** * An adapter from public data source partitioning to catalyst internal `Partitioning`. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala index f62f7349d1da7..63403b9577237 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala @@ -17,10 +17,15 @@ package org.apache.spark.sql.execution.datasources.v2 +import scala.language.existentials + import org.apache.spark._ +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.vectorized.ColumnarBatch class DataSourceRDDPartition(val index: Int, val inputPartition: InputPartition) extends Partition with Serializable @@ -47,31 +52,16 @@ class DataSourceRDD( override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { val inputPartition = castPartition(split).inputPartition - val reader: PartitionReader[_] = if (columnarReads) { - partitionReaderFactory.createColumnarReader(inputPartition) + val (iter, reader) = if (columnarReads) { + val batchReader = partitionReaderFactory.createColumnarReader(inputPartition) + val iter = new MetricsBatchIterator(new PartitionIterator[ColumnarBatch](batchReader)) + (iter, batchReader) } else { - partitionReaderFactory.createReader(inputPartition) + val rowReader = partitionReaderFactory.createReader(inputPartition) + val iter = new MetricsRowIterator(new PartitionIterator[InternalRow](rowReader)) + (iter, rowReader) } - context.addTaskCompletionListener[Unit](_ => reader.close()) - val iter = new Iterator[Any] { - 
private[this] var valuePrepared = false - - override def hasNext: Boolean = { - if (!valuePrepared) { - valuePrepared = reader.next() - } - valuePrepared - } - - override def next(): Any = { - if (!hasNext) { - throw new java.util.NoSuchElementException("End of stream") - } - valuePrepared = false - reader.get() - } - } // TODO: SPARK-25083 remove the type erasure hack in data source scan new InterruptibleIterator(context, iter.asInstanceOf[Iterator[InternalRow]]) } @@ -80,3 +70,68 @@ class DataSourceRDD( castPartition(split).inputPartition.preferredLocations() } } + +private class PartitionIterator[T](reader: PartitionReader[T]) extends Iterator[T] { + private[this] var valuePrepared = false + + override def hasNext: Boolean = { + if (!valuePrepared) { + valuePrepared = reader.next() + } + valuePrepared + } + + override def next(): T = { + if (!hasNext) { + throw new java.util.NoSuchElementException("End of stream") + } + valuePrepared = false + reader.get() + } +} + +private class MetricsHandler extends Logging with Serializable { + private val inputMetrics = TaskContext.get().taskMetrics().inputMetrics + private val startingBytesRead = inputMetrics.bytesRead + private val getBytesRead = SparkHadoopUtil.get.getFSBytesReadOnThreadCallback() + + def updateMetrics(numRows: Int, force: Boolean = false): Unit = { + inputMetrics.incRecordsRead(numRows) + val shouldUpdateBytesRead = + inputMetrics.recordsRead % SparkHadoopUtil.UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0 + if (shouldUpdateBytesRead || force) { + inputMetrics.setBytesRead(startingBytesRead + getBytesRead()) + } + } +} + +private abstract class MetricsIterator[I](iter: Iterator[I]) extends Iterator[I] { + protected val metricsHandler = new MetricsHandler + + override def hasNext: Boolean = { + if (iter.hasNext) { + true + } else { + metricsHandler.updateMetrics(0, force = true) + false + } + } +} + +private class MetricsRowIterator( + iter: Iterator[InternalRow]) extends MetricsIterator[InternalRow](iter) { 
+ override def next(): InternalRow = { + val item = iter.next + metricsHandler.updateMetrics(1) + item + } +} + +private class MetricsBatchIterator( + iter: Iterator[ColumnarBatch]) extends MetricsIterator[ColumnarBatch](iter) { + override def next(): ColumnarBatch = { + val batch: ColumnarBatch = iter.next + metricsHandler.updateMetrics(batch.numRows) + batch + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExecBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExecBase.scala index 74fc5432ea82c..211f61279ddd5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExecBase.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExecBase.scala @@ -23,9 +23,9 @@ import org.apache.spark.sql.catalyst.expressions.AttributeMap import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.catalyst.plans.physical.SinglePartition import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan, SupportsReportPartitioning} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReaderFactory, Scan, SupportsReportPartitioning} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala index 7cad305aefeb8..8f4e2d256c714 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala @@ 
-17,182 +17,134 @@ package org.apache.spark.sql.execution.datasources.v2 -import java.util.UUID - import scala.collection.JavaConverters._ -import scala.collection.mutable -import org.apache.spark.sql.{AnalysisException, Strategy} -import org.apache.spark.sql.catalog.v2.StagingTableCatalog -import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, AttributeSet, Expression, PredicateHelper, SubqueryExpression} +import org.apache.spark.sql.{AnalysisException, SparkSession, Strategy} +import org.apache.spark.sql.catalyst.analysis.{ResolvedNamespace, ResolvedTable} +import org.apache.spark.sql.catalyst.expressions.{And, Expression, NamedExpression, PredicateHelper, SubqueryExpression} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, AppendData, CreateTableAsSelect, CreateV2Table, DeleteFromTable, DescribeTable, DropTable, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, Repartition, ReplaceTable, ReplaceTableAsSelect, ShowTables} -import org.apache.spark.sql.execution.{FilterExec, ProjectExec, SparkPlan} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, StagingTableCatalog, SupportsNamespaces, TableCapability, TableCatalog, TableChange} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} +import org.apache.spark.sql.execution.{FilterExec, LeafExecNode, ProjectExec, RowDataSourceScanExec, SparkPlan} import org.apache.spark.sql.execution.datasources.DataSourceStrategy import org.apache.spark.sql.execution.streaming.continuous.{ContinuousCoalesceExec, WriteToContinuousDataSource, WriteToContinuousDataSourceExec} -import org.apache.spark.sql.sources -import org.apache.spark.sql.sources.v2.TableCapability -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, MicroBatchStream} -import 
org.apache.spark.sql.sources.v2.writer.V1WriteBuilder +import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.util.CaseInsensitiveStringMap -object DataSourceV2Strategy extends Strategy with PredicateHelper { - - /** - * Pushes down filters to the data source reader - * - * @return pushed filter and post-scan filters. - */ - private def pushFilters( - scanBuilder: ScanBuilder, - filters: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { - scanBuilder match { - case r: SupportsPushDownFilters => - // A map from translated data source leaf node filters to original catalyst filter - // expressions. For a `And`/`Or` predicate, it is possible that the predicate is partially - // pushed down. This map can be used to construct a catalyst filter expression from the - // input filter, or a superset(partial push down filter) of the input filter. - val translatedFilterToExpr = mutable.HashMap.empty[sources.Filter, Expression] - val translatedFilters = mutable.ArrayBuffer.empty[sources.Filter] - // Catalyst filter expression that can't be translated to data source filters. - val untranslatableExprs = mutable.ArrayBuffer.empty[Expression] - - for (filterExpr <- filters) { - val translated = - DataSourceStrategy.translateFilterWithMapping(filterExpr, Some(translatedFilterToExpr)) - if (translated.isEmpty) { - untranslatableExprs += filterExpr - } else { - translatedFilters += translated.get - } - } - - // Data source filters that need to be evaluated again after scanning. which means - // the data source cannot guarantee the rows returned can pass these filters. - // As a result we must return it so Spark can plan an extra filter operator. 
- val postScanFilters = r.pushFilters(translatedFilters.toArray).map { filter => - DataSourceStrategy.rebuildExpressionFromFilter(filter, translatedFilterToExpr) - } - // The filters which are marked as pushed to this data source - val pushedFilters = r.pushedFilters().map { filter => - DataSourceStrategy.rebuildExpressionFromFilter(filter, translatedFilterToExpr) - } - (pushedFilters, untranslatableExprs ++ postScanFilters) - - case _ => (Nil, filters) - } - } - - /** - * Applies column pruning to the data source, w.r.t. the references of the given expressions. - * - * @return the created `ScanConfig`(since column pruning is the last step of operator pushdown), - * and new output attributes after column pruning. - */ - // TODO: nested column pruning. - private def pruneColumns( - scanBuilder: ScanBuilder, - relation: DataSourceV2Relation, - exprs: Seq[Expression]): (Scan, Seq[AttributeReference]) = { - scanBuilder match { - case r: SupportsPushDownRequiredColumns => - val requiredColumns = AttributeSet(exprs.flatMap(_.references)) - val neededOutput = relation.output.filter(requiredColumns.contains) - if (neededOutput != relation.output) { - r.pruneColumns(neededOutput.toStructType) - val scan = r.build() - val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap - scan -> scan.readSchema().toAttributes.map { - // We have to keep the attribute id during transformation. 
- a => a.withExprId(nameToAttr(a.name).exprId) - } - } else { - r.build() -> relation.output - } +class DataSourceV2Strategy(session: SparkSession) extends Strategy with PredicateHelper { - case _ => scanBuilder.build() -> relation.output + import DataSourceV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + private def withProjectAndFilter( + project: Seq[NamedExpression], + filters: Seq[Expression], + scan: LeafExecNode, + needsUnsafeConversion: Boolean): SparkPlan = { + val filterCondition = filters.reduceLeftOption(And) + val withFilter = filterCondition.map(FilterExec(_, scan)).getOrElse(scan) + + if (withFilter.output != project || needsUnsafeConversion) { + ProjectExec(project, withFilter) + } else { + withFilter } } - import DataSourceV2Implicits._ - override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case PhysicalOperation(project, filters, relation: DataSourceV2Relation) => - val scanBuilder = relation.newScanBuilder() - - val (withSubquery, withoutSubquery) = filters.partition(SubqueryExpression.hasSubquery) - val normalizedFilters = DataSourceStrategy.normalizeFilters( - withoutSubquery, relation.output) - - // `pushedFilters` will be pushed down and evaluated in the underlying data sources. - // `postScanFilters` need to be evaluated after the scan. - // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter. 
- val (pushedFilters, postScanFiltersWithoutSubquery) = - pushFilters(scanBuilder, normalizedFilters) - val postScanFilters = postScanFiltersWithoutSubquery ++ withSubquery - val (scan, output) = pruneColumns(scanBuilder, relation, project ++ postScanFilters) - logInfo( - s""" - |Pushing operators to ${relation.name} - |Pushed Filters: ${pushedFilters.mkString(", ")} - |Post-Scan Filters: ${postScanFilters.mkString(",")} - |Output: ${output.mkString(", ")} - """.stripMargin) - - val batchExec = BatchScanExec(output, scan) - - val filterCondition = postScanFilters.reduceLeftOption(And) - val withFilter = filterCondition.map(FilterExec(_, batchExec)).getOrElse(batchExec) - - val withProjection = if (withFilter.output != project || !batchExec.supportsColumnar) { - ProjectExec(project, withFilter) - } else { - withFilter + case PhysicalOperation(project, filters, + relation @ DataSourceV2ScanRelation(_, V1ScanWrapper(scan, translated, pushed), output)) => + val v1Relation = scan.toV1TableScan[BaseRelation with TableScan](session.sqlContext) + if (v1Relation.schema != scan.readSchema()) { + throw new IllegalArgumentException( + "The fallback v1 relation reports inconsistent schema:\n" + + "Schema of v2 scan: " + scan.readSchema() + "\n" + + "Schema of v1 relation: " + v1Relation.schema) } - - withProjection :: Nil + val rdd = v1Relation.buildScan() + val unsafeRowRDD = DataSourceStrategy.toCatalystRDD(v1Relation, output, rdd) + val originalOutputNames = relation.table.schema().map(_.name) + val requiredColumnsIndex = output.map(_.name).map(originalOutputNames.indexOf) + val dsScan = RowDataSourceScanExec( + output, + requiredColumnsIndex, + translated.toSet, + pushed.toSet, + unsafeRowRDD, + v1Relation, + tableIdentifier = None) + withProjectAndFilter(project, filters, dsScan, needsUnsafeConversion = false) :: Nil + + case PhysicalOperation(project, filters, relation: DataSourceV2ScanRelation) => + // projection and filters were already pushed down in the optimizer. 
+ // this uses PhysicalOperation to get the projection and ensure that if the batch scan does + // not support columnar, a projection is added to convert the rows to UnsafeRow. + val batchExec = BatchScanExec(relation.output, relation.scan) + withProjectAndFilter(project, filters, batchExec, !batchExec.supportsColumnar) :: Nil case r: StreamingDataSourceV2Relation if r.startOffset.isDefined && r.endOffset.isDefined => val microBatchStream = r.stream.asInstanceOf[MicroBatchStream] - // ensure there is a projection, which will produce unsafe rows required by some operators - ProjectExec(r.output, - MicroBatchScanExec( - r.output, r.scan, microBatchStream, r.startOffset.get, r.endOffset.get)) :: Nil + val scanExec = MicroBatchScanExec( + r.output, r.scan, microBatchStream, r.startOffset.get, r.endOffset.get) + + val withProjection = if (scanExec.supportsColumnar) { + scanExec + } else { + // Add a Project here to make sure we produce unsafe rows. + ProjectExec(r.output, scanExec) + } + + withProjection :: Nil case r: StreamingDataSourceV2Relation if r.startOffset.isDefined && r.endOffset.isEmpty => val continuousStream = r.stream.asInstanceOf[ContinuousStream] - // ensure there is a projection, which will produce unsafe rows required by some operators - ProjectExec(r.output, - ContinuousScanExec( - r.output, r.scan, continuousStream, r.startOffset.get)) :: Nil + val scanExec = ContinuousScanExec(r.output, r.scan, continuousStream, r.startOffset.get) + + val withProjection = if (scanExec.supportsColumnar) { + scanExec + } else { + // Add a Project here to make sure we produce unsafe rows. 
+ ProjectExec(r.output, scanExec) + } + + withProjection :: Nil case WriteToDataSourceV2(writer, query) => WriteToDataSourceV2Exec(writer, planLater(query)) :: Nil case CreateV2Table(catalog, ident, schema, parts, props, ifNotExists) => - CreateTableExec(catalog, ident, schema, parts, props, ifNotExists) :: Nil + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) + CreateTableExec(catalog, ident, schema, parts, propsWithOwner, ifNotExists) :: Nil case CreateTableAsSelect(catalog, ident, parts, query, props, options, ifNotExists) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) val writeOptions = new CaseInsensitiveStringMap(options.asJava) catalog match { case staging: StagingTableCatalog => - AtomicCreateTableAsSelectExec( - staging, ident, parts, query, planLater(query), props, writeOptions, ifNotExists) :: Nil + AtomicCreateTableAsSelectExec(staging, ident, parts, query, planLater(query), + propsWithOwner, writeOptions, ifNotExists) :: Nil case _ => - CreateTableAsSelectExec( - catalog, ident, parts, query, planLater(query), props, writeOptions, ifNotExists) :: Nil + CreateTableAsSelectExec(catalog, ident, parts, query, planLater(query), + propsWithOwner, writeOptions, ifNotExists) :: Nil } + case RefreshTable(catalog, ident) => + RefreshTableExec(catalog, ident) :: Nil + case ReplaceTable(catalog, ident, schema, parts, props, orCreate) => + val propsWithOwner = CatalogV2Util.withDefaultOwnership(props) catalog match { case staging: StagingTableCatalog => - AtomicReplaceTableExec(staging, ident, schema, parts, props, orCreate = orCreate) :: Nil + AtomicReplaceTableExec( + staging, ident, schema, parts, propsWithOwner, orCreate = orCreate) :: Nil case _ => - ReplaceTableExec(catalog, ident, schema, parts, props, orCreate = orCreate) :: Nil + ReplaceTableExec( + catalog, ident, schema, parts, propsWithOwner, orCreate = orCreate) :: Nil } case ReplaceTableAsSelect(catalog, ident, parts, query, props, options, orCreate) => + val 
propsWithOwner = CatalogV2Util.withDefaultOwnership(props) val writeOptions = new CaseInsensitiveStringMap(options.asJava) catalog match { case staging: StagingTableCatalog => @@ -202,7 +154,7 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper { parts, query, planLater(query), - props, + propsWithOwner, writeOptions, orCreate = orCreate) :: Nil case _ => @@ -212,20 +164,20 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper { parts, query, planLater(query), - props, + propsWithOwner, writeOptions, orCreate = orCreate) :: Nil } - case AppendData(r: DataSourceV2Relation, query, _) => + case AppendData(r: DataSourceV2Relation, query, writeOptions, _) => r.table.asWritable match { case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => - AppendDataExecV1(v1, r.options, query) :: Nil + AppendDataExecV1(v1, writeOptions.asOptions, query) :: Nil case v2 => - AppendDataExec(v2, r.options, planLater(query)) :: Nil + AppendDataExec(v2, writeOptions.asOptions, planLater(query)) :: Nil } - case OverwriteByExpression(r: DataSourceV2Relation, deleteExpr, query, _) => + case OverwriteByExpression(r: DataSourceV2Relation, deleteExpr, query, writeOptions, _) => // fail if any filter cannot be converted. correctness depends on removing all matching data. 
val filters = splitConjunctivePredicates(deleteExpr).map { filter => DataSourceStrategy.translateFilter(deleteExpr).getOrElse( @@ -233,26 +185,34 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper { }.toArray r.table.asWritable match { case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) => - OverwriteByExpressionExecV1(v1, filters, r.options, query) :: Nil + OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, query) :: Nil case v2 => - OverwriteByExpressionExec(v2, filters, r.options, planLater(query)) :: Nil + OverwriteByExpressionExec(v2, filters, writeOptions.asOptions, planLater(query)) :: Nil } - case OverwritePartitionsDynamic(r: DataSourceV2Relation, query, _) => - OverwritePartitionsDynamicExec(r.table.asWritable, r.options, planLater(query)) :: Nil + case OverwritePartitionsDynamic(r: DataSourceV2Relation, query, writeOptions, _) => + OverwritePartitionsDynamicExec( + r.table.asWritable, writeOptions.asOptions, planLater(query)) :: Nil - case DeleteFromTable(r: DataSourceV2Relation, condition) => - if (SubqueryExpression.hasSubquery(condition)) { - throw new AnalysisException( - s"Delete by condition with subquery is not supported: $condition") + case DeleteFromTable(relation, condition) => + relation match { + case DataSourceV2ScanRelation(table, _, output) => + if (condition.exists(SubqueryExpression.hasSubquery)) { + throw new AnalysisException( + s"Delete by condition with subquery is not supported: $condition") + } + // fail if any filter cannot be converted. + // correctness depends on removing all matching data. 
+ val filters = DataSourceStrategy.normalizeExprs(condition.toSeq, output) + .flatMap(splitConjunctivePredicates(_).map { + f => DataSourceStrategy.translateFilter(f).getOrElse( + throw new AnalysisException(s"Exec update failed:" + + s" cannot translate expression to source filter: $f")) + }).toArray + DeleteFromTableExec(table.asDeletable, filters) :: Nil + case _ => + throw new AnalysisException("DELETE is only supported with v2 tables.") } - // fail if any filter cannot be converted. correctness depends on removing all matching data. - val filters = splitConjunctivePredicates(condition).map { - f => DataSourceStrategy.translateFilter(f).getOrElse( - throw new AnalysisException(s"Exec delete failed:" + - s" cannot translate expression to source filter: $f")) - }.toArray - DeleteFromTableExec(r.table.asDeletable, filters) :: Nil case WriteToContinuousDataSource(writer, query) => WriteToContinuousDataSourceExec(writer, planLater(query)) :: Nil @@ -269,7 +229,13 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper { Nil } - case desc @ DescribeTable(r: DataSourceV2Relation, isExtended) => + case desc @ DescribeNamespace(ResolvedNamespace(catalog, ns), extended) => + DescribeNamespaceExec(desc.output, catalog.asNamespaceCatalog, ns, extended) :: Nil + + case desc @ DescribeRelation(r: ResolvedTable, partitionSpec, isExtended) => + if (partitionSpec.nonEmpty) { + throw new AnalysisException("DESCRIBE does not support partition for v2 tables.") + } DescribeTableExec(desc.output, r.table, isExtended) :: Nil case DropTable(catalog, ident, ifExists) => @@ -278,8 +244,48 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper { case AlterTable(catalog, ident, _, changes) => AlterTableExec(catalog, ident, changes) :: Nil - case r : ShowTables => - ShowTablesExec(r.output, r.catalog, r.namespace, r.pattern) :: Nil + case RenameTable(catalog, oldIdent, newIdent) => + RenameTableExec(catalog, oldIdent, newIdent) :: Nil + + case 
AlterNamespaceSetProperties(ResolvedNamespace(catalog, ns), properties) => + AlterNamespaceSetPropertiesExec(catalog.asNamespaceCatalog, ns, properties) :: Nil + + case AlterNamespaceSetLocation(ResolvedNamespace(catalog, ns), location) => + AlterNamespaceSetPropertiesExec( + catalog.asNamespaceCatalog, + ns, + Map(SupportsNamespaces.PROP_LOCATION -> location)) :: Nil + + case CommentOnNamespace(ResolvedNamespace(catalog, ns), comment) => + AlterNamespaceSetPropertiesExec( + catalog.asNamespaceCatalog, + ns, + Map(SupportsNamespaces.PROP_COMMENT -> comment)) :: Nil + + case CommentOnTable(ResolvedTable(catalog, identifier, _), comment) => + val changes = TableChange.setProperty(TableCatalog.PROP_COMMENT, comment) + AlterTableExec(catalog, identifier, Seq(changes)) :: Nil + + case CreateNamespace(catalog, namespace, ifNotExists, properties) => + CreateNamespaceExec(catalog, namespace, ifNotExists, properties) :: Nil + + case DropNamespace(ResolvedNamespace(catalog, ns), ifExists, cascade) => + DropNamespaceExec(catalog, ns, ifExists, cascade) :: Nil + + case r @ ShowNamespaces(ResolvedNamespace(catalog, ns), pattern) => + ShowNamespacesExec(r.output, catalog.asNamespaceCatalog, ns, pattern) :: Nil + + case r @ ShowTables(ResolvedNamespace(catalog, ns), pattern) => + ShowTablesExec(r.output, catalog.asTableCatalog, ns, pattern) :: Nil + + case SetCatalogAndNamespace(catalogManager, catalogName, ns) => + SetCatalogAndNamespaceExec(catalogManager, catalogName, ns) :: Nil + + case r: ShowCurrentNamespace => + ShowCurrentNamespaceExec(r.output, r.catalogManager) :: Nil + + case r @ ShowTableProperties(rt: ResolvedTable, propertyKey) => + ShowTablePropertiesExec(r.output, rt.table, propertyKey) :: Nil case _ => Nil } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala index 30897d86f8179..b50b8295463eb 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.regex.Pattern import org.apache.spark.internal.Logging +import org.apache.spark.sql.connector.catalog.{SessionConfigSupport, Table, TableProvider} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.{SessionConfigSupport, TableProvider} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap private[sql] object DataSourceV2Utils extends Logging { @@ -57,4 +59,28 @@ private[sql] object DataSourceV2Utils extends Logging { case _ => Map.empty } } + + def getTableFromProvider( + provider: TableProvider, + options: CaseInsensitiveStringMap, + userSpecifiedSchema: Option[StructType]): Table = { + userSpecifiedSchema match { + case Some(schema) => + if (provider.supportsExternalMetadata()) { + provider.getTable( + schema, + provider.inferPartitioning(options), + options.asCaseSensitiveMap()) + } else { + throw new UnsupportedOperationException( + s"${provider.getClass.getSimpleName} source does not support user-specified schema.") + } + + case None => + provider.getTable( + provider.inferSchema(options), + provider.inferPartitioning(options), + options.asCaseSensitiveMap()) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala index a5840571fff23..afebbfd01db22 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DeleteFromTableExec.scala @@ -17,21 +17,18 @@ package org.apache.spark.sql.execution.datasources.v2 -import 
org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.connector.catalog.SupportsDelete import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.SupportsDelete -import org.apache.spark.sql.util.CaseInsensitiveStringMap case class DeleteFromTableExec( table: SupportsDelete, - condition: Array[Filter]) extends LeafExecNode { + condition: Array[Filter]) extends V2CommandExec { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { table.deleteWhere(condition) - sparkContext.emptyRDD + Seq.empty } override def output: Seq[Attribute] = Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala new file mode 100644 index 0000000000000..64b98fb83b8fa --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeNamespaceExec.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, SupportsNamespaces} +import org.apache.spark.sql.types.StructType + +/** + * Physical plan node for describing a namespace. + */ +case class DescribeNamespaceExec( + output: Seq[Attribute], + catalog: SupportsNamespaces, + namespace: Seq[String], + isExtended: Boolean) extends V2CommandExec { + private val encoder = RowEncoder(StructType.fromAttributes(output)).resolveAndBind() + + override protected def run(): Seq[InternalRow] = { + val rows = new ArrayBuffer[InternalRow]() + val ns = namespace.toArray + val metadata = catalog.loadNamespaceMetadata(ns) + + rows += toCatalystRow("Namespace Name", ns.last) + + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.foreach { p => + rows ++= Option(metadata.get(p)).map(toCatalystRow(p.capitalize, _)) + } + + if (isExtended) { + val properties = metadata.asScala -- CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES + if (properties.nonEmpty) { + rows += toCatalystRow("Properties", properties.toSeq.mkString("(", ",", ")")) + } + } + rows + } + + private def toCatalystRow(strs: String*): InternalRow = { + encoder.toRow(new GenericRowWithSchema(strs.toArray, schema)).copy() + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala index 640bdfb8cba54..9c280206c548e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DescribeTableExec.scala @@ -20,30 +20,47 @@ 
package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} -import org.apache.spark.sql.execution.LeafExecNode -import org.apache.spark.sql.sources.v2.Table +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Table, TableCatalog} import org.apache.spark.sql.types.StructType case class DescribeTableExec( output: Seq[Attribute], table: Table, - isExtended: Boolean) extends LeafExecNode { + isExtended: Boolean) extends V2CommandExec { private val encoder = RowEncoder(StructType.fromAttributes(output)).resolveAndBind() - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { val rows = new ArrayBuffer[InternalRow]() addSchema(rows) + addPartitioning(rows) if (isExtended) { - addPartitioning(rows) - addProperties(rows) + addTableDetails(rows) } - sparkContext.parallelize(rows) + rows + } + + private def addTableDetails(rows: ArrayBuffer[InternalRow]): Unit = { + rows += emptyRow() + rows += toCatalystRow("# Detailed Table Information", "", "") + rows += toCatalystRow("Name", table.name(), "") + + CatalogV2Util.TABLE_RESERVED_PROPERTIES.foreach(propKey => { + if (table.properties.containsKey(propKey)) { + rows += toCatalystRow(propKey.capitalize, table.properties.get(propKey), "") + } + }) + val properties = + table.properties.asScala.toList + .filter(kv => !CatalogV2Util.TABLE_RESERVED_PROPERTIES.contains(kv._1)) + .sortBy(_._1).map { + case (key, value) => key + "=" + value + }.mkString("[", ",", "]") + rows += toCatalystRow("Table Properties", properties, "") } private def addSchema(rows: ArrayBuffer[InternalRow]): Unit = { @@ -55,8 +72,7 @@ case class DescribeTableExec( private def addPartitioning(rows: 
ArrayBuffer[InternalRow]): Unit = { rows += emptyRow() - rows += toCatalystRow(" Partitioning", "", "") - rows += toCatalystRow("--------------", "", "") + rows += toCatalystRow("# Partitioning", "", "") if (table.partitioning.isEmpty) { rows += toCatalystRow("Not partitioned", "", "") } else { @@ -66,15 +82,6 @@ case class DescribeTableExec( } } - private def addProperties(rows: ArrayBuffer[InternalRow]): Unit = { - rows += emptyRow() - rows += toCatalystRow(" Table Property", " Value", "") - rows += toCatalystRow("----------------", "-------", "") - rows ++= table.properties.asScala.toList.sortBy(_._1).map { - case (key, value) => toCatalystRow(key, value, "") - } - } - private def emptyRow(): InternalRow = toCatalystRow("", "", "") private def toCatalystRow(strs: String*): InternalRow = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala new file mode 100644 index 0000000000000..f7b4317ad65e2 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropNamespaceExec.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.catalog.{CatalogPlugin, SupportsNamespaces} + +/** + * Physical plan node for dropping a namespace. + */ +case class DropNamespaceExec( + catalog: CatalogPlugin, + namespace: Seq[String], + ifExists: Boolean, + cascade: Boolean) + extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + val nsCatalog = catalog.asNamespaceCatalog + val ns = namespace.toArray + if (nsCatalog.namespaceExists(ns)) { + // The default behavior of `SupportsNamespace.dropNamespace()` is cascading, + // so make sure the namespace to drop is empty. + if (!cascade) { + if (catalog.asTableCatalog.listTables(ns).nonEmpty + || nsCatalog.listNamespaces(ns).nonEmpty) { + throw new SparkException( + s"Cannot drop a non-empty namespace: ${namespace.quoted}. 
" + + "Use CASCADE option to drop a non-empty namespace.") + } + } + + if (!nsCatalog.dropNamespace(ns)) { + throw new SparkException(s"Failed to drop a namespace: ${namespace.quoted}.") + } + } else if (!ifExists) { + throw new NoSuchNamespaceException(ns) + } + + Seq.empty + } + + override def output: Seq[Attribute] = Seq.empty +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala index d325e0205f9d8..967613f77577c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DropTableExec.scala @@ -17,27 +17,25 @@ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} /** * Physical plan node for dropping a table. 
*/ case class DropTableExec(catalog: TableCatalog, ident: Identifier, ifExists: Boolean) - extends LeafExecNode { + extends V2CommandExec { - override def doExecute(): RDD[InternalRow] = { + override def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { catalog.dropTable(ident) } else if (!ifExists) { throw new NoSuchTableException(ident) } - sqlContext.sparkContext.parallelize(Seq.empty, 1) + Seq.empty } override def output: Seq[Attribute] = Seq.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/EmptyPartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/EmptyPartitionReader.scala index b177d15e1fe32..711bd41e1db24 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/EmptyPartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/EmptyPartitionReader.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import java.io.IOException -import org.apache.spark.sql.sources.v2.reader.PartitionReader +import org.apache.spark.sql.connector.read.PartitionReader /** * A [[PartitionReader]] with empty output. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala index db31927fa73bb..266c834909363 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileBatchWrite.scala @@ -20,10 +20,9 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriterFactory, PhysicalWriteInfo, WriterCommitMessage} import org.apache.spark.sql.execution.datasources.{WriteJobDescription, WriteTaskResult} import org.apache.spark.sql.execution.datasources.FileFormatWriter.processStats -import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.util.SerializableConfiguration class FileBatchWrite( job: Job, @@ -45,7 +44,7 @@ class FileBatchWrite( committer.abortJob(job) } - override def createBatchWriterFactory(): DataWriterFactory = { + override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = { FileWriterFactory(description, committer) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala index ac786bbaac6d7..30a964d7e643f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileDataSourceV2.scala @@ -16,13 +16,17 @@ */ package org.apache.spark.sql.execution.datasources.v2 +import java.util + import com.fasterxml.jackson.databind.ObjectMapper import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.catalog.{Table, 
TableProvider} +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2.TableProvider +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.Utils @@ -59,4 +63,40 @@ trait FileDataSourceV2 extends TableProvider with DataSourceRegister { val fs = hdfsPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toString } + + // TODO: To reduce code diff of SPARK-29665, we create stub implementations for file source v2, so + // that we don't need to touch all the file source v2 classes. We should remove the stub + // implementation and directly implement the TableProvider APIs. + protected def getTable(options: CaseInsensitiveStringMap): Table + protected def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { + throw new UnsupportedOperationException("user-specified schema") + } + + override def supportsExternalMetadata(): Boolean = true + + private var t: Table = null + + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + if (t == null) t = getTable(options) + t.schema() + } + + // TODO: implement a light-weight partition inference which only looks at the path of one leaf + // file and return partition column names. For now the partition inference happens in + // `getTable`, because we don't know the user-specified schema here. + override def inferPartitioning(options: CaseInsensitiveStringMap): Array[Transform] = { + Array.empty + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String]): Table = { + // If the table is already loaded during schema inference, return it directly. 
+ if (t != null) { + t + } else { + getTable(new CaseInsensitiveStringMap(properties), schema) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala index 836eae88e4da7..8f51d454b1434 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReader.scala @@ -22,10 +22,10 @@ import org.apache.parquet.io.ParquetDecodingException import org.apache.spark.internal.Logging import org.apache.spark.rdd.InputFileBlockHolder +import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.execution.QueryExecutionException import org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.PartitionReader class FilePartitionReader[T](readers: Iterator[PartitionedFileReader[T]]) extends PartitionReader[T] with Logging { @@ -42,9 +42,8 @@ class FilePartitionReader[T](readers: Iterator[PartitionedFileReader[T]]) currentReader = getNextReader() } catch { case e: FileNotFoundException if ignoreMissingFiles => - logWarning(s"Skipped missing file: $currentReader", e) + logWarning(s"Skipped missing file.", e) currentReader = null - return false // Throw FileNotFoundException even if `ignoreCorruptFiles` is true case e: FileNotFoundException if !ignoreMissingFiles => throw new FileNotFoundException( @@ -54,10 +53,8 @@ class FilePartitionReader[T](readers: Iterator[PartitionedFileReader[T]]) "recreating the Dataset/DataFrame involved.") case e @ (_: RuntimeException | _: IOException) if ignoreCorruptFiles => logWarning( - s"Skipped the rest of the content in the corrupted file: $currentReader", e) + s"Skipped the rest of the content in the corrupted 
file.", e) currentReader = null - InputFileBlockHolder.unset() - return false } } else { return false @@ -67,7 +64,7 @@ class FilePartitionReader[T](readers: Iterator[PartitionedFileReader[T]]) // In PartitionReader.next(), the current reader proceeds to next record. // It might throw RuntimeException/IOException and Spark should handle these exceptions. val hasNext = try { - currentReader.next() + currentReader != null && currentReader.next() } catch { case e: SchemaColumnConvertNotSupportedException => val message = "Parquet column cannot be converted in " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReaderFactory.scala index 5a19412c90334..c1d91736a8b8e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FilePartitionReaderFactory.scala @@ -17,9 +17,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, PartitioningUtils} -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReader, PartitionReaderFactory} -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile} import org.apache.spark.sql.vectorized.ColumnarBatch abstract class FilePartitionReaderFactory extends PartitionReaderFactory { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala index 0438bd0430da1..6e05aa56f4f72 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala @@ -24,21 +24,16 @@ import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.internal.config.IO_WARNING_LARGEFILETHRESHOLD import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionSet} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.connector.read.{Batch, InputPartition, Scan, Statistics, SupportsReportStatistics} import org.apache.spark.sql.execution.PartitionedFileUtil import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils -abstract class FileScan( - sparkSession: SparkSession, - fileIndex: PartitioningAwareFileIndex, - readDataSchema: StructType, - readPartitionSchema: StructType) - extends Scan - with Batch with SupportsReportStatistics with Logging { +trait FileScan extends Scan with Batch with SupportsReportStatistics with Logging { /** * Returns whether a file with `path` could be split or not. 
*/ @@ -46,6 +41,36 @@ abstract class FileScan( false } + def sparkSession: SparkSession + + def fileIndex: PartitioningAwareFileIndex + + /** + * Returns the required data schema + */ + def readDataSchema: StructType + + /** + * Returns the required partition schema + */ + def readPartitionSchema: StructType + + /** + * Returns the filters that can be use for partition pruning + */ + def partitionFilters: Seq[Expression] + + /** + * Returns the data filters that can be use for file listing + */ + def dataFilters: Seq[Expression] + + /** + * Create a new `FileScan` instance from the current one + * with different `partitionFilters` and `dataFilters` + */ + def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan + /** * If a file with `path` is unsplittable, return the unsplittable reason, * otherwise return `None`. @@ -55,11 +80,26 @@ abstract class FileScan( "undefined" } + protected def seqToString(seq: Seq[Any]): String = seq.mkString("[", ", ", "]") + + override def equals(obj: Any): Boolean = obj match { + case f: FileScan => + fileIndex == f.fileIndex && readSchema == f.readSchema + ExpressionSet(partitionFilters) == ExpressionSet(f.partitionFilters) && + ExpressionSet(dataFilters) == ExpressionSet(f.dataFilters) + + case _ => false + } + + override def hashCode(): Int = getClass.hashCode() + override def description(): String = { val locationDesc = fileIndex.getClass.getSimpleName + fileIndex.rootPaths.mkString("[", ", ", "]") val metadata: Map[String, String] = Map( "ReadSchema" -> readDataSchema.catalogString, + "PartitionFilters" -> seqToString(partitionFilters), + "DataFilters" -> seqToString(dataFilters), "Location" -> locationDesc) val metadataStr = metadata.toSeq.sorted.map { case (key, value) => @@ -71,7 +111,7 @@ abstract class FileScan( } protected def partitions: Seq[FilePartition] = { - val selectedPartitions = fileIndex.listFiles(Seq.empty, Seq.empty) + val selectedPartitions = 
fileIndex.listFiles(partitionFilters, dataFilters) val maxSplitBytes = FilePartition.maxSplitBytes(sparkSession, selectedPartitions) val partitionAttributes = fileIndex.partitionSchema.toAttributes val attributeMap = partitionAttributes.map(a => normalizeName(a.name) -> a).toMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala index 3b236be90e6ff..97874e8f4932e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScanBuilder.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.{ScanBuilder, SupportsPushDownRequiredColumns} import org.apache.spark.sql.execution.datasources.{PartitioningAwareFileIndex, PartitioningUtils} -import org.apache.spark.sql.sources.v2.reader.{ScanBuilder, SupportsPushDownRequiredColumns} import org.apache.spark.sql.types.StructType abstract class FileScanBuilder( @@ -27,15 +27,21 @@ abstract class FileScanBuilder( dataSchema: StructType) extends ScanBuilder with SupportsPushDownRequiredColumns { private val partitionSchema = fileIndex.partitionSchema private val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis + protected val supportsNestedSchemaPruning = false protected var requiredSchema = StructType(dataSchema.fields ++ partitionSchema.fields) override def pruneColumns(requiredSchema: StructType): Unit = { + // [SPARK-30107] While `requiredSchema` might have pruned nested columns, + // the actual data schema of this scan is determined in `readDataSchema`. + // File formats that don't support nested schema pruning, + // use `requiredSchema` as a reference and prune only top-level columns. 
this.requiredSchema = requiredSchema } protected def readDataSchema(): StructType = { val requiredNameSet = createRequiredNameSet() - val fields = dataSchema.fields.filter { field => + val schema = if (supportsNestedSchemaPruning) requiredSchema else dataSchema + val fields = schema.fields.filter { field => val colName = PartitioningUtils.getColName(field, isCaseSensitive) requiredNameSet.contains(colName) && !partitionNameSet.contains(colName) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala index 4483f5b1dd30c..59dc3ae56bf25 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala @@ -23,11 +23,11 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.sql.{AnalysisException, SparkSession} -import org.apache.spark.sql.catalog.v2.expressions.Transform +import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.streaming.{FileStreamSink, MetadataLogFileIndex} -import org.apache.spark.sql.sources.v2.{SupportsRead, SupportsWrite, Table, TableCapability} -import org.apache.spark.sql.sources.v2.TableCapability._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.sql.util.SchemaUtils @@ -39,7 +39,7 @@ abstract class FileTable( userSpecifiedSchema: Option[StructType]) extends Table with SupportsRead with SupportsWrite { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import 
org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ lazy val fileIndex: PartitioningAwareFileIndex = { val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap @@ -102,7 +102,7 @@ abstract class FileTable( StructType(fields) } - override def partitioning: Array[Transform] = fileIndex.partitionSchema.asTransforms + override def partitioning: Array[Transform] = fileIndex.partitionSchema.names.toSeq.asTransforms override def properties: util.Map[String, String] = options.asCaseSensitiveMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala index eacc4cb3ac4a9..d519832c57501 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriteBuilder.scala @@ -30,34 +30,24 @@ import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.connector.write.{BatchWrite, LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, DataSource, OutputWriterFactory, WriteJobDescription} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.writer.{BatchWrite, WriteBuilder} import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.util.SerializableConfiguration abstract class FileWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) extends 
WriteBuilder { - private var schema: StructType = _ - private var queryId: String = _ + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) extends WriteBuilder { + private val schema = info.schema() + private val queryId = info.queryId() + private val options = info.options() private var mode: SaveMode = _ - override def withInputDataSchema(schema: StructType): WriteBuilder = { - this.schema = schema - this - } - - override def withQueryId(queryId: String): WriteBuilder = { - this.queryId = queryId - this - } - def mode(mode: SaveMode): WriteBuilder = { this.mode = mode this diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala index eb573b317142a..1f25fed3000b2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileWriterFactory.scala @@ -23,9 +23,8 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory} import org.apache.spark.sql.execution.datasources.{DynamicPartitionDataWriter, SingleDirectoryDataWriter, WriteJobDescription} -import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} -import org.apache.spark.util.SerializableConfiguration case class FileWriterFactory ( description: WriteJobDescription, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala index a9b0f5bce1b09..bca28e3cacb62 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/MicroBatchScanExec.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReaderFactory, Scan} -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, Offset} +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan} +import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} /** * Physical plan node for scanning a micro-batch of data from a data source. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderFromIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderFromIterator.scala index f9dfcf448a3ea..0d9aa5b42a6ea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderFromIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderFromIterator.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.sql.sources.v2.reader.PartitionReader +import org.apache.spark.sql.connector.read.PartitionReader class PartitionReaderFromIterator[InternalRow]( iter: Iterator[InternalRow]) extends PartitionReader[InternalRow] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderWithPartitionValues.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderWithPartitionValues.scala index 072465b56857d..7bca98e54efa7 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderWithPartitionValues.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionReaderWithPartitionValues.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.JoinedRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.sources.v2.reader.PartitionReader +import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionRecordReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionRecordReader.scala index baa8cb6b24659..8e524a986aa06 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionRecordReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PartitionRecordReader.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.hadoop.mapreduce.RecordReader -import org.apache.spark.sql.sources.v2.reader.PartitionReader +import org.apache.spark.sql.connector.read.PartitionReader class PartitionRecordReader[T]( private[this] var rowReader: RecordReader[_, T]) extends PartitionReader[T] { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala new file mode 100644 index 0000000000000..33338b06565c9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, NamedExpression, PredicateHelper, SchemaPruning} +import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns} +import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources +import org.apache.spark.sql.types.StructType + +object PushDownUtils extends PredicateHelper { + /** + * Pushes down filters to the data source reader + * + * @return pushed filter and post-scan filters. + */ + def pushFilters( + scanBuilder: ScanBuilder, + filters: Seq[Expression]): (Seq[sources.Filter], Seq[Expression]) = { + scanBuilder match { + case r: SupportsPushDownFilters => + // A map from translated data source leaf node filters to original catalyst filter + // expressions. For a `And`/`Or` predicate, it is possible that the predicate is partially + // pushed down. This map can be used to construct a catalyst filter expression from the + // input filter, or a superset(partial push down filter) of the input filter. 
+ val translatedFilterToExpr = mutable.HashMap.empty[sources.Filter, Expression] + val translatedFilters = mutable.ArrayBuffer.empty[sources.Filter] + // Catalyst filter expression that can't be translated to data source filters. + val untranslatableExprs = mutable.ArrayBuffer.empty[Expression] + + for (filterExpr <- filters) { + val translated = + DataSourceStrategy.translateFilterWithMapping(filterExpr, Some(translatedFilterToExpr)) + if (translated.isEmpty) { + untranslatableExprs += filterExpr + } else { + translatedFilters += translated.get + } + } + + // Data source filters that need to be evaluated again after scanning. which means + // the data source cannot guarantee the rows returned can pass these filters. + // As a result we must return it so Spark can plan an extra filter operator. + val postScanFilters = r.pushFilters(translatedFilters.toArray).map { filter => + DataSourceStrategy.rebuildExpressionFromFilter(filter, translatedFilterToExpr) + } + (r.pushedFilters(), untranslatableExprs ++ postScanFilters) + + case _ => (Nil, filters) + } + } + + /** + * Applies column pruning to the data source, w.r.t. the references of the given expressions. + * + * @return the `Scan` instance (since column pruning is the last step of operator pushdown), + * and new output attributes after column pruning. 
+ */ + def pruneColumns( + scanBuilder: ScanBuilder, + relation: DataSourceV2Relation, + projects: Seq[NamedExpression], + filters: Seq[Expression]): (Scan, Seq[AttributeReference]) = { + scanBuilder match { + case r: SupportsPushDownRequiredColumns if SQLConf.get.nestedSchemaPruningEnabled => + val rootFields = SchemaPruning.identifyRootFields(projects, filters) + val prunedSchema = if (rootFields.nonEmpty) { + SchemaPruning.pruneDataSchema(relation.schema, rootFields) + } else { + new StructType() + } + r.pruneColumns(prunedSchema) + val scan = r.build() + scan -> toOutputAttrs(scan.readSchema(), relation) + + case r: SupportsPushDownRequiredColumns => + val exprs = projects ++ filters + val requiredColumns = AttributeSet(exprs.flatMap(_.references)) + val neededOutput = relation.output.filter(requiredColumns.contains) + if (neededOutput != relation.output) { + r.pruneColumns(neededOutput.toStructType) + val scan = r.build() + scan -> toOutputAttrs(scan.readSchema(), relation) + } else { + r.build() -> relation.output + } + + case _ => scanBuilder.build() -> relation.output + } + } + + private def toOutputAttrs( + schema: StructType, + relation: DataSourceV2Relation): Seq[AttributeReference] = { + val nameToAttr = relation.output.map(_.name).zip(relation.output).toMap + schema.toAttributes.map { + // we have to keep the attribute id during transformation + a => a.withExprId(nameToAttr(a.name).exprId) + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropTableStatement.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala similarity index 70% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropTableStatement.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala index d41e8a5010257..2a19ff304a9e0 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropTableStatement.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RefreshTableExec.scala @@ -15,20 +15,19 @@ * limitations under the License. */ -package org.apache.spark.sql.catalyst.plans.logical.sql +package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} -/** - * A DROP TABLE statement, as parsed from SQL. - */ -case class DropTableStatement( - tableName: Seq[String], - ifExists: Boolean, - purge: Boolean) extends ParsedStatement { +case class RefreshTableExec( + catalog: TableCatalog, + ident: Identifier) extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + catalog.invalidateTable(ident) + Seq.empty + } override def output: Seq[Attribute] = Seq.empty - - override def children: Seq[LogicalPlan] = Seq.empty } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropViewStatement.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameTableExec.scala similarity index 64% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropViewStatement.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameTableExec.scala index 523158788e834..a650607d5f129 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DropViewStatement.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/RenameTableExec.scala @@ -15,19 +15,26 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.catalyst.plans.logical.sql +package org.apache.spark.sql.execution.datasources.v2 +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} /** - * A DROP VIEW statement, as parsed from SQL. + * Physical plan node for renaming a table. */ -case class DropViewStatement( - viewName: Seq[String], - ifExists: Boolean) extends ParsedStatement { +case class RenameTableExec( + catalog: TableCatalog, + oldIdent: Identifier, + newIdent: Identifier) extends V2CommandExec { override def output: Seq[Attribute] = Seq.empty - override def children: Seq[LogicalPlan] = Seq.empty + override protected def run(): Seq[InternalRow] = { + catalog.invalidateTable(oldIdent) + catalog.renameTable(oldIdent, newIdent) + + Seq.empty + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala index 35d86ee2abbbb..1f3bcf2e3fe57 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ReplaceTableExec.scala @@ -19,14 +19,11 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalog.v2.{Identifier, StagingTableCatalog, TableCatalog} -import org.apache.spark.sql.catalog.v2.expressions.Transform import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException} import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.LeafExecNode -import org.apache.spark.sql.sources.v2.StagedTable 
+import org.apache.spark.sql.connector.catalog.{Identifier, StagedTable, StagingTableCatalog, TableCatalog} +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -36,16 +33,16 @@ case class ReplaceTableExec( tableSchema: StructType, partitioning: Seq[Transform], tableProperties: Map[String, String], - orCreate: Boolean) extends LeafExecNode { + orCreate: Boolean) extends V2CommandExec { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { if (catalog.tableExists(ident)) { catalog.dropTable(ident) } else if (!orCreate) { throw new CannotReplaceMissingTableException(ident) } catalog.createTable(ident, tableSchema, partitioning.toArray, tableProperties.asJava) - sqlContext.sparkContext.parallelize(Seq.empty, 1) + Seq.empty } override def output: Seq[Attribute] = Seq.empty @@ -57,9 +54,9 @@ case class AtomicReplaceTableExec( tableSchema: StructType, partitioning: Seq[Transform], tableProperties: Map[String, String], - orCreate: Boolean) extends LeafExecNode { + orCreate: Boolean) extends V2CommandExec { - override protected def doExecute(): RDD[InternalRow] = { + override protected def run(): Seq[InternalRow] = { val staged = if (orCreate) { catalog.stageCreateOrReplace( identifier, tableSchema, partitioning.toArray, tableProperties.asJava) @@ -75,8 +72,7 @@ case class AtomicReplaceTableExec( throw new CannotReplaceMissingTableException(identifier) } commitOrAbortStagedChanges(staged) - - sqlContext.sparkContext.parallelize(Seq.empty, 1) + Seq.empty } override def output: Seq[Attribute] = Seq.empty diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala new file mode 100644 index 0000000000000..9e6f00e0923ea --- /dev/null +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/SetCatalogAndNamespaceExec.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.catalog.CatalogManager + +/** + * Physical plan node for setting the current catalog and/or namespace. + */ +case class SetCatalogAndNamespaceExec( + catalogManager: CatalogManager, + catalogName: Option[String], + namespace: Option[Seq[String]]) + extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + // The catalog is updated first because CatalogManager resets the current namespace + // when the current catalog is set. 
+ catalogName.map(catalogManager.setCurrentCatalog) + namespace.map(ns => catalogManager.setCurrentNamespace(ns.toArray)) + + Seq.empty + } + + override def output: Seq[Attribute] = Seq.empty +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala new file mode 100644 index 0000000000000..42b80a15080a6 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowCurrentNamespaceExec.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper + +/** + * Physical plan node for showing current catalog/namespace. 
+ */ +case class ShowCurrentNamespaceExec( + output: Seq[Attribute], + catalogManager: CatalogManager) + extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { + val encoder = RowEncoder(schema).resolveAndBind() + Seq(encoder + .toRow(new GenericRowWithSchema( + Array(catalogManager.currentCatalog.name, catalogManager.currentNamespace.quoted), schema)) + .copy()) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala new file mode 100644 index 0000000000000..fe3ab8023db6f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowNamespacesExec.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.v2 + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} +import org.apache.spark.sql.catalyst.util.StringUtils +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper +import org.apache.spark.sql.connector.catalog.SupportsNamespaces + +/** + * Physical plan node for showing namespaces. + */ +case class ShowNamespacesExec( + output: Seq[Attribute], + catalog: SupportsNamespaces, + namespace: Seq[String], + pattern: Option[String]) + extends V2CommandExec { + + override protected def run(): Seq[InternalRow] = { + val namespaces = if (namespace.nonEmpty) { + catalog.listNamespaces(namespace.toArray) + } else { + catalog.listNamespaces() + } + + val rows = new ArrayBuffer[InternalRow]() + val encoder = RowEncoder(schema).resolveAndBind() + + namespaces.map(_.quoted).map { ns => + if (pattern.map(StringUtils.filterPattern(Seq(ns), _).nonEmpty).getOrElse(true)) { + rows += encoder + .toRow(new GenericRowWithSchema(Array(ns), schema)) + .copy() + } + } + + rows + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala new file mode 100644 index 0000000000000..7905c35f55de0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablePropertiesExec.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} +import org.apache.spark.sql.connector.catalog.Table + +/** + * Physical plan node for showing table properties. + */ +case class ShowTablePropertiesExec( + output: Seq[Attribute], + catalogTable: Table, + propertyKey: Option[String]) extends V2CommandExec { + + override protected def run(): Seq[InternalRow] = { + import scala.collection.JavaConverters._ + val encoder = RowEncoder(schema).resolveAndBind() + + val properties = catalogTable.properties.asScala + propertyKey match { + case Some(p) => + val propValue = properties + .getOrElse(p, s"Table ${catalogTable.name} does not have property: $p") + Seq(encoder.toRow(new GenericRowWithSchema(Array(p, propValue), schema)).copy()) + case None => + properties.keys.map(k => + encoder.toRow(new GenericRowWithSchema(Array(k, properties(k)), schema)).copy()).toSeq + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala index c652f28a5e760..995b00871fc2a 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/ShowTablesExec.scala @@ -19,14 +19,12 @@ package org.apache.spark.sql.execution.datasources.v2 import scala.collection.mutable.ArrayBuffer -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalog.v2.CatalogV2Implicits.NamespaceHelper -import org.apache.spark.sql.catalog.v2.TableCatalog import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericRowWithSchema} import org.apache.spark.sql.catalyst.util.StringUtils -import org.apache.spark.sql.execution.LeafExecNode +import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper +import org.apache.spark.sql.connector.catalog.TableCatalog /** * Physical plan node for showing tables. @@ -36,8 +34,8 @@ case class ShowTablesExec( catalog: TableCatalog, namespace: Seq[String], pattern: Option[String]) - extends LeafExecNode { - override protected def doExecute(): RDD[InternalRow] = { + extends V2CommandExec { + override protected def run(): Seq[InternalRow] = { val rows = new ArrayBuffer[InternalRow]() val encoder = RowEncoder(schema).resolveAndBind() @@ -53,6 +51,6 @@ case class ShowTablesExec( } } - sparkContext.parallelize(rows, 1) + rows } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala index 660b6e763e056..509a5f7139cca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TableCapabilityCheck.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.execution.datasources.v2 import org.apache.spark.sql.AnalysisException 
import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} +import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} -import org.apache.spark.sql.sources.v2.TableCapability._ import org.apache.spark.sql.types.BooleanType /** @@ -32,6 +33,10 @@ object TableCapabilityCheck extends (LogicalPlan => Unit) { private def failAnalysis(msg: String): Unit = throw new AnalysisException(msg) + private def supportsBatchWrite(table: Table): Boolean = { + table.supportsAny(BATCH_WRITE, V1_BATCH_WRITE) + } + override def apply(plan: LogicalPlan): Unit = { plan foreach { case r: DataSourceV2Relation if !r.table.supports(BATCH_READ) => @@ -43,24 +48,23 @@ object TableCapabilityCheck extends (LogicalPlan => Unit) { // TODO: check STREAMING_WRITE capability. It's not doable now because we don't have a // a logical plan for streaming write. 
- - case AppendData(r: DataSourceV2Relation, _, _) if !r.table.supports(BATCH_WRITE) => + case AppendData(r: DataSourceV2Relation, _, _, _) if !supportsBatchWrite(r.table) => failAnalysis(s"Table ${r.table.name()} does not support append in batch mode.") - case OverwritePartitionsDynamic(r: DataSourceV2Relation, _, _) + case OverwritePartitionsDynamic(r: DataSourceV2Relation, _, _, _) if !r.table.supports(BATCH_WRITE) || !r.table.supports(OVERWRITE_DYNAMIC) => failAnalysis(s"Table ${r.table.name()} does not support dynamic overwrite in batch mode.") - case OverwriteByExpression(r: DataSourceV2Relation, expr, _, _) => + case OverwriteByExpression(r: DataSourceV2Relation, expr, _, _, _) => expr match { case Literal(true, BooleanType) => - if (!r.table.supports(BATCH_WRITE) || - !r.table.supportsAny(TRUNCATE, OVERWRITE_BY_FILTER)) { + if (!supportsBatchWrite(r.table) || + !r.table.supportsAny(TRUNCATE, OVERWRITE_BY_FILTER)) { failAnalysis( s"Table ${r.table.name()} does not support truncate in batch mode.") } case _ => - if (!r.table.supports(BATCH_WRITE) || !r.table.supports(OVERWRITE_BY_FILTER)) { + if (!supportsBatchWrite(r.table) || !r.table.supports(OVERWRITE_BY_FILTER)) { failAnalysis(s"Table ${r.table.name()} does not support " + "overwrite by filter in batch mode.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala index 7ddd99a0293b1..1ca3fd42c0597 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/TextBasedFileScan.scala @@ -29,11 +29,7 @@ import org.apache.spark.util.Utils abstract class TextBasedFileScan( sparkSession: SparkSession, - fileIndex: PartitioningAwareFileIndex, - readDataSchema: StructType, - readPartitionSchema: StructType, - options: CaseInsensitiveStringMap) - 
extends FileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema) { + options: CaseInsensitiveStringMap) extends FileScan { @transient private lazy val codecFactory: CompressionCodecFactory = new CompressionCodecFactory( sparkSession.sessionState.newHadoopConfWithOptions(options.asScala.toMap)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala index 2f05ff3a7c2e1..f97300025400d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V1FallbackWriters.scala @@ -19,18 +19,16 @@ package org.apache.spark.sql.execution.datasources.v2 import java.util.UUID -import scala.collection.JavaConverters._ - import org.apache.spark.SparkException import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, SaveMode} +import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.SupportsWrite +import org.apache.spark.sql.connector.write.{LogicalWriteInfoImpl, SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder} import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.sources.{AlwaysTrue, CreatableRelationProvider, Filter, InsertableRelation} -import org.apache.spark.sql.sources.v2.{SupportsWrite, Table} -import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.sources.{AlwaysTrue, Filter, InsertableRelation} import org.apache.spark.sql.util.CaseInsensitiveStringMap /** @@ -100,9 +98,12 @@ sealed trait V1FallbackWriters extends SupportsV1Write { } protected def newWriteBuilder(): V1WriteBuilder = { - val writeBuilder = 
table.newWriteBuilder(writeOptions) - .withInputDataSchema(plan.schema) - .withQueryId(UUID.randomUUID().toString) + val info = LogicalWriteInfoImpl( + queryId = UUID.randomUUID().toString, + schema = plan.schema, + options = writeOptions) + val writeBuilder = table.newWriteBuilder(info) + writeBuilder.asV1Builder } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala new file mode 100644 index 0000000000000..a1f685d47a346 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2CommandExec.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.LeafExecNode + +/** + * A physical operator that executes run() and saves the result to prevent multiple executions. + * Any V2 commands that do not require triggering a spark job should extend this class. 
+ */ +abstract class V2CommandExec extends LeafExecNode { + + /** + * Abstract method that each concrete command needs to implement to compute the result. + */ + protected def run(): Seq[InternalRow] + + /** + * The value of this field can be used as the contents of the corresponding RDD generated from + * the physical plan of this command. + */ + private lazy val result: Seq[InternalRow] = run() + + /** + * The `execute()` method of all the physical command classes should reference `result` + * so that the command can be executed eagerly right after the command query is created. + */ + override def executeCollect(): Array[InternalRow] = result.toArray + + override def executeToIterator: Iterator[InternalRow] = result.toIterator + + override def executeTake(limit: Int): Array[InternalRow] = result.take(limit).toArray + + override def executeTail(limit: Int): Array[InternalRow] = result.takeRight(limit).toArray + + protected override def doExecute(): RDD[InternalRow] = { + sqlContext.sparkContext.parallelize(result, 1) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala new file mode 100644 index 0000000000000..59089fa6b77e9 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Pushes projections and filters from a logical plan into a v2 scan, producing a
 * [[DataSourceV2ScanRelation]] with any residual Filter/Project nodes rebuilt on top.
 *
 * NOTE(review): the ordering below matters — filters must be pushed before columns are
 * pruned, because filter pushdown can affect which columns the scan needs.
 */
object V2ScanRelationPushDown extends Rule[LogicalPlan] {
  import DataSourceV2Implicits._

  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case ScanOperation(project, filters, relation: DataSourceV2Relation) =>
      val scanBuilder = relation.table.asReadable.newScanBuilder(relation.options)

      val normalizedFilters = DataSourceStrategy.normalizeExprs(filters, relation.output)
      // Filters containing subqueries are never offered to the source; they always stay
      // in the post-scan Filter node.
      val (normalizedFiltersWithSubquery, normalizedFiltersWithoutSubquery) =
        normalizedFilters.partition(SubqueryExpression.hasSubquery)

      // `pushedFilters` will be pushed down and evaluated in the underlying data sources.
      // `postScanFilters` need to be evaluated after the scan.
      // `postScanFilters` and `pushedFilters` can overlap, e.g. the parquet row group filter.
      val (pushedFilters, postScanFiltersWithoutSubquery) = PushDownUtils.pushFilters(
        scanBuilder, normalizedFiltersWithoutSubquery)
      val postScanFilters = postScanFiltersWithoutSubquery ++ normalizedFiltersWithSubquery

      val normalizedProjects = DataSourceStrategy
        .normalizeExprs(project, relation.output)
        .asInstanceOf[Seq[NamedExpression]]
      // Column pruning: the scan's output may shrink to just what the projection and
      // post-scan filters reference.
      val (scan, output) = PushDownUtils.pruneColumns(
        scanBuilder, relation, normalizedProjects, postScanFilters)
      logInfo(
        s"""
           |Pushing operators to ${relation.name}
           |Pushed Filters: ${pushedFilters.mkString(", ")}
           |Post-Scan Filters: ${postScanFilters.mkString(",")}
           |Output: ${output.mkString(", ")}
         """.stripMargin)

      // A V1 scan is wrapped so the physical v1 scan node can see both the full set of
      // translated filters and the subset the source reported as handled.
      val wrappedScan = scan match {
        case v1: V1Scan =>
          val translated = filters.flatMap(DataSourceStrategy.translateFilter)
          V1ScanWrapper(v1, translated, pushedFilters)
        case _ => scan
      }

      val scanRelation = DataSourceV2ScanRelation(relation.table, wrappedScan, output)

      // Pruning may have changed attribute references; rewrite the remaining filters and
      // projections against the pruned scan output.
      val projectionOverSchema = ProjectionOverSchema(output.toStructType)
      val projectionFunc = (expr: Expression) => expr transformDown {
        case projectionOverSchema(newExpr) => newExpr
      }

      val filterCondition = postScanFilters.reduceLeftOption(And)
      val newFilterCondition = filterCondition.map(projectionFunc)
      val withFilter = newFilterCondition.map(Filter(_, scanRelation)).getOrElse(scanRelation)

      // Only add a Project node when the (possibly filtered) output does not already
      // match the requested projection.
      val withProjection = if (withFilter.output != project) {
        val newProjects = normalizedProjects
          .map(projectionFunc)
          .asInstanceOf[Seq[NamedExpression]]
        Project(newProjects, withFilter)
      } else {
        withFilter
      }

      withProjection
  }
}

// A wrapper for v1 scan to carry the translated filters and the handled ones. This is required by
// the physical v1 scan node.
+case class V1ScanWrapper( + v1Scan: V1Scan, + translatedFilters: Seq[sources.Filter], + handledFilters: Seq[sources.Filter]) extends Scan { + override def readSchema(): StructType = v1Scan.readSchema() +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala index ebfd7384930fe..cef9b5f675889 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalog.scala @@ -23,48 +23,39 @@ import java.util import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalog.v2.{Identifier, NamespaceChange, SupportsNamespaces, TableCatalog, TableChange} -import org.apache.spark.sql.catalog.v2.NamespaceChange.{RemoveProperty, SetProperty} -import org.apache.spark.sql.catalog.v2.expressions.{BucketTransform, FieldReference, IdentityTransform, Transform} -import org.apache.spark.sql.catalog.v2.utils.CatalogV2Util import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogDatabase, CatalogTable, CatalogTableType, CatalogUtils, SessionCatalog} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogV2Util, Identifier, NamespaceChange, SupportsNamespaces, Table, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.NamespaceChange.RemoveProperty +import org.apache.spark.sql.connector.expressions.{BucketTransform, FieldReference, IdentityTransform, Transform} import org.apache.spark.sql.execution.datasources.DataSource -import 
org.apache.spark.sql.internal.SessionState -import org.apache.spark.sql.sources.v2.Table -import org.apache.spark.sql.sources.v2.internal.V1Table +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * A [[TableCatalog]] that translates calls to the v1 SessionCatalog. */ -class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with SupportsNamespaces { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ +class V2SessionCatalog(catalog: SessionCatalog, conf: SQLConf) + extends TableCatalog with SupportsNamespaces { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.NamespaceHelper import V2SessionCatalog._ - def this() = { - this(SparkSession.active.sessionState) - } - override val defaultNamespace: Array[String] = Array("default") - private lazy val catalog: SessionCatalog = sessionState.catalog - - private var _name: String = _ + override def name: String = CatalogManager.SESSION_CATALOG_NAME - override def name: String = _name - - override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = { - this._name = name - } + // This class is instantiated by Spark, so `initialize` method will not be called. 
+ override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = {} override def listTables(namespace: Array[String]): Array[Identifier] = { namespace match { case Array(db) => - catalog.listTables(db).map(ident => Identifier.of(Array(db), ident.table)).toArray + catalog + .listTables(db) + .map(ident => Identifier.of(Array(ident.database.getOrElse("")), ident.table)) + .toArray case _ => throw new NoSuchNamespaceException(namespace) } @@ -92,9 +83,9 @@ class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with Sup properties: util.Map[String, String]): Table = { val (partitionColumns, maybeBucketSpec) = V2SessionCatalog.convertTransforms(partitions) - val provider = properties.getOrDefault("provider", sessionState.conf.defaultDataSourceName) + val provider = properties.getOrDefault(TableCatalog.PROP_PROVIDER, conf.defaultDataSourceName) val tableProperties = properties.asScala - val location = Option(properties.get(LOCATION_TABLE_PROP)) + val location = Option(properties.get(TableCatalog.PROP_LOCATION)) val storage = DataSource.buildStorageFormatFromOptions(tableProperties.toMap) .copy(locationUri = location.map(CatalogUtils.stringToURI)) val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED @@ -108,8 +99,8 @@ class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with Sup partitionColumnNames = partitionColumns, bucketSpec = maybeBucketSpec, properties = tableProperties.toMap, - tracksPartitionsInCatalog = sessionState.conf.manageFilesourcePartitions, - comment = Option(properties.get(COMMENT_TABLE_PROP))) + tracksPartitionsInCatalog = conf.manageFilesourcePartitions, + comment = Option(properties.get(TableCatalog.PROP_COMMENT))) try { catalog.createTable(tableDesc, ignoreIfExists = false) @@ -133,9 +124,13 @@ class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with Sup val properties = CatalogV2Util.applyPropertiesChanges(catalogTable.properties, 
changes) val schema = CatalogV2Util.applySchemaChanges(catalogTable.schema, changes) + val comment = properties.get(TableCatalog.PROP_COMMENT) + val owner = properties.getOrElse(TableCatalog.PROP_OWNER, catalogTable.owner) try { - catalog.alterTable(catalogTable.copy(properties = properties, schema = schema)) + catalog.alterTable( + catalogTable + .copy(properties = properties, schema = schema, owner = owner, comment = comment)) } catch { case _: NoSuchTableException => throw new NoSuchTableException(ident) @@ -236,7 +231,8 @@ class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with Sup case Array(db) => // validate that this catalog's reserved properties are not removed changes.foreach { - case remove: RemoveProperty if RESERVED_PROPERTIES.contains(remove.property) => + case remove: RemoveProperty + if CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.contains(remove.property) => throw new UnsupportedOperationException( s"Cannot remove reserved property: ${remove.property}") case _ => @@ -271,9 +267,6 @@ class V2SessionCatalog(sessionState: SessionState) extends TableCatalog with Sup } private[sql] object V2SessionCatalog { - val COMMENT_TABLE_PROP: String = "comment" - val LOCATION_TABLE_PROP: String = "location" - val RESERVED_PROPERTIES: Set[String] = Set(COMMENT_TABLE_PROP, LOCATION_TABLE_PROP) /** * Convert v2 Transforms to v1 partition columns and an optional bucket spec. 
@@ -303,12 +296,13 @@ private[sql] object V2SessionCatalog { defaultLocation: Option[URI] = None): CatalogDatabase = { CatalogDatabase( name = db, - description = metadata.getOrDefault(COMMENT_TABLE_PROP, ""), - locationUri = Option(metadata.get(LOCATION_TABLE_PROP)) + description = metadata.getOrDefault(SupportsNamespaces.PROP_COMMENT, ""), + locationUri = Option(metadata.get(SupportsNamespaces.PROP_LOCATION)) .map(CatalogUtils.stringToURI) .orElse(defaultLocation) .getOrElse(throw new IllegalArgumentException("Missing database location")), - properties = metadata.asScala.toMap -- Seq("comment", "location")) + properties = metadata.asScala.toMap -- + Seq(SupportsNamespaces.PROP_COMMENT, SupportsNamespaces.PROP_LOCATION)) } private implicit class CatalogDatabaseHelper(catalogDatabase: CatalogDatabase) { @@ -318,8 +312,8 @@ private[sql] object V2SessionCatalog { catalogDatabase.properties.foreach { case (key, value) => metadata.put(key, value) } - metadata.put(LOCATION_TABLE_PROP, catalogDatabase.locationUri.toString) - metadata.put(COMMENT_TABLE_PROP, catalogDatabase.description) + metadata.put(SupportsNamespaces.PROP_LOCATION, catalogDatabase.locationUri.toString) + metadata.put(SupportsNamespaces.PROP_COMMENT, catalogDatabase.description) metadata.asJava } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala index 0131d72ebc97a..e360a9e656a16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala @@ -26,16 +26,15 @@ import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import 
org.apache.spark.sql.catalog.v2.{Identifier, StagingTableCatalog, TableCatalog} -import org.apache.spark.sql.catalog.v2.expressions.Transform import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{Identifier, StagedTable, StagingTableCatalog, SupportsWrite, TableCatalog} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriterFactory, LogicalWriteInfoImpl, PhysicalWriteInfoImpl, SupportsDynamicOverwrite, SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.sources.{AlwaysTrue, Filter} -import org.apache.spark.sql.sources.v2.{StagedTable, SupportsWrite} -import org.apache.spark.sql.sources.v2.writer.{BatchWrite, DataWriterFactory, SupportsDynamicOverwrite, SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder, WriterCommitMessage} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.{LongAccumulator, Utils} @@ -69,7 +68,7 @@ case class CreateTableAsSelectExec( writeOptions: CaseInsensitiveStringMap, ifNotExists: Boolean) extends V2TableWriteExec with SupportsV1Write { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits.IdentifierHelper + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper override protected def doExecute(): RDD[InternalRow] = { if (catalog.tableExists(ident)) { @@ -85,9 +84,11 @@ case class CreateTableAsSelectExec( catalog.createTable( ident, schema, partitioning.toArray, properties.asJava) match { case table: SupportsWrite => - val writeBuilder = 
table.newWriteBuilder(writeOptions) - .withInputDataSchema(schema) - .withQueryId(UUID.randomUUID().toString) + val info = LogicalWriteInfoImpl( + queryId = UUID.randomUUID().toString, + schema, + writeOptions) + val writeBuilder = table.newWriteBuilder(info) writeBuilder match { case v1: V1WriteBuilder => writeWithV1(v1.buildForV1Write()) @@ -158,7 +159,7 @@ case class ReplaceTableAsSelectExec( writeOptions: CaseInsensitiveStringMap, orCreate: Boolean) extends V2TableWriteExec with SupportsV1Write { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits.IdentifierHelper + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper override protected def doExecute(): RDD[InternalRow] = { // Note that this operation is potentially unsafe, but these are the strict semantics of @@ -180,9 +181,11 @@ case class ReplaceTableAsSelectExec( Utils.tryWithSafeFinallyAndFailureCallbacks({ createdTable match { case table: SupportsWrite => - val writeBuilder = table.newWriteBuilder(writeOptions) - .withInputDataSchema(schema) - .withQueryId(UUID.randomUUID().toString) + val info = LogicalWriteInfoImpl( + queryId = UUID.randomUUID().toString, + schema, + writeOptions) + val writeBuilder = table.newWriteBuilder(info) writeBuilder match { case v1: V1WriteBuilder => writeWithV1(v1.buildForV1Write()) @@ -336,9 +339,11 @@ trait BatchWriteHelper { def writeOptions: CaseInsensitiveStringMap def newWriteBuilder(): WriteBuilder = { - table.newWriteBuilder(writeOptions) - .withInputDataSchema(query.schema) - .withQueryId(UUID.randomUUID().toString) + val info = LogicalWriteInfoImpl( + queryId = UUID.randomUUID().toString, + query.schema, + writeOptions) + table.newWriteBuilder(info) } } @@ -354,17 +359,20 @@ trait V2TableWriteExec extends UnaryExecNode { override def output: Seq[Attribute] = Nil protected def writeWithV2(batchWrite: BatchWrite): RDD[InternalRow] = { - val writerFactory = batchWrite.createBatchWriterFactory() - val useCommitCoordinator = 
batchWrite.useCommitCoordinator - val rdd = query.execute() - // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single - // partition rdd to make sure we at least set up one write task to write the metadata. - val rddWithNonEmptyPartitions = if (rdd.partitions.length == 0) { - sparkContext.parallelize(Array.empty[InternalRow], 1) - } else { - rdd + val rdd: RDD[InternalRow] = { + val tempRdd = query.execute() + // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single + // partition rdd to make sure we at least set up one write task to write the metadata. + if (tempRdd.partitions.length == 0) { + sparkContext.parallelize(Array.empty[InternalRow], 1) + } else { + tempRdd + } } - val messages = new Array[WriterCommitMessage](rddWithNonEmptyPartitions.partitions.length) + val writerFactory = batchWrite.createBatchWriterFactory( + PhysicalWriteInfoImpl(rdd.getNumPartitions)) + val useCommitCoordinator = batchWrite.useCommitCoordinator + val messages = new Array[WriterCommitMessage](rdd.partitions.length) val totalNumRowsAccumulator = new LongAccumulator() logInfo(s"Start processing data source write support: $batchWrite. 
" + @@ -372,10 +380,10 @@ trait V2TableWriteExec extends UnaryExecNode { try { sparkContext.runJob( - rddWithNonEmptyPartitions, + rdd, (context: TaskContext, iter: Iterator[InternalRow]) => DataWritingSparkTask.run(writerFactory, context, iter, useCommitCoordinator), - rddWithNonEmptyPartitions.partitions.indices, + rdd.partitions.indices, (index, result: DataWritingSparkTaskResult) => { val commitMessage = result.writerCommitMessage messages(index) = commitMessage @@ -465,12 +473,14 @@ object DataWritingSparkTask extends Logging { dataWriter.abort() logError(s"Aborted commit for partition $partId (task $taskId, attempt $attemptId, " + s"stage $stageId.$stageAttempt)") + }, finallyBlock = { + dataWriter.close() }) } } private[v2] trait AtomicTableWriteExec extends V2TableWriteExec with SupportsV1Write { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits.IdentifierHelper + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.IdentifierHelper protected def writeToStagedTable( stagedTable: StagedTable, @@ -479,9 +489,11 @@ private[v2] trait AtomicTableWriteExec extends V2TableWriteExec with SupportsV1W Utils.tryWithSafeFinallyAndFailureCallbacks({ stagedTable match { case table: SupportsWrite => - val writeBuilder = table.newWriteBuilder(writeOptions) - .withInputDataSchema(query.schema) - .withQueryId(UUID.randomUUID().toString) + val info = LogicalWriteInfoImpl( + queryId = UUID.randomUUID().toString, + query.schema, + writeOptions) + val writeBuilder = table.newWriteBuilder(info) val writtenRows = writeBuilder match { case v1: V1WriteBuilder => writeWithV1(v1.buildForV1Write()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala index 045f41e670ad3..1f99d4282f6da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVDataSourceV2.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.v2.csv +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.sources.v2.Table import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVPartitionReaderFactory.scala index 828594ffb10af..31d31bd43f453 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVPartitionReaderFactory.scala @@ -19,11 +19,12 @@ package org.apache.spark.sql.execution.datasources.v2.csv import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.csv.{CSVHeaderChecker, CSVOptions, UnivocityParser} +import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.csv.CSVDataSource import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.PartitionReader +import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration @@ -43,7 +44,8 @@ case class CSVPartitionReaderFactory( dataSchema: StructType, readDataSchema: StructType, partitionSchema: StructType, - parsedOptions: CSVOptions) 
extends FilePartitionReaderFactory { + parsedOptions: CSVOptions, + filters: Seq[Filter]) extends FilePartitionReaderFactory { private val columnPruning = sqlConf.csvColumnPruning override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { @@ -55,7 +57,8 @@ case class CSVPartitionReaderFactory( val parser = new UnivocityParser( actualDataSchema, actualReadDataSchema, - parsedOptions) + parsedOptions, + filters) val schema = if (columnPruning) actualReadDataSchema else actualDataSchema val isStartOfFile = file.start == 0 val headerChecker = new CSVHeaderChecker( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScan.scala index 3cbcfca01a9c3..4f510322815ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScan.scala @@ -22,12 +22,13 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.csv.CSVOptions -import org.apache.spark.sql.catalyst.expressions.ExprUtils +import org.apache.spark.sql.catalyst.expressions.{Expression, ExprUtils} +import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex -import org.apache.spark.sql.execution.datasources.csv.{CSVDataSource, MultiLineCSVDataSource} -import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory -import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.execution.datasources.csv.CSVDataSource +import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan} +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType 
import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration @@ -37,8 +38,11 @@ case class CSVScan( dataSchema: StructType, readDataSchema: StructType, readPartitionSchema: StructType, - options: CaseInsensitiveStringMap) - extends TextBasedFileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema, options) { + options: CaseInsensitiveStringMap, + pushedFilters: Array[Filter], + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty) + extends TextBasedFileScan(sparkSession, options) { private lazy val parsedOptions: CSVOptions = new CSVOptions( options.asScala.toMap, @@ -85,6 +89,22 @@ case class CSVScan( // The partition values are already truncated in `FileScan.partitions`. // We should use `readPartitionSchema` as the partition schema here. CSVPartitionReaderFactory(sparkSession.sessionState.conf, broadcastedConf, - dataSchema, readDataSchema, readPartitionSchema, parsedOptions) + dataSchema, readDataSchema, readPartitionSchema, parsedOptions, pushedFilters) + } + + override def withFilters( + partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) + + override def equals(obj: Any): Boolean = obj match { + case c: CSVScan => super.equals(c) && dataSchema == c.dataSchema && options == c.options && + equivalentFilters(pushedFilters, c.pushedFilters) + case _ => false + } + + override def hashCode(): Int = super.hashCode() + + override def description(): String = { + super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScanBuilder.scala index 28c5b3d81a3d5..81a234e254000 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVScanBuilder.scala @@ -18,9 +18,11 @@ package org.apache.spark.sql.execution.datasources.v2.csv import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.csv.CSVFilters +import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder -import org.apache.spark.sql.sources.v2.reader.Scan +import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -30,9 +32,27 @@ case class CSVScanBuilder( schema: StructType, dataSchema: StructType, options: CaseInsensitiveStringMap) - extends FileScanBuilder(sparkSession, fileIndex, dataSchema) { + extends FileScanBuilder(sparkSession, fileIndex, dataSchema) with SupportsPushDownFilters { override def build(): Scan = { - CSVScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options) + CSVScan( + sparkSession, + fileIndex, + dataSchema, + readDataSchema(), + readPartitionSchema(), + options, + pushedFilters()) } + + private var _pushedFilters: Array[Filter] = Array.empty + + override def pushFilters(filters: Array[Filter]): Array[Filter] = { + if (sparkSession.sessionState.conf.csvFilterPushDown) { + _pushedFilters = CSVFilters.pushedFilters(filters, dataSchema) + } + filters + } + + override def pushedFilters(): Array[Filter] = _pushedFilters } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala index 8170661a70172..3cafe37b743f3 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVTable.scala @@ -22,10 +22,10 @@ import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.csv.CSVOptions +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.csv.CSVDataSource import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.types.{AtomicType, DataType, StructType, UserDefinedType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -49,8 +49,8 @@ case class CSVTable( CSVDataSource(parsedOptions).inferSchema(sparkSession, files, parsedOptions) } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = - new CSVWriteBuilder(options, paths, formatName, supportsDataType) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + new CSVWriteBuilder(paths, formatName, supportsDataType, info) override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala index 92b47e4354807..bfbb1831aa63d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/csv/CSVWriteBuilder.scala @@ -20,19 +20,19 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.sql.catalyst.csv.CSVOptions import org.apache.spark.sql.catalyst.util.CompressionCodecs +import 
org.apache.spark.sql.connector.write.LogicalWriteInfo import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.csv.CsvOutputWriter import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructType} -import org.apache.spark.sql.util.CaseInsensitiveStringMap class CSVWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) - extends FileWriteBuilder(options, paths, formatName, supportsDataType) { + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) + extends FileWriteBuilder(paths, formatName, supportsDataType, info) { override def prepareWrite( sqlConf: SQLConf, job: Job, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala index 610bd4c1b9d85..7a0949e586cd8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonDataSourceV2.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.v2.json +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.v2._ -import org.apache.spark.sql.sources.v2.Table import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonPartitionReaderFactory.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonPartitionReaderFactory.scala index e5b7ae0bd228a..698423948f916 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonPartitionReaderFactory.scala @@ -19,11 +19,11 @@ package org.apache.spark.sql.execution.datasources.v2.json import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.json.{JacksonParser, JSONOptionsInRead} +import org.apache.spark.sql.connector.read.PartitionReader import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.json.JsonDataSource import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.PartitionReader import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScan.scala index 5c41bbd931982..75231625676ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScan.scala @@ -21,13 +21,13 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, SparkSession} -import org.apache.spark.sql.catalyst.expressions.ExprUtils +import org.apache.spark.sql.catalyst.expressions.{Expression, ExprUtils} import org.apache.spark.sql.catalyst.json.JSONOptionsInRead import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.read.PartitionReaderFactory 
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.json.JsonDataSource import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan} -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration @@ -38,8 +38,10 @@ case class JsonScan( dataSchema: StructType, readDataSchema: StructType, readPartitionSchema: StructType, - options: CaseInsensitiveStringMap) - extends TextBasedFileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema, options) { + options: CaseInsensitiveStringMap, + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty) + extends TextBasedFileScan(sparkSession, options) { private val parsedOptions = new JSONOptionsInRead( CaseInsensitiveMap(options.asScala.toMap), @@ -86,4 +88,16 @@ case class JsonScan( JsonPartitionReaderFactory(sparkSession.sessionState.conf, broadcastedConf, dataSchema, readDataSchema, readPartitionSchema, parsedOptions) } + + override def withFilters( + partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) + + override def equals(obj: Any): Boolean = obj match { + case j: JsonScan => super.equals(j) && dataSchema == j.dataSchema && options == j.options + + case _ => false + } + + override def hashCode(): Int = super.hashCode() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScanBuilder.scala index bb3c0366bdc2f..be53b1b1676f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScanBuilder.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonScanBuilder.scala @@ -17,9 +17,9 @@ package org.apache.spark.sql.execution.datasources.v2.json import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder -import org.apache.spark.sql.sources.v2.reader.Scan import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala index bbdd3ae69222a..4b66aec6acbed 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonTable.scala @@ -22,10 +22,10 @@ import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.json.JSONOptionsInRead +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.json.JsonDataSource import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -49,8 +49,8 @@ case class JsonTable( sparkSession, files, parsedOptions) } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = - new JsonWriteBuilder(options, paths, formatName, supportsDataType) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + new JsonWriteBuilder(paths, formatName, supportsDataType, info) override def supportsDataType(dataType: 
DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala index 3c99e07489a77..19f472057ea7d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/json/JsonWriteBuilder.scala @@ -20,19 +20,19 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.sql.catalyst.json.JSONOptions import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.json.JsonOutputWriter import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.CaseInsensitiveStringMap class JsonWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) - extends FileWriteBuilder(options, paths, formatName, supportsDataType) { + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) + extends FileWriteBuilder(paths, formatName, supportsDataType, info) { override def prepareWrite( sqlConf: SQLConf, job: Job, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala index 1ea80d2ba5fbc..8665af33b976a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcDataSourceV2.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.v2.orc +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.v2._ -import org.apache.spark.sql.sources.v2.Table import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala index ec923797e2691..03d58fdcb7f67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcPartitionReaderFactory.scala @@ -29,16 +29,14 @@ import org.apache.orc.mapreduce.OrcInputFormat import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.JoinedRow -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection -import org.apache.spark.sql.execution.datasources.{PartitionedFile, PartitioningUtils} +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} +import org.apache.spark.sql.execution.datasources.PartitionedFile import org.apache.spark.sql.execution.datasources.orc.{OrcColumnarBatchReader, OrcDeserializer, OrcUtils} import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReader} import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch -import 
org.apache.spark.util.SerializableConfiguration +import org.apache.spark.util.{SerializableConfiguration, Utils} /** * A factory used to create Orc readers. @@ -76,10 +74,11 @@ case class OrcPartitionReaderFactory( val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(filePath, readerOptions) - - val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, readDataSchema, reader, conf) + val requestedColIdsOrEmptyFile = + Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => + OrcUtils.requestedColumnIds( + isCaseSensitive, dataSchema, readDataSchema, reader, conf) + } if (requestedColIdsOrEmptyFile.isEmpty) { new EmptyPartitionReader[InternalRow] @@ -121,10 +120,11 @@ case class OrcPartitionReaderFactory( val fs = filePath.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(filePath, readerOptions) - - val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, readDataSchema, reader, conf) + val requestedColIdsOrEmptyFile = + Utils.tryWithResource(OrcFile.createReader(filePath, readerOptions)) { reader => + OrcUtils.requestedColumnIds( + isCaseSensitive, dataSchema, readDataSchema, reader, conf) + } if (requestedColIdsOrEmptyFile.isEmpty) { new EmptyPartitionReader diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala index a4fb03405d162..62894fa7a2538 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScan.scala @@ -20,10 +20,11 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession +import 
org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScan import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration @@ -36,8 +37,9 @@ case class OrcScan( readDataSchema: StructType, readPartitionSchema: StructType, options: CaseInsensitiveStringMap, - pushedFilters: Array[Filter]) - extends FileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema) { + pushedFilters: Array[Filter], + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty) extends FileScan { override def isSplitable(path: Path): Boolean = true override def createReaderFactory(): PartitionReaderFactory = { @@ -51,15 +53,19 @@ case class OrcScan( override def equals(obj: Any): Boolean = obj match { case o: OrcScan => - fileIndex == o.fileIndex && dataSchema == o.dataSchema && - readDataSchema == o.readDataSchema && readPartitionSchema == o.readPartitionSchema && - options == o.options && equivalentFilters(pushedFilters, o.pushedFilters) + super.equals(o) && dataSchema == o.dataSchema && options == o.options && + equivalentFilters(pushedFilters, o.pushedFilters) + case _ => false } override def hashCode(): Int = getClass.hashCode() override def description(): String = { - super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") + super.description() + ", PushedFilters: " + seqToString(pushedFilters) } + + override def withFilters( + partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala index 458b98c627be4..1421ffd8b6de4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcScanBuilder.scala @@ -22,11 +22,11 @@ import scala.collection.JavaConverters._ import org.apache.orc.mapreduce.OrcInputFormat import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.orc.OrcFilters import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -43,6 +43,8 @@ case class OrcScanBuilder( sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) } + override protected val supportsNestedSchemaPruning: Boolean = true + override def build(): Scan = { OrcScan(sparkSession, hadoopConf, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedFilters()) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala index 3fe433861a3c4..3ef41210de181 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcTable.scala @@ -21,10 +21,10 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.FileStatus import 
org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.orc.OrcUtils import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -43,8 +43,8 @@ case class OrcTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = OrcUtils.inferSchema(sparkSession, files, options.asScala.toMap) - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = - new OrcWriteBuilder(options, paths, formatName, supportsDataType) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + new OrcWriteBuilder(paths, formatName, supportsDataType, info) override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcWriteBuilder.scala index f5b06e11c8bd7..48044748708d9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/orc/OrcWriteBuilder.scala @@ -21,19 +21,19 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA} import org.apache.orc.mapred.OrcStruct +import org.apache.spark.sql.connector.write.LogicalWriteInfo import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.orc.{OrcFileFormat, OrcOptions, OrcOutputWriter, OrcUtils} import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.CaseInsensitiveStringMap class OrcWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) - extends FileWriteBuilder(options, paths, formatName, supportsDataType) { + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) + extends FileWriteBuilder(paths, formatName, supportsDataType, info) { override def prepareWrite( sqlConf: SQLConf, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala index 0b6d5a960374b..8cb6186c12ff3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetDataSourceV2.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.v2.parquet +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v2._ -import org.apache.spark.sql.sources.v2.Table import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala index a0f19c3dd2eb4..047bc74a8d81e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetPartitionReaderFactory.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2.parquet import java.net.URI -import java.util.TimeZone +import java.time.ZoneId import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ @@ -31,14 +31,13 @@ import org.apache.spark.TaskContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader} import org.apache.spark.sql.execution.datasources.{PartitionedFile, RecordReaderIterator} import org.apache.spark.sql.execution.datasources.parquet._ import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReader} import org.apache.spark.sql.types.{AtomicType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.SerializableConfiguration @@ -118,7 +117,7 @@ case class ParquetPartitionReaderFactory( file: PartitionedFile, buildReaderFunc: ( ParquetInputSplit, InternalRow, TaskAttemptContextImpl, Option[FilterPredicate], - Option[TimeZone]) => RecordReader[Void, T]): RecordReader[Void, T] = { + Option[ZoneId]) => RecordReader[Void, T]): RecordReader[Void, T] = { val conf = broadcastedConf.value.value val filePath = new Path(new URI(file.filePath)) @@ -157,7 +156,7 @@ case class ParquetPartitionReaderFactory( val convertTz = if (timestampConversion && !isCreatedByParquetMr) { - Some(DateTimeUtils.getTimeZone(conf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + Some(DateTimeUtils.getZoneId(conf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) } else { None } @@ -176,7 +175,7 
@@ case class ParquetPartitionReaderFactory( reader } - private def createRowBaseReader(file: PartitionedFile): RecordReader[Void, UnsafeRow] = { + private def createRowBaseReader(file: PartitionedFile): RecordReader[Void, InternalRow] = { buildReaderBase(file, createRowBaseParquetReader) } @@ -185,16 +184,16 @@ case class ParquetPartitionReaderFactory( partitionValues: InternalRow, hadoopAttemptContext: TaskAttemptContextImpl, pushed: Option[FilterPredicate], - convertTz: Option[TimeZone]): RecordReader[Void, UnsafeRow] = { + convertTz: Option[ZoneId]): RecordReader[Void, InternalRow] = { logDebug(s"Falling back to parquet-mr") val taskContext = Option(TaskContext.get()) - // ParquetRecordReader returns UnsafeRow + // ParquetRecordReader returns InternalRow val readSupport = new ParquetReadSupport(convertTz, enableVectorizedReader = false) val reader = if (pushed.isDefined && enableRecordFilter) { val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[UnsafeRow](readSupport, parquetFilter) + new ParquetRecordReader[InternalRow](readSupport, parquetFilter) } else { - new ParquetRecordReader[UnsafeRow](readSupport) + new ParquetRecordReader[InternalRow](readSupport) } val iter = new RecordReaderIterator(reader) // SPARK-23457 Register a task completion listener before `initialization`. 
@@ -214,7 +213,7 @@ case class ParquetPartitionReaderFactory( partitionValues: InternalRow, hadoopAttemptContext: TaskAttemptContextImpl, pushed: Option[FilterPredicate], - convertTz: Option[TimeZone]): VectorizedParquetRecordReader = { + convertTz: Option[ZoneId]): VectorizedParquetRecordReader = { val taskContext = Option(TaskContext.get()) val vectorizedReader = new VectorizedParquetRecordReader( convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined, capacity) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScan.scala index a67aa3b92ce82..bb315262a8211 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScan.scala @@ -21,13 +21,13 @@ import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetInputFormat import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.parquet.{ParquetReadSupport, ParquetWriteSupport} import org.apache.spark.sql.execution.datasources.v2.FileScan -import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration @@ -40,8 +40,9 @@ case class ParquetScan( readDataSchema: StructType, readPartitionSchema: StructType, pushedFilters: Array[Filter], - options: 
CaseInsensitiveStringMap) - extends FileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema) { + options: CaseInsensitiveStringMap, + partitionFilters: Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty) extends FileScan { override def isSplitable(path: Path): Boolean = true override def createReaderFactory(): PartitionReaderFactory = { @@ -81,11 +82,18 @@ case class ParquetScan( override def equals(obj: Any): Boolean = obj match { case p: ParquetScan => - fileIndex == p.fileIndex && dataSchema == p.dataSchema && - readDataSchema == p.readDataSchema && readPartitionSchema == p.readPartitionSchema && - options == p.options && equivalentFilters(pushedFilters, p.pushedFilters) + super.equals(p) && dataSchema == p.dataSchema && options == p.options && + equivalentFilters(pushedFilters, p.pushedFilters) case _ => false } override def hashCode(): Int = getClass.hashCode() + + override def description(): String = { + super.description() + ", PushedFilters: " + seqToString(pushedFilters) + } + + override def withFilters( + partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala index 4b8b434af88e6..2f861356e9499 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetScanBuilder.scala @@ -20,11 +20,11 @@ package org.apache.spark.sql.execution.datasources.v2.parquet import scala.collection.JavaConverters._ import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} import 
org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.parquet.{ParquetFilters, SparkToParquetSchemaConverter} import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder import org.apache.spark.sql.sources.Filter -import org.apache.spark.sql.sources.v2.reader.{Scan, SupportsPushDownFilters} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -56,6 +56,8 @@ case class ParquetScanBuilder( parquetFilters.convertibleFilters(this.filters).toArray } + override protected val supportsNestedSchemaPruning: Boolean = true + private var filters: Array[Filter] = Array.empty override def pushFilters(filters: Array[Filter]): Array[Filter] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala index dce851dbcd336..e9f9bf8df35e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetTable.scala @@ -21,10 +21,10 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -43,8 +43,8 @@ case class ParquetTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = ParquetUtils.inferSchema(sparkSession, options.asScala.toMap, files) - override def 
newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = - new ParquetWriteBuilder(options, paths, formatName, supportsDataType) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + new ParquetWriteBuilder(paths, formatName, supportsDataType, info) override def supportsDataType(dataType: DataType): Boolean = dataType match { case _: AtomicType => true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala index bfe2084299df3..a4e22c21a11f3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/parquet/ParquetWriteBuilder.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.sql.execution.datasources.v2.parquet -import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.{Job, OutputCommitter, TaskAttemptContext} import org.apache.parquet.hadoop.{ParquetOutputCommitter, ParquetOutputFormat} import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel @@ -25,19 +24,19 @@ import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.Row +import org.apache.spark.sql.connector.write.LogicalWriteInfo import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.parquet._ import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.CaseInsensitiveStringMap class ParquetWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) - extends FileWriteBuilder(options, paths, formatName, supportsDataType) 
with Logging { + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) + extends FileWriteBuilder(paths, formatName, supportsDataType, info) with Logging { override def prepareWrite( sqlConf: SQLConf, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala index f6aa1e9c898b9..049c717effa26 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextDataSourceV2.scala @@ -16,10 +16,10 @@ */ package org.apache.spark.sql.execution.datasources.v2.text +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.text.TextFileFormat import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 -import org.apache.spark.sql.sources.v2.Table import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextPartitionReaderFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextPartitionReaderFactory.scala index 8788887111880..0cd184da6ef8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextPartitionReaderFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextPartitionReaderFactory.scala @@ -21,11 +21,11 @@ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter +import org.apache.spark.sql.connector.read.PartitionReader import 
org.apache.spark.sql.execution.datasources.{HadoopFileLinesReader, HadoopFileWholeTextReader, PartitionedFile} import org.apache.spark.sql.execution.datasources.text.TextOptions import org.apache.spark.sql.execution.datasources.v2._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.PartitionReader import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala index 89b0511442d4a..e75de2c4a4079 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala @@ -21,10 +21,11 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.connector.read.PartitionReaderFactory import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.text.TextOptions -import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory +import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.SerializableConfiguration @@ -34,8 +35,10 @@ case class TextScan( fileIndex: PartitioningAwareFileIndex, readDataSchema: StructType, readPartitionSchema: StructType, - options: CaseInsensitiveStringMap) - extends TextBasedFileScan(sparkSession, fileIndex, readDataSchema, readPartitionSchema, options) { + options: CaseInsensitiveStringMap, + partitionFilters: 
Seq[Expression] = Seq.empty, + dataFilters: Seq[Expression] = Seq.empty) + extends TextBasedFileScan(sparkSession, options) { private val optionsAsScala = options.asScala.toMap private lazy val textOptions: TextOptions = new TextOptions(optionsAsScala) @@ -67,4 +70,16 @@ case class TextScan( TextPartitionReaderFactory(sparkSession.sessionState.conf, broadcastedConf, readDataSchema, readPartitionSchema, textOptions) } + + override def withFilters( + partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = + this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) + + override def equals(obj: Any): Boolean = obj match { + case t: TextScan => super.equals(t) && options == t.options + + case _ => false + } + + override def hashCode(): Int = super.hashCode() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScanBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScanBuilder.scala index fbe5e1688b836..b2b518c12b01a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScanBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScanBuilder.scala @@ -18,9 +18,9 @@ package org.apache.spark.sql.execution.datasources.v2.text import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder -import org.apache.spark.sql.sources.v2.reader.Scan import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala index b8cb61a6c646e..36304a9b17a1e 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextTable.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.execution.datasources.v2.text import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.v2.FileTable -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.types.{DataType, StringType, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -39,8 +39,8 @@ case class TextTable( override def inferSchema(files: Seq[FileStatus]): Option[StructType] = Some(StructType(Seq(StructField("value", StringType)))) - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = - new TextWriteBuilder(options, paths, formatName, supportsDataType) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = + new TextWriteBuilder(paths, formatName, supportsDataType, info) override def supportsDataType(dataType: DataType): Boolean = dataType == StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextWriteBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextWriteBuilder.scala index c00dbc20be64a..a3bf4dcae3f33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextWriteBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextWriteBuilder.scala @@ -20,19 +20,19 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.util.CompressionCodecs +import org.apache.spark.sql.connector.write.LogicalWriteInfo 
import org.apache.spark.sql.execution.datasources.{CodecStreams, OutputWriter, OutputWriterFactory} import org.apache.spark.sql.execution.datasources.text.{TextOptions, TextOutputWriter} import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ -import org.apache.spark.sql.util.CaseInsensitiveStringMap class TextWriteBuilder( - options: CaseInsensitiveStringMap, paths: Seq[String], formatName: String, - supportsDataType: DataType => Boolean) - extends FileWriteBuilder(options, paths, formatName, supportsDataType) { + supportsDataType: DataType => Boolean, + info: LogicalWriteInfo) + extends FileWriteBuilder(paths, formatName, supportsDataType, info) { private def verifySchema(schema: StructType): Unit = { if (schema.size != 1) { throw new AnalysisException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala index 03adeaaa66569..6a57ef2cafe23 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution import java.util.Collections import scala.collection.JavaConverters._ +import scala.util.control.NonFatal import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging @@ -27,7 +28,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode} +import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeFormatter, CodegenContext, CodeGenerator, ExprCode} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import 
org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.sql.catalyst.util.StringUtils.StringConcat @@ -81,11 +82,20 @@ package object debug { def writeCodegen(append: String => Unit, plan: SparkPlan): Unit = { val codegenSeq = codegenStringSeq(plan) append(s"Found ${codegenSeq.size} WholeStageCodegen subtrees.\n") - for (((subtree, code), i) <- codegenSeq.zipWithIndex) { - append(s"== Subtree ${i + 1} / ${codegenSeq.size} ==\n") + for (((subtree, code, codeStats), i) <- codegenSeq.zipWithIndex) { + val usedConstPoolRatio = if (codeStats.maxConstPoolSize > 0) { + val rt = 100.0 * codeStats.maxConstPoolSize / CodeGenerator.MAX_JVM_CONSTANT_POOL_SIZE + "(%.2f%% used)".format(rt) + } else { + "" + } + val codeStatsStr = s"maxMethodCodeSize:${codeStats.maxMethodCodeSize}; " + + s"maxConstantPoolSize:${codeStats.maxConstPoolSize}$usedConstPoolRatio; " + + s"numInnerClasses:${codeStats.numInnerClasses}" + append(s"== Subtree ${i + 1} / ${codegenSeq.size} ($codeStatsStr) ==\n") append(subtree) append("\nGenerated code:\n") - append(s"${code}\n") + append(s"$code\n") } } @@ -95,7 +105,7 @@ package object debug { * @param plan the query plan for codegen * @return Sequence of WholeStageCodegen subtrees and corresponding codegen */ - def codegenStringSeq(plan: SparkPlan): Seq[(String, String)] = { + def codegenStringSeq(plan: SparkPlan): Seq[(String, String, ByteCodeStats)] = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan transform { case s: WholeStageCodegenExec => @@ -105,7 +115,13 @@ package object debug { } codegenSubtrees.toSeq.map { subtree => val (_, source) = subtree.doCodeGen() - (subtree.toString, CodeFormatter.format(source)) + val codeStats = try { + CodeGenerator.compile(source)._2 + } catch { + case NonFatal(_) => + ByteCodeStats.UNAVAILABLE + } + (subtree.toString, CodeFormatter.format(source), codeStats) } } @@ -130,7 +146,7 @@ package object debug { * @param query the streaming query for codegen * @return 
Sequence of WholeStageCodegen subtrees and corresponding codegen */ - def codegenStringSeq(query: StreamingQuery): Seq[(String, String)] = { + def codegenStringSeq(query: StreamingQuery): Seq[(String, String, ByteCodeStats)] = { val w = asStreamExecution(query) if (w.lastExecution != null) { codegenStringSeq(w.lastExecution.executedPlan) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index c56a5c015f32d..ab4176cada527 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.internal.SQLConf */ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { private def defaultNumPreShufflePartitions: Int = - if (conf.adaptiveExecutionEnabled) { + if (conf.adaptiveExecutionEnabled && conf.reducePostShufflePartitionsEnabled) { conf.maxNumPostShufflePartitions } else { conf.numShufflePartitions @@ -83,7 +83,24 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { numPartitionsSet.headOption } - val targetNumPartitions = requiredNumPartitions.getOrElse(childrenNumPartitions.max) + // If there are non-shuffle children that satisfy the required distribution, we have + // some tradeoffs when picking the expected number of shuffle partitions: + // 1. We should avoid shuffling these children. + // 2. We should have a reasonable parallelism. + val nonShuffleChildrenNumPartitions = + childrenIndexes.map(children).filterNot(_.isInstanceOf[ShuffleExchangeExec]) + .map(_.outputPartitioning.numPartitions) + val expectedChildrenNumPartitions = if (nonShuffleChildrenNumPartitions.nonEmpty) { + // Here we pick the max number of partitions among these non-shuffle children as the + // expected number of shuffle partitions. 
However, if it's smaller than + // `conf.numShufflePartitions`, we pick `conf.numShufflePartitions` as the + // expected number of shuffle partitions. + math.max(nonShuffleChildrenNumPartitions.max, conf.numShufflePartitions) + } else { + childrenNumPartitions.max + } + + val targetNumPartitions = requiredNumPartitions.getOrElse(expectedChildrenNumPartitions) children = children.zip(requiredChildDistributions).zipWithIndex.map { case ((child, distribution), index) if childrenIndexes.contains(index) => @@ -188,10 +205,11 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { ShuffledHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, left, right) - case SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, left, right) => + case SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, left, right, isPartial) => val (reorderedLeftKeys, reorderedRightKeys) = reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) - SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, left, right) + SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, + left, right, isPartial) case other => other } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala index 3315ae7dabef1..849ff384c130a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ExplainUtils, LeafExecNode, SparkPlan, UnaryExecNode} 
+import org.apache.spark.sql.execution._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -71,7 +71,7 @@ case class ReusedExchangeExec(override val output: Seq[Attribute], child: Exchan // `ReusedExchangeExec` can have distinct set of output attribute ids from its child, we need // to update the attribute ids in `outputPartitioning` and `outputOrdering`. - private lazy val updateAttr: Expression => Expression = { + private[sql] lazy val updateAttr: Expression => Expression = { val originalAttrToNewAttr = AttributeMap(child.output.zip(output)) e => e.transform { case attr: Attribute => originalAttrToNewAttr.getOrElse(attr, attr) @@ -109,9 +109,10 @@ case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { } // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls. val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]() - plan.transformUp { + + // Replace an Exchange duplicate with a ReusedExchange + def reuse: PartialFunction[Exchange, SparkPlan] = { case exchange: Exchange => - // the exchanges that have same results usually also have same schemas (same column names).
val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]()) val samePlan = sameSchema.find { e => exchange.sameResult(e) @@ -125,5 +126,16 @@ case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] { exchange } } + + plan transformUp { + case exchange: Exchange => reuse(exchange) + } transformAllExpressions { + // Lookup inside subqueries for duplicate exchanges + case in: InSubqueryExec => + val newIn = in.plan.transformUp { + case exchange: Exchange => reuse(exchange) + } + in.copy(plan = newIn.asInstanceOf[BaseSubqueryExec]) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala index 2f4c5734469f8..4b08da043b83e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchangeExec.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.execution.exchange import java.util.Random import java.util.function.Supplier +import scala.concurrent.Future + import org.apache.spark._ import org.apache.spark.internal.config import org.apache.spark.rdd.RDD @@ -28,10 +30,11 @@ import org.apache.spark.shuffle.{ShuffleWriteMetricsReporter, ShuffleWriteProces import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, BoundReference, Divide, Literal, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive.LocalShuffledRowRDD import 
org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType @@ -46,11 +49,9 @@ case class ShuffleExchangeExec( child: SparkPlan, canChangeNumPartitions: Boolean = true) extends Exchange { - // NOTE: coordinator can be null after serialization/deserialization, - // e.g. it can be null on the Executor side private lazy val writeMetrics = SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext) - private lazy val readMetrics = + private[sql] lazy val readMetrics = SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) override lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size") @@ -63,6 +64,15 @@ case class ShuffleExchangeExec( @transient lazy val inputRDD: RDD[InternalRow] = child.execute() + // 'mapOutputStatisticsFuture' is only needed when enable AQE. + @transient lazy val mapOutputStatisticsFuture: Future[MapOutputStatistics] = { + if (inputRDD.getNumPartitions == 0) { + Future.successful(null) + } else { + sparkContext.submitMapStage(shuffleDependency) + } + } + /** * A [[ShuffleDependency]] that will partition rows of its child based on * the partitioning scheme defined in `newPartitioning`. Those partitions of @@ -82,6 +92,11 @@ case class ShuffleExchangeExec( new ShuffledRowRDD(shuffleDependency, readMetrics, partitionStartIndices) } + def createLocalShuffleRDD( + partitionStartIndicesPerMapper: Array[Array[Int]]): LocalShuffledRowRDD = { + new LocalShuffledRowRDD(shuffleDependency, readMetrics, partitionStartIndicesPerMapper) + } + /** * Caches the created ShuffleRowRDD so we can reuse that. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilder.scala new file mode 100644 index 0000000000000..e1f42d7abe0fe --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilder.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.history + +import scala.collection.mutable + +import org.apache.spark.deploy.history.{EventFilter, EventFilterBuilder, JobEventFilter} +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.ui._ +import org.apache.spark.sql.streaming.StreamingQueryListener + +/** + * This class tracks live SQL executions, and pass the list to the [[SQLLiveEntitiesEventFilter]] + * to help SQLLiveEntitiesEventFilter to accept live SQL executions as well as relevant + * jobs (+ stages/tasks/RDDs). 
+ * + * Note that this class only tracks the jobs which are relevant to SQL executions - cannot classify + * between finished job and live job without relation of SQL execution. + */ +private[spark] class SQLEventFilterBuilder extends SparkListener with EventFilterBuilder { + private val liveExecutionToJobs = new mutable.HashMap[Long, mutable.Set[Int]] + private val jobToStages = new mutable.HashMap[Int, Set[Int]] + private val stageToTasks = new mutable.HashMap[Int, mutable.Set[Long]] + private val stageToRDDs = new mutable.HashMap[Int, Set[Int]] + private val stages = new mutable.HashSet[Int] + + private[history] def liveSQLExecutions: Set[Long] = liveExecutionToJobs.keySet.toSet + private[history] def liveJobs: Set[Int] = liveExecutionToJobs.values.flatten.toSet + private[history] def liveStages: Set[Int] = stageToRDDs.keySet.toSet + private[history] def liveTasks: Set[Long] = stageToTasks.values.flatten.toSet + private[history] def liveRDDs: Set[Int] = stageToRDDs.values.flatten.toSet + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + val executionIdString = jobStart.properties.getProperty(SQLExecution.EXECUTION_ID_KEY) + if (executionIdString == null) { + // This is not a job created by SQL + return + } + + val executionId = executionIdString.toLong + val jobId = jobStart.jobId + + val jobsForExecution = liveExecutionToJobs.getOrElseUpdate(executionId, + mutable.HashSet[Int]()) + jobsForExecution += jobId + + jobToStages += jobStart.jobId -> jobStart.stageIds.toSet + stages ++= jobStart.stageIds + } + + override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = { + val stageId = stageSubmitted.stageInfo.stageId + if (stages.contains(stageId)) { + stageToRDDs.put(stageId, stageSubmitted.stageInfo.rddInfos.map(_.id).toSet) + stageToTasks.getOrElseUpdate(stageId, new mutable.HashSet[Long]()) + } + } + + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + stageToTasks.get(taskStart.stageId).foreach 
{ tasks => + tasks += taskStart.taskInfo.taskId + } + } + + override def onOtherEvent(event: SparkListenerEvent): Unit = event match { + case e: SparkListenerSQLExecutionStart => onExecutionStart(e) + case e: SparkListenerSQLExecutionEnd => onExecutionEnd(e) + case _ => // Ignore + } + + private def onExecutionStart(event: SparkListenerSQLExecutionStart): Unit = { + liveExecutionToJobs += event.executionId -> mutable.HashSet[Int]() + } + + private def onExecutionEnd(event: SparkListenerSQLExecutionEnd): Unit = { + liveExecutionToJobs.remove(event.executionId).foreach { jobs => + val stagesToDrop = jobToStages.filter(kv => jobs.contains(kv._1)).values.flatten + jobToStages --= jobs + stages --= stagesToDrop + stageToTasks --= stagesToDrop + stageToRDDs --= stagesToDrop + } + } + + override def createFilter(): EventFilter = { + new SQLLiveEntitiesEventFilter(liveSQLExecutions, liveJobs, liveStages, liveTasks, liveRDDs) + } +} + +/** + * This class accepts events which are related to the live SQL executions based on the given + * information. + * + * Note that acceptFn will not match the event ("Don't mind") instead of returning false on + * job related events, because it cannot determine whether the job is related to the finished + * SQL executions, or job is NOT related to the SQL executions. For this case, it just gives up + * the decision and let other filters decide it. 
+ */ +private[spark] class SQLLiveEntitiesEventFilter( + liveSQLExecutions: Set[Long], + liveJobs: Set[Int], + liveStages: Set[Int], + liveTasks: Set[Long], + liveRDDs: Set[Int]) + extends JobEventFilter(None, liveJobs, liveStages, liveTasks, liveRDDs) with Logging { + + logDebug(s"live SQL executions : $liveSQLExecutions") + + private val _acceptFn: PartialFunction[SparkListenerEvent, Boolean] = { + case e: SparkListenerSQLExecutionStart => + liveSQLExecutions.contains(e.executionId) + case e: SparkListenerSQLAdaptiveExecutionUpdate => + liveSQLExecutions.contains(e.executionId) + case e: SparkListenerSQLExecutionEnd => + liveSQLExecutions.contains(e.executionId) + case e: SparkListenerDriverAccumUpdates => + liveSQLExecutions.contains(e.executionId) + + case e if acceptFnForJobEvents.lift(e).contains(true) => + // NOTE: if acceptFnForJobEvents(e) returns false, we should leave it to "unmatched" + // because we don't know whether the job has relevant SQL execution which is finished, + // or the job is not related to the SQL execution. 
+ true + + // these events are for finished batches so safer to ignore + case _: StreamingQueryListener.QueryProgressEvent => false + } + + override def acceptFn(): PartialFunction[SparkListenerEvent, Boolean] = _acceptFn +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala index f526a19876670..5517c0dcdb188 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoinExec.scala @@ -19,14 +19,12 @@ package org.apache.spark.sql.execution.joins import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.collection.{BitSet, CompactBuffer} case class BroadcastNestedLoopJoinExec( @@ -84,7 +82,7 @@ case class BroadcastNestedLoopJoinExec( @transient private lazy val boundCondition = { if (condition.isDefined) { - newPredicate(condition.get, streamed.output ++ broadcast.output).eval _ + Predicate.create(condition.get, streamed.output ++ broadcast.output).eval _ } else { (r: InternalRow) => true } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala index 88d98530991c9..29645a736548c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProductExec.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, Predicate, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner -import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.execution.{BinaryExecNode, ExplainUtils, ExternalAppendOnlyUnsafeRowArray, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.CompletionIterator @@ -93,7 +92,7 @@ case class CartesianProductExec( pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { - val boundCondition = newPredicate(condition.get, left.output ++ right.output) + val boundCondition = Predicate.create(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala index e8938cb22e890..137f0b87a2f3d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashJoin.scala @@ -99,7 +99,7 @@ trait HashJoin { UnsafeProjection.create(streamedKeys) @transient private[this] lazy val boundCondition = if (condition.isDefined) { - newPredicate(condition.get, streamedPlan.output ++ buildPlan.output).eval _ + Predicate.create(condition.get, streamedPlan.output ++ buildPlan.output).eval _ } else { (r: 
InternalRow) => true } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index efe03e0f9ab46..4001338662d53 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -335,7 +335,7 @@ private[joins] object UnsafeHashedRelation { if (!success) { binaryMap.free() // scalastyle:off throwerror - throw new SparkOutOfMemoryError("There is no enough memory to build hash map") + throw new SparkOutOfMemoryError("There is not enough memory to build hash map") // scalastyle:on throwerror } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 189727a9bc88d..62eea611556ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -40,11 +40,18 @@ case class SortMergeJoinExec( joinType: JoinType, condition: Option[Expression], left: SparkPlan, - right: SparkPlan) extends BinaryExecNode with CodegenSupport { + right: SparkPlan, + isSkewJoin: Boolean = false) extends BinaryExecNode with CodegenSupport { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) + override def nodeName: String = { + if (isSkewJoin) super.nodeName + "(skew=true)" else super.nodeName + } + + override def stringArgs: Iterator[Any] = super.stringArgs.toSeq.dropRight(1).iterator + override def simpleStringWithNodeId(): String = { val opId = ExplainUtils.getOpId(this) s"$nodeName $joinType ($opId)".trim @@ -95,8 +102,15 @@ case class SortMergeJoinExec( s"${getClass.getSimpleName} should not take $x as the JoinType") } - override 
def requiredChildDistribution: Seq[Distribution] = - HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil + override def requiredChildDistribution: Seq[Distribution] = { + if (isSkewJoin) { + // We re-arrange the shuffle partitions to deal with skew join, and the new children + // partitioning doesn't satisfy `HashClusteredDistribution`. + UnspecifiedDistribution :: UnspecifiedDistribution :: Nil + } else { + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil + } + } override def outputOrdering: Seq[SortOrder] = joinType match { // For inner join, orders of both sides keys should be kept. @@ -168,14 +182,14 @@ case class SortMergeJoinExec( left.execute().zipPartitions(right.execute()) { (leftIter, rightIter) => val boundCondition: (InternalRow) => Boolean = { condition.map { cond => - newPredicate(cond, left.output ++ right.output).eval _ + Predicate.create(cond, left.output ++ right.output).eval _ }.getOrElse { (r: InternalRow) => true } } // An ordering that can be used to compare keys from both sides. 
- val keyOrdering = newNaturalAscendingOrdering(leftKeys.map(_.dataType)) + val keyOrdering = RowOrdering.createNaturalAscendingOrdering(leftKeys.map(_.dataType)) val resultProj: InternalRow => InternalRow = UnsafeProjection.create(output, output) joinType match { @@ -191,7 +205,8 @@ case class SortMergeJoinExec( RowIterator.fromScala(leftIter), RowIterator.fromScala(rightIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) private[this] val joinRow = new JoinedRow @@ -235,7 +250,8 @@ case class SortMergeJoinExec( streamedIter = RowIterator.fromScala(leftIter), bufferedIter = RowIterator.fromScala(rightIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) val rightNullRow = new GenericInternalRow(right.output.length) new LeftOuterIterator( @@ -249,7 +265,8 @@ case class SortMergeJoinExec( streamedIter = RowIterator.fromScala(rightIter), bufferedIter = RowIterator.fromScala(leftIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) val leftNullRow = new GenericInternalRow(left.output.length) new RightOuterIterator( @@ -283,7 +300,8 @@ case class SortMergeJoinExec( RowIterator.fromScala(leftIter), RowIterator.fromScala(rightIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) private[this] val joinRow = new JoinedRow @@ -318,7 +336,8 @@ case class SortMergeJoinExec( RowIterator.fromScala(leftIter), RowIterator.fromScala(rightIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) private[this] val joinRow = new JoinedRow @@ -360,7 +379,8 @@ case class SortMergeJoinExec( RowIterator.fromScala(leftIter), RowIterator.fromScala(rightIter), inMemoryThreshold, - spillThreshold + spillThreshold, + cleanupResources ) private[this] val joinRow = new JoinedRow @@ -640,6 +660,9 @@ case class SortMergeJoinExec( (evaluateVariables(leftVars), "") } + val thisPlan = ctx.addReferenceObj("plan", this) + val eagerCleanup = 
s"$thisPlan.cleanupResources();" + s""" |while (findNextInnerJoinRows($leftInput, $rightInput)) { | ${leftVarDecl.mkString("\n")} @@ -653,6 +676,7 @@ case class SortMergeJoinExec( | } | if (shouldStop()) return; |} + |$eagerCleanup """.stripMargin } } @@ -678,6 +702,7 @@ case class SortMergeJoinExec( * @param inMemoryThreshold Threshold for number of rows guaranteed to be held in memory by * internal buffer * @param spillThreshold Threshold for number of rows to be spilled by internal buffer + * @param eagerCleanupResources the eager cleanup function to be invoked when no join row found */ private[joins] class SortMergeJoinScanner( streamedKeyGenerator: Projection, @@ -686,7 +711,8 @@ private[joins] class SortMergeJoinScanner( streamedIter: RowIterator, bufferedIter: RowIterator, inMemoryThreshold: Int, - spillThreshold: Int) { + spillThreshold: Int, + eagerCleanupResources: () => Unit) { private[this] var streamedRow: InternalRow = _ private[this] var streamedRowKey: InternalRow = _ private[this] var bufferedRow: InternalRow = _ @@ -710,7 +736,8 @@ private[joins] class SortMergeJoinScanner( def getBufferedMatches: ExternalAppendOnlyUnsafeRowArray = bufferedMatches /** - * Advances both input iterators, stopping when we have found rows with matching join keys. + * Advances both input iterators, stopping when we have found rows with matching join keys. If no + * join rows found, try to do the eager resources cleanup. * @return true if matching rows have been found and false otherwise. If this returns true, then * [[getStreamedRow]] and [[getBufferedMatches]] can be called to construct the join * results. @@ -720,7 +747,7 @@ private[joins] class SortMergeJoinScanner( // Advance the streamed side of the join until we find the next row whose join key contains // no nulls or we hit the end of the streamed iterator. } - if (streamedRow == null) { + val found = if (streamedRow == null) { // We have consumed the entire streamed iterator, so there can be no more matches. 
matchJoinKey = null bufferedMatches.clear() @@ -760,17 +787,19 @@ private[joins] class SortMergeJoinScanner( true } } + if (!found) eagerCleanupResources() + found } /** * Advances the streamed input iterator and buffers all rows from the buffered input that - * have matching keys. + * have matching keys. If no join rows found, try to do the eager resources cleanup. * @return true if the streamed iterator returned a row, false otherwise. If this returns true, * then [[getStreamedRow]] and [[getBufferedMatches]] can be called to produce the outer * join results. */ final def findNextOuterJoinRows(): Boolean = { - if (!advancedStreamed()) { + val found = if (!advancedStreamed()) { // We have consumed the entire streamed iterator, so there can be no more matches. matchJoinKey = null bufferedMatches.clear() @@ -800,6 +829,8 @@ private[joins] class SortMergeJoinScanner( // If there is a streamed input then we always return true true } + if (!found) eagerCleanupResources() + found } // --- Private methods -------------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index 2ff08883d5cab..ddbd0a343ffcf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -65,6 +65,28 @@ case class CollectLimitExec(limit: Int, child: SparkPlan) extends LimitExec { } } +/** + * Take the last `limit` elements and collect them to a single partition. + * + * This operator will be used when a logical `Tail` operation is the final operator in an + * logical plan, which happens when the user is collecting results back to the driver. 
+ */ +case class CollectTailExec(limit: Int, child: SparkPlan) extends LimitExec { + override def output: Seq[Attribute] = child.output + override def outputPartitioning: Partitioning = SinglePartition + override def executeCollect(): Array[InternalRow] = child.executeTail(limit) + protected override def doExecute(): RDD[InternalRow] = { + // This is a bit hacky way to avoid a shuffle and scanning all data when it performs + // at `Dataset.tail`. + // Since this execution plan and `execute` are currently called only when + // `Dataset.tail` is invoked, the jobs are always executed when they are supposed to be. + + // If we use this execution plan separately like `Dataset.limit` without an actual + // job launch, we might just have to mimic the implementation of `CollectLimitExec`. + sparkContext.parallelize(executeCollect(), numSlices = 1) + } +} + object BaseLimitExec { private val curId = new java.util.concurrent.atomic.AtomicInteger() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 19809b07508d9..65aabe004d75b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution.metric import java.text.NumberFormat -import java.util.Locale +import java.util.{Arrays, Locale} import scala.concurrent.duration._ @@ -50,14 +50,19 @@ class SQLMetric(val metricType: String, initValue: Long = 0L) extends Accumulato override def reset(): Unit = _value = _zeroValue override def merge(other: AccumulatorV2[Long, Long]): Unit = other match { - case o: SQLMetric => _value += o.value + case o: SQLMetric => + if (_value < 0) _value = 0 + if (o.value > 0) _value += o.value case _ => throw new UnsupportedOperationException( s"Cannot merge ${this.getClass.getName} with 
${other.getClass.getName}") } override def isZero(): Boolean = _value == _zeroValue - override def add(v: Long): Unit = _value += v + override def add(v: Long): Unit = { + if (_value < 0) _value = 0 + _value += v + } // We can set a double value to `SQLMetric` which stores only long value, if it is // average metrics. @@ -65,7 +70,7 @@ class SQLMetric(val metricType: String, initValue: Long = 0L) extends Accumulato def set(v: Long): Unit = _value = v - def +=(v: Long): Unit = _value += v + def +=(v: Long): Unit = add(v) override def value: Long = _value @@ -111,7 +116,8 @@ object SQLMetrics { // data size total (min, med, max): // 100GB (100MB, 1GB, 10GB) val acc = new SQLMetric(SIZE_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max)"), countFailedValues = false) + acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), + countFailedValues = false) acc } @@ -120,14 +126,16 @@ object SQLMetrics { // duration(min, med, max): // 5s (800ms, 1s, 2s) val acc = new SQLMetric(TIMING_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max)"), countFailedValues = false) + acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), + countFailedValues = false) acc } def createNanoTimingMetric(sc: SparkContext, name: String): SQLMetric = { // Same with createTimingMetric, just normalize the unit of time to millisecond. 
val acc = new SQLMetric(NS_TIMING_METRIC, -1) - acc.register(sc, name = Some(s"$name total (min, med, max)"), countFailedValues = false) + acc.register(sc, name = Some(s"$name total (min, med, max (stageId (attemptId): taskId))"), + countFailedValues = false) acc } @@ -142,30 +150,46 @@ object SQLMetrics { // probe avg (min, med, max): // (1.2, 2.2, 6.3) val acc = new SQLMetric(AVERAGE_METRIC) - acc.register(sc, name = Some(s"$name (min, med, max)"), countFailedValues = false) + acc.register(sc, name = Some(s"$name (min, med, max (stageId (attemptId): taskId))"), + countFailedValues = false) acc } + private def toNumberFormat(value: Long): String = { + val numberFormat = NumberFormat.getNumberInstance(Locale.US) + numberFormat.format(value.toDouble / baseForAvgMetric) + } + + def metricNeedsMax(metricsType: String): Boolean = { + metricsType != SUM_METRIC + } + /** * A function that defines how we aggregate the final accumulator results among all tasks, * and represent it in string for a SQL physical operator. 
- */ - def stringValue(metricsType: String, values: Seq[Long]): String = { + */ + def stringValue(metricsType: String, values: Array[Long], maxMetrics: Array[Long]): String = { + // stringMetric = "(driver)" OR (stage $stageId (attempt $attemptId): task $taskId)) + val stringMetric = if (maxMetrics.isEmpty) { + "(driver)" + } else { + s"(stage ${maxMetrics(1)} (attempt ${maxMetrics(2)}): task ${maxMetrics(3)})" + } if (metricsType == SUM_METRIC) { val numberFormat = NumberFormat.getIntegerInstance(Locale.US) numberFormat.format(values.sum) } else if (metricsType == AVERAGE_METRIC) { - val numberFormat = NumberFormat.getNumberInstance(Locale.US) - val validValues = values.filter(_ > 0) val Seq(min, med, max) = { val metric = if (validValues.isEmpty) { - Seq.fill(3)(0L) + val zeros = Seq.fill(3)(0L) + zeros.map(v => toNumberFormat(v)) } else { - val sorted = validValues.sorted - Seq(sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) + Arrays.sort(validValues) + Seq(toNumberFormat(validValues(0)), toNumberFormat(validValues(validValues.length / 2)), + s"${toNumberFormat(validValues(validValues.length - 1))} $stringMetric") } - metric.map(v => numberFormat.format(v.toDouble / baseForAvgMetric)) + metric } s"\n($min, $med, $max)" } else { @@ -182,12 +206,15 @@ object SQLMetrics { val validValues = values.filter(_ >= 0) val Seq(sum, min, med, max) = { val metric = if (validValues.isEmpty) { - Seq.fill(4)(0L) + val zeros = Seq.fill(4)(0L) + zeros.map(v => strFormat(v)) } else { - val sorted = validValues.sorted - Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) + Arrays.sort(validValues) + Seq(strFormat(validValues.sum), strFormat(validValues(0)), + strFormat(validValues(validValues.length / 2)), + s"${strFormat(validValues(validValues.length - 1))} $stringMetric") } - metric.map(strFormat) + metric } s"\n$sum ($min, $med, $max)" } diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 5101f7e871af2..b44b13c8de0da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -19,12 +19,9 @@ package org.apache.spark.sql.execution.python import java.io._ import java.net._ -import java.util.concurrent.atomic.AtomicBoolean - -import scala.collection.JavaConverters._ import org.apache.arrow.vector.VectorSchemaRoot -import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} +import org.apache.arrow.vector.ipc.ArrowStreamWriter import org.apache.spark._ import org.apache.spark.api.python._ @@ -33,7 +30,7 @@ import org.apache.spark.sql.execution.arrow.ArrowWriter import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.util.ArrowUtils -import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} +import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.Utils /** @@ -46,8 +43,8 @@ class ArrowPythonRunner( schema: StructType, timeZoneId: String, conf: Map[String, String]) - extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch]( - funcs, evalType, argOffsets) { + extends BasePythonRunner[Iterator[InternalRow], ColumnarBatch](funcs, evalType, argOffsets) + with PythonArrowOutput { override val bufferSize: Int = SQLConf.get.pandasUDFBufferSize require( @@ -119,72 +116,4 @@ class ArrowPythonRunner( } } - protected override def newReaderIterator( - stream: DataInputStream, - writerThread: WriterThread, - startTime: Long, - env: SparkEnv, - worker: Socket, - releasedOrClosed: AtomicBoolean, - context: TaskContext): Iterator[ColumnarBatch] = { - new ReaderIterator(stream, writerThread, startTime, env, worker, releasedOrClosed, 
context) { - - private val allocator = ArrowUtils.rootAllocator.newChildAllocator( - s"stdin reader for $pythonExec", 0, Long.MaxValue) - - private var reader: ArrowStreamReader = _ - private var root: VectorSchemaRoot = _ - private var schema: StructType = _ - private var vectors: Array[ColumnVector] = _ - - context.addTaskCompletionListener[Unit] { _ => - if (reader != null) { - reader.close(false) - } - allocator.close() - } - - private var batchLoaded = true - - protected override def read(): ColumnarBatch = { - if (writerThread.exception.isDefined) { - throw writerThread.exception.get - } - try { - if (reader != null && batchLoaded) { - batchLoaded = reader.loadNextBatch() - if (batchLoaded) { - val batch = new ColumnarBatch(vectors) - batch.setNumRows(root.getRowCount) - batch - } else { - reader.close(false) - allocator.close() - // Reach end of stream. Call `read()` again to read control data. - read() - } - } else { - stream.readInt() match { - case SpecialLengths.START_ARROW_STREAM => - reader = new ArrowStreamReader(stream, allocator) - root = reader.getVectorSchemaRoot() - schema = ArrowUtils.fromArrowSchema(root.getSchema()) - vectors = root.getFieldVectors().asScala.map { vector => - new ArrowColumnVector(vector) - }.toArray[ColumnVector] - read() - case SpecialLengths.TIMING_DATA => - handleTimingData() - read() - case SpecialLengths.PYTHON_EXCEPTION_THROWN => - throw handlePythonException() - case SpecialLengths.END_OF_DATA_SECTION => - handleEndOfDataSection() - null - } - } - } catch handleException - } - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala new file mode 100644 index 0000000000000..25ce16db264ac --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/CoGroupedArrowPythonRunner.scala @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.python + +import java.io.DataOutputStream +import java.net.Socket + +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamWriter + +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, ChainedPythonFunctions, PythonRDD} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils + + +/** + * Python UDF Runner for cogrouped udfs. It sends Arrow batches from two different DataFrames, + * groups them in Python, and receives them back in the JVM as batches of a single DataFrame.
+ */ +class CoGroupedArrowPythonRunner( + funcs: Seq[ChainedPythonFunctions], + evalType: Int, + argOffsets: Array[Array[Int]], + leftSchema: StructType, + rightSchema: StructType, + timeZoneId: String, + conf: Map[String, String]) + extends BasePythonRunner[ + (Iterator[InternalRow], Iterator[InternalRow]), ColumnarBatch](funcs, evalType, argOffsets) + with PythonArrowOutput { + + protected def newWriterThread( + env: SparkEnv, + worker: Socket, + inputIterator: Iterator[(Iterator[InternalRow], Iterator[InternalRow])], + partitionIndex: Int, + context: TaskContext): WriterThread = { + + new WriterThread(env, worker, inputIterator, partitionIndex, context) { + + protected override def writeCommand(dataOut: DataOutputStream): Unit = { + + // Write config for the worker as a number of key -> value pairs of strings + dataOut.writeInt(conf.size) + for ((k, v) <- conf) { + PythonRDD.writeUTF(k, dataOut) + PythonRDD.writeUTF(v, dataOut) + } + + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + } + + protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { + // For each we first send the number of dataframes in each group then send + // first df, then send second df. End of data is marked by sending 0. 
+ while (inputIterator.hasNext) { + dataOut.writeInt(2) + val (nextLeft, nextRight) = inputIterator.next() + writeGroup(nextLeft, leftSchema, dataOut, "left") + writeGroup(nextRight, rightSchema, dataOut, "right") + } + dataOut.writeInt(0) + } + + private def writeGroup( + group: Iterator[InternalRow], + schema: StructType, + dataOut: DataOutputStream, + name: String): Unit = { + val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId) + val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdout writer for $pythonExec ($name)", 0, Long.MaxValue) + val root = VectorSchemaRoot.create(arrowSchema, allocator) + + Utils.tryWithSafeFinally { + val writer = new ArrowStreamWriter(root, null, dataOut) + val arrowWriter = ArrowWriter.create(root) + writer.start() + + while (group.hasNext) { + arrowWriter.write(group.next()) + } + arrowWriter.finish() + writer.writeBatch() + writer.end() + }{ + root.close() + allocator.close() + } + } + } + } +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala index 3554bdb5c9e0c..a0f23e925d237 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvalPythonExec.scala @@ -113,7 +113,7 @@ abstract class EvalPythonExec(udfs: Seq[PythonUDF], resultAttrs: Seq[Attribute], } }.toArray }.toArray - val projection = newMutableProjection(allInputs, child.output) + val projection = MutableProjection.create(allInputs, child.output) val schema = StructType(dataTypes.zipWithIndex.map { case (dt, i) => StructField(s"_$i", dt) }) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index d49d790d7888b..7bc8b95cfb03b 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -205,7 +205,7 @@ object ExtractPythonUDFs extends Rule[LogicalPlan] with PredicateHelper { def apply(plan: LogicalPlan): LogicalPlan = plan match { // SPARK-26293: A subquery will be rewritten into join later, and will go through this rule // eventually. Here we skip subquery, as Python UDF only needs to be extracted once. - case _: Subquery => plan + case s: Subquery if s.correlated => plan case _ => plan transformUp { // A safe guard. `ExtractPythonUDFs` only runs once, so we will not hit `BatchEvalPython` and diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapCoGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapCoGroupsInPandasExec.scala new file mode 100644 index 0000000000000..b079405bdc2f8 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapCoGroupsInPandasExec.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.python + +import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} +import org.apache.spark.sql.execution.{BinaryExecNode, CoGroupedIterator, SparkPlan} +import org.apache.spark.sql.execution.python.PandasGroupUtils._ +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils + + +/** + * Physical node for [[org.apache.spark.sql.catalyst.plans.logical.FlatMapCoGroupsInPandas]] + * + * The input dataframes are first Cogrouped. Rows from each side of the cogroup are passed to the + * Python worker via Arrow. As each side of the cogroup may have a different schema we send every + * group in its own Arrow stream. + * The Python worker turns the resulting record batches to `pandas.DataFrame`s, invokes the + * user-defined function, and passes the resulting `pandas.DataFrame` + * as an Arrow record batch. Finally, each record batch is turned to + * Iterator[InternalRow] using ColumnarBatch. + * + * Note on memory usage: + * Both the Python worker and the Java executor need to have enough memory to + * hold the largest cogroup. The memory on the Java side is used to construct the + * record batches (off heap memory). The memory on the Python side is used for + * holding the `pandas.DataFrame`. It's possible to further split one group into + * multiple record batches to reduce the memory footprint on the Java side, this + * is left as future work. 
+ */ +case class FlatMapCoGroupsInPandasExec( + leftGroup: Seq[Attribute], + rightGroup: Seq[Attribute], + func: Expression, + output: Seq[Attribute], + left: SparkPlan, + right: SparkPlan) + extends SparkPlan with BinaryExecNode { + + private val sessionLocalTimeZone = conf.sessionLocalTimeZone + private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) + private val pandasFunction = func.asInstanceOf[PythonUDF].func + private val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) + + override def producedAttributes: AttributeSet = AttributeSet(output) + + override def outputPartitioning: Partitioning = left.outputPartitioning + + override def requiredChildDistribution: Seq[Distribution] = { + val leftDist = if (leftGroup.isEmpty) AllTuples else ClusteredDistribution(leftGroup) + val rightDist = if (rightGroup.isEmpty) AllTuples else ClusteredDistribution(rightGroup) + leftDist :: rightDist :: Nil + } + + override def requiredChildOrdering: Seq[Seq[SortOrder]] = { + leftGroup + .map(SortOrder(_, Ascending)) :: rightGroup.map(SortOrder(_, Ascending)) :: Nil + } + + override protected def doExecute(): RDD[InternalRow] = { + + val (leftDedup, leftArgOffsets) = resolveArgOffsets(left, leftGroup) + val (rightDedup, rightArgOffsets) = resolveArgOffsets(right, rightGroup) + + // Map cogrouped rows to ArrowPythonRunner results, Only execute if partition is not empty + left.execute().zipPartitions(right.execute()) { (leftData, rightData) => + if (leftData.isEmpty && rightData.isEmpty) Iterator.empty else { + + val leftGrouped = groupAndProject(leftData, leftGroup, left.output, leftDedup) + val rightGrouped = groupAndProject(rightData, rightGroup, right.output, rightDedup) + val data = new CoGroupedIterator(leftGrouped, rightGrouped, leftGroup) + .map { case (_, l, r) => (l, r) } + + val runner = new CoGroupedArrowPythonRunner( + chainedFunc, + PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF, + Array(leftArgOffsets ++ rightArgOffsets), + 
StructType.fromAttributes(leftDedup), + StructType.fromAttributes(rightDedup), + sessionLocalTimeZone, + pythonRunnerConf) + + executePython(data, output, runner) + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala index 267698d1bca50..5032bc81327b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/FlatMapGroupsInPandasExec.scala @@ -17,19 +17,16 @@ package org.apache.spark.sql.execution.python -import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} -import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.python.PandasGroupUtils._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.ArrowUtils -import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} + /** * Physical node for [[org.apache.spark.sql.catalyst.plans.logical.FlatMapGroupsInPandas]] @@ -53,14 +50,17 @@ case class FlatMapGroupsInPandasExec( func: Expression, output: Seq[Attribute], child: SparkPlan) - extends UnaryExecNode { + extends SparkPlan with UnaryExecNode { + private val sessionLocalTimeZone = conf.sessionLocalTimeZone + private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) private val pandasFunction = func.asInstanceOf[PythonUDF].func - - 
override def outputPartitioning: Partitioning = child.outputPartitioning + private val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) override def producedAttributes: AttributeSet = AttributeSet(output) + override def outputPartitioning: Partitioning = child.outputPartitioning + override def requiredChildDistribution: Seq[Distribution] = { if (groupingAttributes.isEmpty) { AllTuples :: Nil @@ -75,88 +75,23 @@ case class FlatMapGroupsInPandasExec( override protected def doExecute(): RDD[InternalRow] = { val inputRDD = child.execute() - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) - val sessionLocalTimeZone = conf.sessionLocalTimeZone - val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) - - // Deduplicate the grouping attributes. - // If a grouping attribute also appears in data attributes, then we don't need to send the - // grouping attribute to Python worker. If a grouping attribute is not in data attributes, - // then we need to send this grouping attribute to python worker. - // - // We use argOffsets to distinguish grouping attributes and data attributes as following: - // - // argOffsets[0] is the length of grouping attributes - // argOffsets[1 .. argOffsets[0]+1] is the arg offsets for grouping attributes - // argOffsets[argOffsets[0]+1 .. ] is the arg offsets for data attributes - - val dataAttributes = child.output.drop(groupingAttributes.length) - val groupingIndicesInData = groupingAttributes.map { attribute => - dataAttributes.indexWhere(attribute.semanticEquals) - } - - val groupingArgOffsets = new ArrayBuffer[Int] - val nonDupGroupingAttributes = new ArrayBuffer[Attribute] - val nonDupGroupingSize = groupingIndicesInData.count(_ == -1) - - // Non duplicate grouping attributes are added to nonDupGroupingAttributes and - // their offsets are 0, 1, 2 ... 
- // Duplicate grouping attributes are NOT added to nonDupGroupingAttributes and - // their offsets are n + index, where n is the total number of non duplicate grouping - // attributes and index is the index in the data attributes that the grouping attribute - // is a duplicate of. - - groupingAttributes.zip(groupingIndicesInData).foreach { - case (attribute, index) => - if (index == -1) { - groupingArgOffsets += nonDupGroupingAttributes.length - nonDupGroupingAttributes += attribute - } else { - groupingArgOffsets += index + nonDupGroupingSize - } - } - - val dataArgOffsets = nonDupGroupingAttributes.length until - (nonDupGroupingAttributes.length + dataAttributes.length) - - val argOffsets = Array(Array(groupingAttributes.length) ++ groupingArgOffsets ++ dataArgOffsets) - - // Attributes after deduplication - val dedupAttributes = nonDupGroupingAttributes ++ dataAttributes - val dedupSchema = StructType.fromAttributes(dedupAttributes) + val (dedupAttributes, argOffsets) = resolveArgOffsets(child, groupingAttributes) // Map grouped rows to ArrowPythonRunner results, Only execute if partition is not empty inputRDD.mapPartitionsInternal { iter => if (iter.isEmpty) iter else { - val grouped = if (groupingAttributes.isEmpty) { - Iterator(iter) - } else { - val groupedIter = GroupedIterator(iter, groupingAttributes, child.output) - val dedupProj = UnsafeProjection.create(dedupAttributes, child.output) - groupedIter.map { - case (_, groupedRowIter) => groupedRowIter.map(dedupProj) - } - } - val context = TaskContext.get() + val data = groupAndProject(iter, groupingAttributes, child.output, dedupAttributes) + .map { case (_, x) => x } - val columnarBatchIter = new ArrowPythonRunner( + val runner = new ArrowPythonRunner( chainedFunc, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, - argOffsets, - dedupSchema, + Array(argOffsets), + StructType.fromAttributes(dedupAttributes), sessionLocalTimeZone, - pythonRunnerConf).compute(grouped, context.partitionId(), context) - - val 
unsafeProj = UnsafeProjection.create(output, output) + pythonRunnerConf) - columnarBatchIter.flatMap { batch => - // Grouped Map UDF returns a StructType column in ColumnarBatch, select the children here - val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] - val outputVectors = output.indices.map(structVector.getChild) - val flattenedBatch = new ColumnarBatch(outputVectors.toArray) - flattenedBatch.setNumRows(batch.numRows()) - flattenedBatch.rowIterator.asScala - }.map(unsafeProj) + executePython(data, output, runner) }} } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PandasGroupUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PandasGroupUtils.scala new file mode 100644 index 0000000000000..68ce991a8ae7f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PandasGroupUtils.scala @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.python + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.TaskContext +import org.apache.spark.api.python.BasePythonRunner +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} +import org.apache.spark.sql.execution.{GroupedIterator, SparkPlan} +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch} + +/** + * Base functionality for plans which execute grouped python udfs. + */ +private[python] object PandasGroupUtils { + /** + * passes the data to the python runner and converts the resulting + * ColumnarBatch into internal rows. + */ + def executePython[T]( + data: Iterator[T], + output: Seq[Attribute], + runner: BasePythonRunner[T, ColumnarBatch]): Iterator[InternalRow] = { + + val context = TaskContext.get() + val columnarBatchIter = runner.compute(data, context.partitionId(), context) + val unsafeProj = UnsafeProjection.create(output, output) + + columnarBatchIter.flatMap { batch => + // UDF returns a StructType column in ColumnarBatch, select the children here + val structVector = batch.column(0).asInstanceOf[ArrowColumnVector] + val outputVectors = output.indices.map(structVector.getChild) + val flattenedBatch = new ColumnarBatch(outputVectors.toArray) + flattenedBatch.setNumRows(batch.numRows()) + flattenedBatch.rowIterator.asScala + }.map(unsafeProj) + } + + /** + * groups according to grouping attributes and then projects into the deduplicated schema + */ + def groupAndProject( + input: Iterator[InternalRow], + groupingAttributes: Seq[Attribute], + inputSchema: Seq[Attribute], + dedupSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { + val groupedIter = GroupedIterator(input, groupingAttributes, inputSchema) + val dedupProj = UnsafeProjection.create(dedupSchema, inputSchema) + groupedIter.map { + case (k,
groupedRowIter.map(dedupProj)) + } + } + + /** + * Returns the deduplicated attributes of the spark plan and the arg offsets of the + * keys and values. + * + * The deduplicated attributes are needed because the spark plan may contain an attribute + * twice; once in the key and once in the value. For any such attribute we need to + * deduplicate. + * + * The arg offsets are used to distinguish grouping attributes and data attributes + * as following: + * + * argOffsets[0] is the length of the argOffsets array + * + * argOffsets[1] is the length of grouping attributes + * argOffsets[2 .. argOffsets[0]+2] is the arg offsets for grouping attributes + * + * argOffsets[argOffsets[0]+2 .. ] is the arg offsets for data attributes + */ + def resolveArgOffsets( + child: SparkPlan, groupingAttributes: Seq[Attribute]): (Seq[Attribute], Array[Int]) = { + + val dataAttributes = child.output.drop(groupingAttributes.length) + val groupingIndicesInData = groupingAttributes.map { attribute => + dataAttributes.indexWhere(attribute.semanticEquals) + } + + val groupingArgOffsets = new ArrayBuffer[Int] + val nonDupGroupingAttributes = new ArrayBuffer[Attribute] + val nonDupGroupingSize = groupingIndicesInData.count(_ == -1) + + groupingAttributes.zip(groupingIndicesInData).foreach { + case (attribute, index) => + if (index == -1) { + groupingArgOffsets += nonDupGroupingAttributes.length + nonDupGroupingAttributes += attribute + } else { + groupingArgOffsets += index + nonDupGroupingSize + } + } + + val dataArgOffsets = nonDupGroupingAttributes.length until + (nonDupGroupingAttributes.length + dataAttributes.length) + + val argOffsetsLength = groupingAttributes.length + dataArgOffsets.length + 1 + val argOffsets = Array(argOffsetsLength, + groupingAttributes.length) ++ groupingArgOffsets ++ dataArgOffsets + + // Attributes after deduplication + val dedupAttributes = nonDupGroupingAttributes ++ dataAttributes + (dedupAttributes, argOffsets) + } +} diff --git
a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala new file mode 100644 index 0000000000000..bb353062384a0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonArrowOutput.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.python + +import java.io.DataInputStream +import java.net.Socket +import java.util.concurrent.atomic.AtomicBoolean + +import scala.collection.JavaConverters._ + +import org.apache.arrow.vector.VectorSchemaRoot +import org.apache.arrow.vector.ipc.ArrowStreamReader + +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.api.python.{BasePythonRunner, SpecialLengths} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} + +/** + * A trait that can be mixed-in with [[BasePythonRunner]]. It implements the logic from + * Python (Arrow) to JVM (ColumnarBatch). 
+ */ +private[python] trait PythonArrowOutput { self: BasePythonRunner[_, ColumnarBatch] => + + protected def newReaderIterator( + stream: DataInputStream, + writerThread: WriterThread, + startTime: Long, + env: SparkEnv, + worker: Socket, + releasedOrClosed: AtomicBoolean, + context: TaskContext): Iterator[ColumnarBatch] = { + + new ReaderIterator(stream, writerThread, startTime, env, worker, releasedOrClosed, context) { + + private val allocator = ArrowUtils.rootAllocator.newChildAllocator( + s"stdin reader for $pythonExec", 0, Long.MaxValue) + + private var reader: ArrowStreamReader = _ + private var root: VectorSchemaRoot = _ + private var schema: StructType = _ + private var vectors: Array[ColumnVector] = _ + + context.addTaskCompletionListener[Unit] { _ => + if (reader != null) { + reader.close(false) + } + allocator.close() + } + + private var batchLoaded = true + + protected override def read(): ColumnarBatch = { + if (writerThread.exception.isDefined) { + throw writerThread.exception.get + } + try { + if (reader != null && batchLoaded) { + batchLoaded = reader.loadNextBatch() + if (batchLoaded) { + val batch = new ColumnarBatch(vectors) + batch.setNumRows(root.getRowCount) + batch + } else { + reader.close(false) + allocator.close() + // Reach end of stream. Call `read()` again to read control data. 
+ read() + } + } else { + stream.readInt() match { + case SpecialLengths.START_ARROW_STREAM => + reader = new ArrowStreamReader(stream, allocator) + root = reader.getVectorSchemaRoot() + schema = ArrowUtils.fromArrowSchema(root.getSchema()) + vectors = root.getFieldVectors().asScala.map { vector => + new ArrowColumnVector(vector) + }.toArray[ColumnVector] + read() + case SpecialLengths.TIMING_DATA => + handleTimingData() + read() + case SpecialLengths.PYTHON_EXCEPTION_THROWN => + throw handlePythonException() + case SpecialLengths.END_OF_DATA_SECTION => + handleEndOfDataSection() + null + } + } + } catch handleException + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonForeachWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonForeachWriter.scala index a4e9b3305052f..2a799bab1eb81 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonForeachWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonForeachWriter.scala @@ -78,7 +78,7 @@ object PythonForeachWriter { * * Internally, it uses a [[HybridRowQueue]] to buffer the rows in a practically unlimited queue * across memory and local disk. However, HybridRowQueue is designed to be used only with - * EvalPythonExec where the reader is always behind the the writer, that is, the reader does not + * EvalPythonExec where the reader is always behind the writer, that is, the reader does not * try to read n+1 rows if the writer has only written n rows at any point of time. This * assumption is not true for PythonForeachWriter where rows may be added at a different rate as * they are consumed by the python worker. 
Hence, to maintain the invariant of the reader being diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala index 752d271c4cc35..0a250b27ccb94 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDFRunner.scala @@ -73,7 +73,7 @@ class PythonUDFRunner( val obj = new Array[Byte](length) stream.readFully(obj) obj - case 0 => Array.empty[Byte] + case 0 => Array.emptyByteArray case SpecialLengths.TIMING_DATA => handleTimingData() read() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/WindowInPandasExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/WindowInPandasExec.scala index cad89dedb8b07..f54c4b8f22066 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/WindowInPandasExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/WindowInPandasExec.scala @@ -304,7 +304,7 @@ case class WindowInPandasExec( var nextRow: UnsafeRow = null var nextGroup: UnsafeRow = null var nextRowAvailable: Boolean = false - private[this] def fetchNextRow() { + private[this] def fetchNextRow(): Unit = { nextRowAvailable = stream.hasNext if (nextRowAvailable) { nextRow = stream.next().asInstanceOf[UnsafeRow] @@ -325,7 +325,7 @@ case class WindowInPandasExec( val frames = factories.map(_(indexRow)) - private[this] def fetchNextPartition() { + private[this] def fetchNextPartition(): Unit = { // Collect all the rows in the current partition. // Before we start to fetch new input rows, make a copy of nextGroup. 
val currentGroup = nextGroup.copy() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala index 0fe2b628fa38b..59f5a7078a151 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/r/ArrowRRunner.scala @@ -191,11 +191,7 @@ class ArrowRRunner( null } } - } catch { - case eof: EOFException => - throw new SparkException( - "R worker exited unexpectedly (crashed)\n " + errThread.getLines(), eof) - } + } catch handleException } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index a6c9c2972df6c..fffd8805a6525 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, Expression, GenericInternalRow, GetArrayItem, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.catalyst.util.QuantileSummaries +import org.apache.spark.sql.catalyst.util.{GenericArrayData, QuantileSummaries} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -248,7 +248,9 @@ object StatFunctions extends Logging { percentileIndex += 1 (child: Expression) => GetArrayItem( - new ApproximatePercentile(child, Literal.create(percentiles)).toAggregateExpression(), + new ApproximatePercentile(child, + Literal(new GenericArrayData(percentiles), ArrayType(DoubleType, false))) + .toAggregateExpression(), Literal(index)) } else { 
stats.toLowerCase(Locale.ROOT) match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala index c9c2ebc875f28..985a5fa6063ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ContinuousRecordEndpoint.scala @@ -18,8 +18,8 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.SparkEnv import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.connector.read.streaming.PartitionOffset case class ContinuousRecordPartitionOffset(partitionId: Int, offset: Int) extends PartitionOffset case class GetRecord(offset: ContinuousRecordPartitionOffset) @@ -33,7 +33,7 @@ case class GetRecord(offset: ContinuousRecordPartitionOffset) * to the number of partitions. 
* @param lock a lock object for locking the buckets for read */ -class ContinuousRecordEndpoint(buckets: Seq[Seq[Any]], lock: Object) +class ContinuousRecordEndpoint(buckets: Seq[Seq[UnsafeRow]], lock: Object) extends ThreadSafeRpcEndpoint { private var startOffsets: Seq[Int] = List.fill(buckets.size)(0) @@ -63,7 +63,7 @@ class ContinuousRecordEndpoint(buckets: Seq[Seq[Any]], lock: Object) val buf = buckets(partitionId) val record = if (buf.size <= bufOffset) None else Some(buf(bufOffset)) - context.reply(record.map(InternalRow(_))) + context.reply(record) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala index 6d1131e6939db..eac5246904ffd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/EventTimeWatermarkExec.scala @@ -21,7 +21,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark -import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala index 1d57cb084df9e..712ed1585bc8a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamOptions.scala @@ 
-17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming +import java.util.Locale + import scala.util.Try import org.apache.spark.internal.Logging @@ -74,6 +76,30 @@ class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging */ val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false) + /** + * The archive directory to move completed files. The option will be only effective when + * "cleanSource" is set to "archive". + * + * Note that the completed file will be moved to this archive directory with respecting to + * its own path. + * + * For example, if the path of source file is "/a/b/dataset.txt", and the path of archive + * directory is "/archived/here", file will be moved to "/archived/here/a/b/dataset.txt". + */ + val sourceArchiveDir: Option[String] = parameters.get("sourceArchiveDir") + + /** + * Defines how to clean up completed files. Available options are "archive", "delete", "off". + */ + val cleanSource: CleanSourceMode.Value = { + val matchedMode = CleanSourceMode.fromString(parameters.get("cleanSource")) + if (matchedMode == CleanSourceMode.ARCHIVE && sourceArchiveDir.isEmpty) { + throw new IllegalArgumentException("Archive mode must be used with 'sourceArchiveDir' " + + "option.") + } + matchedMode + } + private def withBooleanParameter(name: String, default: Boolean) = { parameters.get(name).map { str => try { @@ -86,3 +112,14 @@ class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging }.getOrElse(default) } } + +object CleanSourceMode extends Enumeration { + val ARCHIVE, DELETE, OFF = Value + + def fromString(value: Option[String]): CleanSourceMode.Value = value.map { v => + CleanSourceMode.values.find(_.toString == v.toUpperCase(Locale.ROOT)) + .getOrElse(throw new IllegalArgumentException( + s"Invalid mode for clean source option $value." 
+ + s" Must be one of ${CleanSourceMode.values.mkString(",")}")) + }.getOrElse(OFF) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 67e26dc1a2dbc..e8ce8e1487093 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -18,16 +18,24 @@ package org.apache.spark.sql.execution.streaming import java.net.URI +import java.util.concurrent.ThreadPoolExecutor import java.util.concurrent.TimeUnit._ -import org.apache.hadoop.fs.{FileStatus, Path} +import scala.util.control.NonFatal + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, GlobFilter, Path} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.read.streaming +import org.apache.spark.sql.connector.read.streaming.{ReadAllAvailable, ReadLimit, ReadMaxFiles, SupportsAdmissionControl} import org.apache.spark.sql.execution.datasources.{DataSource, InMemoryFileIndex, LogicalRelation} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType +import org.apache.spark.util.ThreadUtils /** * A very simple source that reads files from the given directory as they appear. 
@@ -39,7 +47,7 @@ class FileStreamSource( override val schema: StructType, partitionColumns: Seq[String], metadataPath: String, - options: Map[String, String]) extends Source with Logging { + options: Map[String, String]) extends SupportsAdmissionControl with Source with Logging { import FileStreamSource._ @@ -53,6 +61,9 @@ class FileStreamSource( fs.makeQualified(new Path(path)) // can contain glob patterns } + private val sourceCleaner: Option[FileStreamSourceCleaner] = FileStreamSourceCleaner( + fs, qualifiedBasePath, sourceOptions, hadoopConf) + private val optionsWithPartitionBasePath = sourceOptions.optionMapWithoutPath ++ { if (!SparkHadoopUtil.get.isGlobPath(new Path(path)) && options.contains("path")) { Map("basePath" -> path) @@ -106,15 +117,17 @@ class FileStreamSource( * `synchronized` on this method is for solving race conditions in tests. In the normal usage, * there is no race here, so the cost of `synchronized` should be rare. */ - private def fetchMaxOffset(): FileStreamSourceOffset = synchronized { + private def fetchMaxOffset(limit: ReadLimit): FileStreamSourceOffset = synchronized { // All the new files found - ignore aged files and files that we have seen. val newFiles = fetchAllFiles().filter { case (path, timestamp) => seenFiles.isNewFile(path, timestamp) } // Obey user's setting to limit the number of files in this batch trigger. - val batchFiles = - if (maxFilesPerBatch.nonEmpty) newFiles.take(maxFilesPerBatch.get) else newFiles + val batchFiles = limit match { + case files: ReadMaxFiles => newFiles.take(files.maxFiles()) + case _: ReadAllAvailable => newFiles + } batchFiles.foreach { file => seenFiles.add(file._1, file._2) @@ -141,6 +154,10 @@ class FileStreamSource( FileStreamSourceOffset(metadataLogCurrentOffset) } + override def getDefaultReadLimit: ReadLimit = { + maxFilesPerBatch.map(ReadLimit.maxFiles).getOrElse(super.getDefaultReadLimit) + } + /** * For test only. 
Run `func` with the internal lock to make sure when `func` is running, * the current offset won't be changed and no new batch will be emitted. @@ -200,6 +217,17 @@ class FileStreamSource( CaseInsensitiveMap(options), None).allFiles() } + private def setSourceHasMetadata(newValue: Option[Boolean]): Unit = newValue match { + case Some(true) => + if (sourceCleaner.isDefined) { + throw new UnsupportedOperationException("Clean up source files is not supported when" + + " reading from the output directory of FileStreamSink.") + } + sourceHasMetadata = Some(true) + case _ => + sourceHasMetadata = newValue + } + /** * Returns a list of files found, sorted by their timestamp. */ @@ -210,7 +238,7 @@ class FileStreamSource( sourceHasMetadata match { case None => if (FileStreamSink.hasMetadata(Seq(path), hadoopConf, sparkSession.sessionState.conf)) { - sourceHasMetadata = Some(true) + setSourceHasMetadata(Some(true)) allFiles = allFilesUsingMetadataLogFileIndex() } else { allFiles = allFilesUsingInMemoryFileIndex() @@ -222,10 +250,10 @@ class FileStreamSource( // metadata log and data files are only generated after the previous // `FileStreamSink.hasMetadata` check if (FileStreamSink.hasMetadata(Seq(path), hadoopConf, sparkSession.sessionState.conf)) { - sourceHasMetadata = Some(true) + setSourceHasMetadata(Some(true)) allFiles = allFilesUsingMetadataLogFileIndex() } else { - sourceHasMetadata = Some(false) + setSourceHasMetadata(Some(false)) // `allFiles` have already been fetched using InMemoryFileIndex in this round } } @@ -249,7 +277,14 @@ class FileStreamSource( files } - override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1) + override def getOffset: Option[Offset] = { + throw new UnsupportedOperationException( + "latestOffset(Offset, ReadLimit) should be called instead of this method") + } + + override def latestOffset(startOffset: streaming.Offset, limit: ReadLimit): streaming.Offset = { + 
Some(fetchMaxOffset(limit)).filterNot(_.logOffset == -1).orNull + } override def toString: String = s"FileStreamSource[$qualifiedBasePath]" @@ -258,16 +293,21 @@ class FileStreamSource( * equal to `end` and will only request offsets greater than `end` in the future. */ override def commit(end: Offset): Unit = { - // No-op for now; FileStreamSource currently garbage-collects files based on timestamp - // and the value of the maxFileAge parameter. + val logOffset = FileStreamSourceOffset(end).logOffset + + sourceCleaner.foreach { cleaner => + val files = metadataLog.get(Some(logOffset), Some(logOffset)).flatMap(_._2) + val validFileEntities = files.filter(_.batchId == logOffset) + logDebug(s"completed file entries: ${validFileEntities.mkString(",")}") + validFileEntities.foreach(cleaner.clean) + } } - override def stop() {} + override def stop(): Unit = sourceCleaner.foreach(_.stop()) } object FileStreamSource { - /** Timestamp for file modification time, in ms since January 1, 1970 UTC. */ type Timestamp = Long @@ -330,4 +370,166 @@ object FileStreamSource { def size: Int = map.size() } + + private[sql] abstract class FileStreamSourceCleaner extends Logging { + private val cleanThreadPool: Option[ThreadPoolExecutor] = { + val numThreads = SQLConf.get.getConf(SQLConf.FILE_SOURCE_CLEANER_NUM_THREADS) + if (numThreads > 0) { + logDebug(s"Cleaning file source on $numThreads separate thread(s)") + Some(ThreadUtils.newDaemonCachedThreadPool("file-source-cleaner-threadpool", numThreads)) + } else { + logDebug("Cleaning file source on main thread") + None + } + } + + def stop(): Unit = cleanThreadPool.foreach(ThreadUtils.shutdown(_)) + + def clean(entry: FileEntry): Unit = { + cleanThreadPool match { + case Some(p) => + p.submit(new Runnable { + override def run(): Unit = { + cleanTask(entry) + } + }) + + case None => + cleanTask(entry) + } + } + + protected def cleanTask(entry: FileEntry): Unit + } + + private[sql] object FileStreamSourceCleaner { + def apply( + 
fileSystem: FileSystem, + sourcePath: Path, + option: FileStreamOptions, + hadoopConf: Configuration): Option[FileStreamSourceCleaner] = option.cleanSource match { + case CleanSourceMode.ARCHIVE => + require(option.sourceArchiveDir.isDefined) + val path = new Path(option.sourceArchiveDir.get) + val archiveFs = path.getFileSystem(hadoopConf) + val qualifiedArchivePath = archiveFs.makeQualified(path) + Some(new SourceFileArchiver(fileSystem, sourcePath, archiveFs, qualifiedArchivePath)) + + case CleanSourceMode.DELETE => + Some(new SourceFileRemover(fileSystem)) + + case _ => None + } + } + + private[sql] class SourceFileArchiver( + fileSystem: FileSystem, + sourcePath: Path, + baseArchiveFileSystem: FileSystem, + baseArchivePath: Path) extends FileStreamSourceCleaner with Logging { + assertParameters() + + private def assertParameters(): Unit = { + require(fileSystem.getUri == baseArchiveFileSystem.getUri, "Base archive path is located " + + s"on a different file system than the source files. source path: $sourcePath" + + s" / base archive path: $baseArchivePath") + + require(!isBaseArchivePathMatchedAgainstSourcePattern, "Base archive path cannot be set to" + + " the path where archived path can possibly match with source pattern. Ensure the base " + + "archive path doesn't match with source pattern in depth, where the depth is minimum of" + + " depth on both paths.") + } + + private def getAncestorEnsuringDepth(path: Path, depth: Int): Path = { + var newPath = path + while (newPath.depth() > depth) { + newPath = newPath.getParent + } + newPath + } + + private def isBaseArchivePathMatchedAgainstSourcePattern: Boolean = { + // We should disallow end users to set base archive path which path matches against source + // pattern to avoid checking each source file. 
There're couple of cases which allow + // FileStreamSource to read any depth of subdirectory under the source pattern, so we should + // consider all three cases 1) both has same depth 2) base archive path is longer than source + // pattern 3) source pattern is longer than base archive path. To handle all cases, we take + // min of depth for both paths, and check the match. + + val minDepth = math.min(sourcePath.depth(), baseArchivePath.depth()) + + val sourcePathMinDepth = getAncestorEnsuringDepth(sourcePath, minDepth) + val baseArchivePathMinDepth = getAncestorEnsuringDepth(baseArchivePath, minDepth) + + val sourceGlobFilters: Seq[GlobFilter] = buildSourceGlobFilters(sourcePathMinDepth) + + var matched = true + + // pathToCompare should have same depth as sourceGlobFilters.length + var pathToCompare = baseArchivePathMinDepth + var index = 0 + do { + // GlobFilter only matches against its name, not full path so it's safe to compare + if (!sourceGlobFilters(index).accept(pathToCompare)) { + matched = false + } else { + pathToCompare = pathToCompare.getParent + index += 1 + } + } while (matched && !pathToCompare.isRoot) + + matched + } + + private def buildSourceGlobFilters(sourcePath: Path): Seq[GlobFilter] = { + val filters = new scala.collection.mutable.MutableList[GlobFilter]() + + var currentPath = sourcePath + while (!currentPath.isRoot) { + filters += new GlobFilter(currentPath.getName) + currentPath = currentPath.getParent + } + + filters.toList + } + + override protected def cleanTask(entry: FileEntry): Unit = { + val curPath = new Path(new URI(entry.path)) + val newPath = new Path(baseArchivePath.toString.stripSuffix("/") + curPath.toUri.getPath) + + try { + logDebug(s"Creating directory if it doesn't exist ${newPath.getParent}") + if (!fileSystem.exists(newPath.getParent)) { + fileSystem.mkdirs(newPath.getParent) + } + + logDebug(s"Archiving completed file $curPath to $newPath") + if (!fileSystem.rename(curPath, newPath)) { + logWarning(s"Fail to move 
$curPath to $newPath / skip moving file.") + } + } catch { + case NonFatal(e) => + logWarning(s"Fail to move $curPath to $newPath / skip moving file.", e) + } + } + } + + private[sql] class SourceFileRemover(fileSystem: FileSystem) + extends FileStreamSourceCleaner with Logging { + + override protected def cleanTask(entry: FileEntry): Unit = { + val curPath = new Path(new URI(entry.path)) + try { + logDebug(s"Removing completed file $curPath") + + if (!fileSystem.delete(curPath, false)) { + logWarning(s"Failed to remove $curPath / skip removing file.") + } + } catch { + case NonFatal(e) => + // Log to error but swallow exception to avoid process being stopped + logWarning(s"Fail to remove $curPath / skip removing file.", e) + } + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala index dda9d41f630e6..59ce7c3707b27 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/GroupStateImpl.scala @@ -21,9 +21,10 @@ import java.sql.Date import java.util.concurrent.TimeUnit import org.apache.spark.sql.catalyst.plans.logical.{EventTimeTimeout, ProcessingTimeTimeout} +import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.execution.streaming.GroupStateImpl._ import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout} -import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.unsafe.types.UTF8String /** @@ -159,13 +160,12 @@ private[sql] class GroupStateImpl[S] private( def getTimeoutTimestamp: Long = timeoutTimestamp private def parseDuration(duration: String): Long = { - val cal = CalendarInterval.fromCaseInsensitiveString(duration) - if (cal.milliseconds < 0 || cal.months < 0) { - throw new IllegalArgumentException(s"Provided duration ($duration) is not 
positive") + val cal = IntervalUtils.stringToInterval(UTF8String.fromString(duration)) + if (IntervalUtils.isNegative(cal)) { + throw new IllegalArgumentException(s"Provided duration ($duration) is negative") } - val millisPerMonth = TimeUnit.MICROSECONDS.toMillis(CalendarInterval.MICROS_PER_DAY) * 31 - cal.milliseconds + cal.months * millisPerMonth + IntervalUtils.getDuration(cal, TimeUnit.MILLISECONDS) } private def checkTimeoutTimestampAllowed(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index 5c9249fb16343..ed0c44da08c5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -90,13 +90,21 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: } } + /** + * Serialize the metadata and write to the output stream. If this method is overridden in a + * subclass, the overriding method should not close the given output stream, as it will be closed + * in the caller. + */ protected def serialize(metadata: T, out: OutputStream): Unit = { - // called inside a try-finally where the underlying stream is closed in the caller Serialization.write(metadata, out) } + /** + * Read and deserialize the metadata from input stream. If this method is overridden in a + * subclass, the overriding method should not close the given input stream, as it will be closed + * in the caller. 
+ */ protected def deserialize(in: InputStream): T = { - // called inside a try-finally where the underlying stream is closed in the caller val reader = new InputStreamReader(in, StandardCharsets.UTF_8) Serialization.read[T](reader) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala index af52af0d1d7e6..09ae7692ec518 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.{CurrentBatchTimestamp, Express import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, HashPartitioning, SinglePartition} import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} +import org.apache.spark.sql.execution.{LeafExecNode, LocalLimitExec, QueryExecution, SparkPlan, SparkPlanner, UnaryExecNode} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode @@ -50,7 +50,7 @@ class IncrementalExecution( // Modified planner with stateful operations. 
override val planner: SparkPlanner = new SparkPlanner( - sparkSession.sparkContext, + sparkSession, sparkSession.sessionState.conf, sparkSession.sessionState.experimentalMethods) { override def strategies: Seq[Strategy] = @@ -77,7 +77,8 @@ class IncrementalExecution( */ override lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) { - sparkSession.sessionState.optimizer.execute(withCachedData) transformAllExpressions { + sparkSession.sessionState.optimizer.executeAndTrack(withCachedData, + tracker) transformAllExpressions { case ts @ CurrentBatchTimestamp(timestamp, _, _) => logInfo(s"Current batch timestamp = $timestamp") ts.toLiteral @@ -104,6 +105,32 @@ class IncrementalExecution( /** Locates save/restore pairs surrounding aggregation. */ val state = new Rule[SparkPlan] { + /** + * Ensures that this plan DOES NOT have any stateful operation in it whose pipelined execution + * depends on this plan. In other words, this function returns true if this plan does + * have a narrow dependency on a stateful subplan. + */ + private def hasNoStatefulOp(plan: SparkPlan): Boolean = { + var statefulOpFound = false + + def findStatefulOp(planToCheck: SparkPlan): Unit = { + planToCheck match { + case s: StatefulOperator => + statefulOpFound = true + + case e: ShuffleExchangeExec => + // Don't search recursively any further as any child stateful operator as we + // are only looking for stateful subplans that this plan has narrow dependencies on. 
+ + case p: SparkPlan => + p.children.foreach(findStatefulOp) + } + } + + findStatefulOp(plan) + !statefulOpFound + } + override def apply(plan: SparkPlan): SparkPlan = plan transform { case StateStoreSaveExec(keys, None, None, None, stateFormatVersion, UnaryExecNode(agg, @@ -148,6 +175,12 @@ class IncrementalExecution( l.copy( stateInfo = Some(nextStatefulOperationStateInfo), outputMode = Some(outputMode)) + + case StreamingLocalLimitExec(limit, child) if hasNoStatefulOp(child) => + // Optimize limit execution by replacing StreamingLocalLimitExec (consumes the iterator + // completely) to LocalLimitExec (does not consume the iterator) when the child plan has + // no stateful operator (i.e., consuming the iterator is not needed). + LocalLimitExec(limit, child) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala index 916bd2ddbc818..f6cc8116c6c4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.streaming +import java.io.IOException import java.util.UUID import scala.collection.mutable.ArrayBuffer @@ -43,6 +44,8 @@ class ManifestFileCommitProtocol(jobId: String, path: String) @transient private var fileLog: FileStreamSinkLog = _ private var batchId: Long = _ + @transient private var pendingCommitFiles: ArrayBuffer[Path] = _ + /** * Sets up the manifest log output and the batch id for this job. * Must be called before any other function. 
@@ -54,13 +57,21 @@ class ManifestFileCommitProtocol(jobId: String, path: String) override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") - // Do nothing + pendingCommitFiles = new ArrayBuffer[Path] } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray + // We shouldn't remove the files if they're written to the metadata: + // `fileLog.add(batchId, fileStatuses)` could fail AFTER writing files to the metadata + // and there could also be a race, + // so for safety we clean up the list before calling anything that may throw an exception. + // The case is uncommon and we do best effort instead of guarantee, so the simplicity of + // logic here would be OK, and safe for dealing with unexpected situations. + pendingCommitFiles.clear() + if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { @@ -70,7 +81,29 @@ class ManifestFileCommitProtocol(jobId: String, path: String) override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") - // Do nothing + // Best effort cleanup of complete files from failed job. + // Since the file has UUID in its filename, we are safe to try deleting them + // as the file will not conflict with file with another attempt on the same task. 
+ if (pendingCommitFiles.nonEmpty) { + pendingCommitFiles.foreach { path => + try { + val fs = path.getFileSystem(jobContext.getConfiguration) + // this is to make sure the file can be seen from driver as well + if (fs.exists(path)) { + fs.delete(path, false) + } + } catch { + case e: IOException => + logWarning(s"Fail to remove temporary file $path, continue removing next.", e) + } + } + pendingCommitFiles.clear() + } + } + + override def onTaskCommit(taskCommit: TaskCommitMessage): Unit = { + pendingCommitFiles ++= taskCommit.obj.asInstanceOf[Seq[SinkFileStatus]] + .map(_.toFileStatus.getPath) } override def setupTask(taskContext: TaskAttemptContext): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index e7eb2cb558cdb..45a2ce16183a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -24,12 +24,12 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, LogicalPlan, Project} import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset => OffsetV2, ReadLimit, SparkDataStream, SupportsAdmissionControl} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, StreamWriterCommitProgress, WriteToDataSourceV2Exec} -import org.apache.spark.sql.execution.streaming.sources.{RateControlMicroBatchStream, 
WriteToMicroBatchDataSource} +import org.apache.spark.sql.execution.streaming.sources.WriteToMicroBatchDataSource import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.util.Clock @@ -79,7 +79,7 @@ class MicroBatchExecution( import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ val _logicalPlan = analyzedPlan.transform { - case streamingRelation@StreamingRelation(dataSourceV1, sourceName, output) => + case streamingRelation @ StreamingRelation(dataSourceV1, sourceName, output) => toExecutionRelationMap.getOrElseUpdate(streamingRelation, { // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" @@ -122,7 +122,18 @@ class MicroBatchExecution( // v2 source case r: StreamingDataSourceV2Relation => r.stream } - uniqueSources = sources.distinct + uniqueSources = sources.distinct.map { + case source: SupportsAdmissionControl => + val limit = source.getDefaultReadLimit + if (trigger == OneTimeTrigger && limit != ReadLimit.allAvailable()) { + logWarning(s"The read limit $limit for $source is ignored when Trigger.Once() is used.") + source -> ReadLimit.allAvailable() + } else { + source -> limit + } + case other => + other -> ReadLimit.allAvailable() + }.toMap // TODO (SPARK-27484): we should add the writing node before the plan is analyzed. 
sink match { @@ -150,8 +161,7 @@ class MicroBatchExecution( state.set(TERMINATED) if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) - queryExecutionThread.interrupt() - queryExecutionThread.join() + interruptAndAwaitExecutionThreadTermination() // microBatchThread may spawn new jobs, so we need to cancel again to prevent a leak sparkSession.sparkContext.cancelJobGroup(runId.toString) } @@ -355,25 +365,33 @@ class MicroBatchExecution( // Generate a map from each unique source to the next available offset. val latestOffsets: Map[SparkDataStream, Option[OffsetV2]] = uniqueSources.map { - case s: Source => + case (s: SupportsAdmissionControl, limit) => updateStatusMessage(s"Getting offsets from $s") - reportTimeTaken("getOffset") { - (s, s.getOffset) + reportTimeTaken("latestOffset") { + val startOffsetOpt = availableOffsets.get(s) + val startOffset = s match { + case _: Source => + startOffsetOpt.orNull + case v2: MicroBatchStream => + startOffsetOpt.map(offset => v2.deserializeOffset(offset.json)) + .getOrElse(v2.initialOffset()) + } + (s, Option(s.latestOffset(startOffset, limit))) } - case s: RateControlMicroBatchStream => + case (s: Source, _) => updateStatusMessage(s"Getting offsets from $s") - reportTimeTaken("latestOffset") { - val startOffset = availableOffsets - .get(s).map(off => s.deserializeOffset(off.json)) - .getOrElse(s.initialOffset()) - (s, Option(s.latestOffset(startOffset))) + reportTimeTaken("getOffset") { + (s, s.getOffset) } - case s: MicroBatchStream => + case (s: MicroBatchStream, _) => updateStatusMessage(s"Getting offsets from $s") reportTimeTaken("latestOffset") { (s, Option(s.latestOffset())) } - }.toMap + case (s, _) => + // for some reason, the compiler is unhappy and thinks the match is not exhaustive + throw new IllegalStateException(s"Unexpected source: $s") + } availableOffsets ++= latestOffsets.filter { case (_, o) => o.nonEmpty }.mapValues(_.get) // Update the query metadata @@ -545,11 
+563,11 @@ class MicroBatchExecution( } val nextBatch = - new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(lastExecution.analyzed.schema)) + new Dataset(lastExecution, RowEncoder(lastExecution.analyzed.schema)) val batchSinkProgress: Option[StreamWriterCommitProgress] = reportTimeTaken("addBatch") { - SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { + SQLExecution.withNewExecutionId(lastExecution) { sink match { case s: Sink => s.addBatch(currentBatchId, nextBatch) case _: SupportsWrite => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index b6fa2e9dc3612..1c59464268444 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -22,9 +22,9 @@ import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.RuntimeConfig -import org.apache.spark.sql.execution.streaming.state.{FlatMapGroupsWithStateExecHelper, StreamingAggregationStateManager} +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, SparkDataStream} +import org.apache.spark.sql.execution.streaming.state.{FlatMapGroupsWithStateExecHelper, StreamingAggregationStateManager, SymmetricHashJoinStateManager} import org.apache.spark.sql.internal.SQLConf.{FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, _} -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, SparkDataStream} /** @@ -91,7 +91,8 @@ object OffsetSeqMetadata extends Logging { private implicit val format = Serialization.formats(NoTypeHints) private val relevantSQLConfs = Seq( SHUFFLE_PARTITIONS, STATE_STORE_PROVIDER_CLASS, STREAMING_MULTIPLE_WATERMARK_POLICY, - FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, STREAMING_AGGREGATION_STATE_FORMAT_VERSION) + 
FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION, STREAMING_AGGREGATION_STATE_FORMAT_VERSION, + STREAMING_JOIN_STATE_FORMAT_VERSION) /** * Default values of relevant configurations that are used for backward compatibility. @@ -108,7 +109,9 @@ object OffsetSeqMetadata extends Logging { FLATMAPGROUPSWITHSTATE_STATE_FORMAT_VERSION.key -> FlatMapGroupsWithStateExecHelper.legacyVersion.toString, STREAMING_AGGREGATION_STATE_FORMAT_VERSION.key -> - StreamingAggregationStateManager.legacyVersion.toString + StreamingAggregationStateManager.legacyVersion.toString, + STREAMING_JOIN_STATE_FORMAT_VERSION.key -> + SymmetricHashJoinStateManager.legacyVersion.toString ) def apply(json: String): OffsetSeqMetadata = Serialization.read[OffsetSeqMetadata](json) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala index b40426aff0e79..f6543c3e4c4ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala @@ -24,7 +24,7 @@ import java.nio.charset.StandardCharsets._ import scala.io.{Source => IOSource} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} /** * This class is used to log offsets to persistent files in HDFS. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index 6cb75083d0c0b..f20291e11fd70 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -24,13 +24,14 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan} -import org.apache.spark.sql.catalyst.util.DateTimeUtils._ +import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_SECOND +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, SparkDataStream} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.datasources.v2.{MicroBatchScanExec, StreamingDataSourceV2Relation, StreamWriterCommitProgress} -import org.apache.spark.sql.sources.v2.Table -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, SparkDataStream} import org.apache.spark.sql.streaming._ import org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent import org.apache.spark.util.Clock @@ -88,7 +89,7 @@ trait ProgressReporter extends Logging { private var lastNoDataProgressEventTime = Long.MinValue private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 - timestampFormat.setTimeZone(getTimeZone("UTC")) + timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC")) @volatile protected var currentStatus: StreamingQueryStatus = { @@ -147,8 +148,8 @@ trait ProgressReporter extends Logging { 
currentTriggerEndTimestamp = triggerClock.getTimeMillis() val executionStats = extractExecutionStats(hasNewData) - val processingTimeSec = - (currentTriggerEndTimestamp - currentTriggerStartTimestamp).toDouble / MILLIS_PER_SECOND + val processingTimeMills = currentTriggerEndTimestamp - currentTriggerStartTimestamp + val processingTimeSec = Math.max(1L, processingTimeMills).toDouble / MILLIS_PER_SECOND val inputTimeSec = if (lastTriggerStartTimestamp >= 0) { (currentTriggerStartTimestamp - lastTriggerStartTimestamp).toDouble / MILLIS_PER_SECOND @@ -172,6 +173,7 @@ trait ProgressReporter extends Logging { val sinkProgress = SinkProgress( sink.toString, sinkCommitProgress.map(_.numOutputRows)) + val observedMetrics = extractObservedMetrics(hasNewData, lastExecution) val newProgress = new StreamingQueryProgress( id = id, @@ -179,11 +181,13 @@ trait ProgressReporter extends Logging { name = name, timestamp = formatTimestamp(currentTriggerStartTimestamp), batchId = currentBatchId, + batchDuration = processingTimeMills, durationMs = new java.util.HashMap(currentDurationsMs.toMap.mapValues(long2Long).asJava), eventTime = new java.util.HashMap(executionStats.eventTimeStats.asJava), stateOperators = executionStats.stateOperators.toArray, sources = sourceProgress.toArray, - sink = sinkProgress) + sink = sinkProgress, + observedMetrics = new java.util.HashMap(observedMetrics.asJava)) if (hasNewData) { // Reset noDataEventTimestamp if we processed any data @@ -322,6 +326,16 @@ trait ProgressReporter extends Logging { } } + /** Extracts observed metrics from the most recent query execution. */ + private def extractObservedMetrics( + hasNewData: Boolean, + lastExecution: QueryExecution): Map[String, Row] = { + if (!hasNewData || lastExecution == null) { + return Map.empty + } + lastExecution.observedMetrics + } + /** Records the duration of running `body` for the next query progress update. 
*/ protected def reportTimeTaken[T](triggerDetailKey: String)(body: => T): T = { val startTime = triggerClock.getTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala index 02fed50485b94..84f0961e4af12 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -20,10 +20,10 @@ package org.apache.spark.sql.execution.streaming import org.json4s.DefaultFormats import org.json4s.jackson.Serialization -import org.apache.spark.sql.sources.v2 +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, ValueRunTimeMsPair]) - extends v2.reader.streaming.Offset { + extends OffsetV2 { implicit val defaultFormats: DefaultFormats = DefaultFormats override val json = Serialization.write(partitionToValueAndRunTimeMs) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala index 190325fb7ec25..36c7796ec4399 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.streaming import java.util import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.sources.v2.{Table, TableCapability} +import org.apache.spark.sql.connector.catalog.{Table, TableCapability} import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala index 7f66d0b055cc3..6d51d7dc44171 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} -import org.apache.spark.sql.sources.v2.reader.streaming.SparkDataStream +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 7c1f6ca42c1f2..8b3534bc0837a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.execution.streaming import java.io.{InterruptedIOException, IOException, UncheckedIOException} import java.nio.channels.ClosedByInterruptException import java.util.UUID -import java.util.concurrent.{CountDownLatch, ExecutionException, TimeUnit} +import java.util.concurrent.{CountDownLatch, ExecutionException, TimeoutException, TimeUnit} import java.util.concurrent.atomic.AtomicReference -import java.util.concurrent.locks.{Condition, ReentrantLock} +import java.util.concurrent.locks.ReentrantLock import scala.collection.JavaConverters._ import scala.collection.mutable.{Map => MutableMap} @@ -36,14 +36,14 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, ReadLimit, 
SparkDataStream} +import org.apache.spark.sql.connector.write.{LogicalWriteInfoImpl, SupportsTruncate} +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.execution.datasources.v2.StreamWriterCommitProgress import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.{SupportsWrite, Table} -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, SparkDataStream} -import org.apache.spark.sql.sources.v2.writer.SupportsTruncate -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite import org.apache.spark.sql.streaming._ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.{Clock, UninterruptibleThread, Utils} @@ -206,7 +206,7 @@ abstract class StreamExecution( /** * A list of unique sources in the query plan. This will be set when generating logical plan. */ - @volatile protected var uniqueSources: Seq[SparkDataStream] = Seq.empty + @volatile protected var uniqueSources: Map[SparkDataStream, ReadLimit] = Map.empty /** Defines the internal state of execution */ protected val state = new AtomicReference[State](INITIALIZING) @@ -307,7 +307,8 @@ abstract class StreamExecution( } // `postEvent` does not throw non fatal exception. - postEvent(new QueryStartedEvent(id, runId, name)) + val submissionTime = triggerClock.getTimeMillis() + postEvent(new QueryStartedEvent(id, runId, name, submissionTime)) // Unblock starting thread startLatch.countDown() @@ -424,7 +425,7 @@ abstract class StreamExecution( /** Stops all streaming sources safely. 
*/ protected def stopSources(): Unit = { - uniqueSources.foreach { source => + uniqueSources.foreach { case (source, _) => try { source.stop() } catch { @@ -434,6 +435,30 @@ abstract class StreamExecution( } } + /** + * Interrupts the query execution thread and awaits its termination until it exceeds the + * timeout. The timeout can be set on "spark.sql.streaming.stopTimeout". + * + * @throws TimeoutException If the thread cannot be stopped within the timeout + */ + @throws[TimeoutException] + protected def interruptAndAwaitExecutionThreadTermination(): Unit = { + val timeout = math.max( + sparkSession.sessionState.conf.getConf(SQLConf.STREAMING_STOP_TIMEOUT), 0) + queryExecutionThread.interrupt() + queryExecutionThread.join(timeout) + if (queryExecutionThread.isAlive) { + val stackTraceException = new SparkException("The stream thread was last executing:") + stackTraceException.setStackTrace(queryExecutionThread.getStackTrace) + val timeoutException = new TimeoutException( + s"Stream Execution thread failed to stop within $timeout milliseconds (specified by " + + s"${SQLConf.STREAMING_STOP_TIMEOUT.key}). See the cause on what was " + + "being executed in the streaming query thread.") + timeoutException.initCause(stackTraceException) + throw timeoutException + } + } + /** * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. @@ -578,17 +603,21 @@ abstract class StreamExecution( protected def getBatchDescriptionString: String = { val batchDescription = if (currentBatchId < 0) "init" else currentBatchId.toString - Option(name).map(_ + "
    ").getOrElse("") + - s"id = $id
    runId = $runId
    batch = $batchDescription" + s"""|${Option(name).getOrElse("")} + |id = $id + |runId = $runId + |batch = $batchDescription""".stripMargin } protected def createStreamingWrite( table: SupportsWrite, options: Map[String, String], inputPlan: LogicalPlan): StreamingWrite = { - val writeBuilder = table.newWriteBuilder(new CaseInsensitiveStringMap(options.asJava)) - .withQueryId(id.toString) - .withInputDataSchema(inputPlan.schema) + val info = LogicalWriteInfoImpl( + queryId = id.toString, + inputPlan.schema, + new CaseInsensitiveStringMap(options.asJava)) + val writeBuilder = table.newWriteBuilder(info) outputMode match { case Append => writeBuilder.buildForStreaming() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala index 7dd491ede9d05..1b8d69ffb7521 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala @@ -63,7 +63,7 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) * are dispatched to Spark listener bus. This method is guaranteed to be called by queries in * the same SparkSession as this listener. 
*/ - def post(event: StreamingQueryListener.Event) { + def post(event: StreamingQueryListener.Event): Unit = { event match { case s: QueryStartedEvent => activeQueryRunIds.synchronized { activeQueryRunIds += s.runId } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index 142b6e7d18068..5858c54ce554a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -23,10 +23,10 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} +import org.apache.spark.sql.connector.catalog.{Table, TableProvider} +import org.apache.spark.sql.connector.read.streaming.SparkDataStream import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.sources.v2.{Table, TableProvider} -import org.apache.spark.sql.sources.v2.reader.streaming.SparkDataStream import org.apache.spark.sql.util.CaseInsensitiveStringMap object StreamingRelation { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala index 50cf971e4ec3c..198e17db419a7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala @@ -21,13 +21,14 @@ import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import 
org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, JoinedRow, Literal, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, JoinedRow, Literal, Predicate, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper._ import org.apache.spark.sql.execution.streaming.state._ +import org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager.KeyToValuePair import org.apache.spark.sql.internal.SessionState import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} @@ -109,7 +110,7 @@ import org.apache.spark.util.{CompletionIterator, SerializableConfiguration} * * 3. When both window in join key and time range conditions are present, case 1 + 2. * In this case, since window equality is a stricter condition than the time range, we can - * use the the State Key Watermark = event time watermark to discard state (similar to case 1). + * use the State Key Watermark = event time watermark to discard state (similar to case 1). 
* * @param leftKeys Expression to generate key rows for joining from left input * @param rightKeys Expression to generate key rows for joining from right input @@ -131,6 +132,7 @@ case class StreamingSymmetricHashJoinExec( stateInfo: Option[StatefulOperatorStateInfo], eventTimeWatermark: Option[Long], stateWatermarkPredicates: JoinStateWatermarkPredicates, + stateFormatVersion: Int, left: SparkPlan, right: SparkPlan) extends SparkPlan with BinaryExecNode with StateStoreWriter { @@ -139,13 +141,20 @@ case class StreamingSymmetricHashJoinExec( rightKeys: Seq[Expression], joinType: JoinType, condition: Option[Expression], + stateFormatVersion: Int, left: SparkPlan, right: SparkPlan) = { this( leftKeys, rightKeys, joinType, JoinConditionSplitPredicates(condition, left, right), stateInfo = None, eventTimeWatermark = None, - stateWatermarkPredicates = JoinStateWatermarkPredicates(), left, right) + stateWatermarkPredicates = JoinStateWatermarkPredicates(), stateFormatVersion, left, right) + } + + if (stateFormatVersion < 2 && joinType != Inner) { + throw new IllegalArgumentException("The query is using stream-stream outer join with state" + + s" format version ${stateFormatVersion} - correctness issue is discovered. Please discard" + + " the checkpoint and rerun the query. 
See SPARK-26154 for more details.") } private def throwBadJoinTypeException(): Nothing = { @@ -206,6 +215,7 @@ case class StreamingSymmetricHashJoinExec( } private def processPartitions( + partitionId: Int, leftInputIter: Iterator[InternalRow], rightInputIter: Iterator[InternalRow]): Iterator[InternalRow] = { if (stateInfo.isEmpty) { @@ -224,14 +234,15 @@ case class StreamingSymmetricHashJoinExec( val joinedRow = new JoinedRow + val inputSchema = left.output ++ right.output val postJoinFilter = - newPredicate(condition.bothSides.getOrElse(Literal(true)), left.output ++ right.output).eval _ + Predicate.create(condition.bothSides.getOrElse(Literal(true)), inputSchema).eval _ val leftSideJoiner = new OneSideHashJoiner( LeftSide, left.output, leftKeys, leftInputIter, - condition.leftSideOnly, postJoinFilter, stateWatermarkPredicates.left) + condition.leftSideOnly, postJoinFilter, stateWatermarkPredicates.left, partitionId) val rightSideJoiner = new OneSideHashJoiner( RightSide, right.output, rightKeys, rightInputIter, - condition.rightSideOnly, postJoinFilter, stateWatermarkPredicates.right) + condition.rightSideOnly, postJoinFilter, stateWatermarkPredicates.right, partitionId) // Join one side input using the other side's buffered/state rows. Here is how it is done. // @@ -270,20 +281,30 @@ case class StreamingSymmetricHashJoinExec( // * Getting an iterator over the rows that have aged out on the left side. These rows are // candidates for being null joined. Note that to avoid doing two passes, this iterator // removes the rows from the state manager as they're processed. - // * Checking whether the current row matches a key in the right side state, and that key - // has any value which satisfies the filter function when joined. If it doesn't, - // we know we can join with null, since there was never (including this batch) a match - // within the watermark period. If it does, there must have been a match at some point, so - // we know we can't join with null. 
+ // * (state format version 1) Checking whether the current row matches a key in the + // right side state, and that key has any value which satisfies the filter function when + // joined. If it doesn't, we know we can join with null, since there was never + // (including this batch) a match within the watermark period. If it does, there must have + // been a match at some point, so we know we can't join with null. + // * (state format version 2) We found edge-case of above approach which brings correctness + // issue, and had to take another approach (see SPARK-26154); now Spark stores 'matched' + // flag along with row, which is set to true when there's any matching row on the right. + def matchesWithRightSideState(leftKeyValue: UnsafeRowPair) = { rightSideJoiner.get(leftKeyValue.key).exists { rightValue => postJoinFilter(joinedRow.withLeft(leftKeyValue.value).withRight(rightValue)) } } val removedRowIter = leftSideJoiner.removeOldState() - val outerOutputIter = removedRowIter - .filterNot(pair => matchesWithRightSideState(pair)) - .map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) + val outerOutputIter = removedRowIter.filterNot { kv => + stateFormatVersion match { + case 1 => matchesWithRightSideState(new UnsafeRowPair(kv.key, kv.value)) + case 2 => kv.matched + case _ => + throw new IllegalStateException("Unexpected state format version! 
" + + s"version $stateFormatVersion") + } + }.map(pair => joinedRow.withLeft(pair.value).withRight(nullRight)) innerOutputIter ++ outerOutputIter case RightOuter => @@ -294,9 +315,15 @@ case class StreamingSymmetricHashJoinExec( } } val removedRowIter = rightSideJoiner.removeOldState() - val outerOutputIter = removedRowIter - .filterNot(pair => matchesWithLeftSideState(pair)) - .map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) + val outerOutputIter = removedRowIter.filterNot { kv => + stateFormatVersion match { + case 1 => matchesWithLeftSideState(new UnsafeRowPair(kv.key, kv.value)) + case 2 => kv.matched + case _ => + throw new IllegalStateException("Unexpected state format version! " + + s"version $stateFormatVersion") + } + }.map(pair => joinedRow.withLeft(nullLeft).withRight(pair.value)) innerOutputIter ++ outerOutputIter case _ => throwBadJoinTypeException() @@ -380,6 +407,7 @@ case class StreamingSymmetricHashJoinExec( * @param stateWatermarkPredicate The state watermark predicate. See * [[StreamingSymmetricHashJoinExec]] for further description of * state watermarks. + * @param partitionId A partition ID of source RDD. */ private class OneSideHashJoiner( joinSide: JoinSide, @@ -388,30 +416,32 @@ case class StreamingSymmetricHashJoinExec( inputIter: Iterator[InternalRow], preJoinFilterExpr: Option[Expression], postJoinFilter: (InternalRow) => Boolean, - stateWatermarkPredicate: Option[JoinStateWatermarkPredicate]) { + stateWatermarkPredicate: Option[JoinStateWatermarkPredicate], + partitionId: Int) { // Filter the joined rows based on the given condition. 
val preJoinFilter = - newPredicate(preJoinFilterExpr.getOrElse(Literal(true)), inputAttributes).eval _ + Predicate.create(preJoinFilterExpr.getOrElse(Literal(true)), inputAttributes).eval _ private val joinStateManager = new SymmetricHashJoinStateManager( - joinSide, inputAttributes, joinKeys, stateInfo, storeConf, hadoopConfBcast.value.value) + joinSide, inputAttributes, joinKeys, stateInfo, storeConf, hadoopConfBcast.value.value, + partitionId, stateFormatVersion) private[this] val keyGenerator = UnsafeProjection.create(joinKeys, inputAttributes) private[this] val stateKeyWatermarkPredicateFunc = stateWatermarkPredicate match { case Some(JoinStateKeyWatermarkPredicate(expr)) => // inputSchema can be empty as expr should only have BoundReferences and does not require // the schema to generated predicate. See [[StreamingSymmetricHashJoinHelper]]. - newPredicate(expr, Seq.empty).eval _ + Predicate.create(expr, Seq.empty).eval _ case _ => - newPredicate(Literal(false), Seq.empty).eval _ // false = do not remove if no predicate + Predicate.create(Literal(false), Seq.empty).eval _ // false = do not remove if no predicate } private[this] val stateValueWatermarkPredicateFunc = stateWatermarkPredicate match { case Some(JoinStateValueWatermarkPredicate(expr)) => - newPredicate(expr, inputAttributes).eval _ + Predicate.create(expr, inputAttributes).eval _ case _ => - newPredicate(Literal(false), Seq.empty).eval _ // false = do not remove if no predicate + Predicate.create(Literal(false), Seq.empty).eval _ // false = do not remove if no predicate } private[this] var updatedStateRowsCount = 0 @@ -431,7 +461,7 @@ case class StreamingSymmetricHashJoinExec( val nonLateRows = WatermarkSupport.watermarkExpression(watermarkAttribute, eventTimeWatermark) match { case Some(watermarkExpr) => - val predicate = newPredicate(watermarkExpr, inputAttributes) + val predicate = Predicate.create(watermarkExpr, inputAttributes) inputIter.filter { row => !predicate.eval(row) } case None => 
inputIter @@ -445,16 +475,9 @@ case class StreamingSymmetricHashJoinExec( // the case of inner join). if (preJoinFilter(thisRow)) { val key = keyGenerator(thisRow) - val outputIter = otherSideJoiner.joinStateManager.get(key).map { thatRow => - generateJoinedRow(thisRow, thatRow) - }.filter(postJoinFilter) - val shouldAddToState = // add only if both removal predicates do not match - !stateKeyWatermarkPredicateFunc(key) && !stateValueWatermarkPredicateFunc(thisRow) - if (shouldAddToState) { - joinStateManager.append(key, thisRow) - updatedStateRowsCount += 1 - } - outputIter + val outputIter: Iterator[JoinedRow] = otherSideJoiner.joinStateManager + .getJoinedRows(key, thatRow => generateJoinedRow(thisRow, thatRow), postJoinFilter) + new AddingProcessedRowToStateCompletionIterator(key, thisRow, outputIter) } else { joinSide match { case LeftSide if joinType == LeftOuter => @@ -467,6 +490,23 @@ case class StreamingSymmetricHashJoinExec( } } + private class AddingProcessedRowToStateCompletionIterator( + key: UnsafeRow, + thisRow: UnsafeRow, + subIter: Iterator[JoinedRow]) + extends CompletionIterator[JoinedRow, Iterator[JoinedRow]](subIter) { + private val iteratorNotEmpty: Boolean = super.hasNext + + override def completion(): Unit = { + val shouldAddToState = // add only if both removal predicates do not match + !stateKeyWatermarkPredicateFunc(key) && !stateValueWatermarkPredicateFunc(thisRow) + if (shouldAddToState) { + joinStateManager.append(key, thisRow, matched = iteratorNotEmpty) + updatedStateRowsCount += 1 + } + } + } + /** * Get an iterator over the values stored in this joiner's state manager for the given key. * @@ -486,7 +526,7 @@ case class StreamingSymmetricHashJoinExec( * We do this to avoid requiring either two passes or full materialization when * processing the rows for outer join. 
*/ - def removeOldState(): Iterator[UnsafeRowPair] = { + def removeOldState(): Iterator[KeyToValuePair] = { stateWatermarkPredicate match { case Some(JoinStateKeyWatermarkPredicate(expr)) => joinStateManager.removeByKeyCondition(stateKeyWatermarkPredicateFunc) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index 2d4c3c10e6445..cdd3a854c9a90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -18,11 +18,10 @@ package org.apache.spark.sql.execution.streaming import scala.reflect.ClassTag -import scala.util.control.NonFatal -import org.apache.spark.{Partition, SparkContext} +import org.apache.spark.{Partition, SparkContext, TaskContext} import org.apache.spark.internal.Logging -import org.apache.spark.rdd.{RDD, ZippedPartitionsRDD2} +import org.apache.spark.rdd.{RDD, ZippedPartitionsBaseRDD, ZippedPartitionsPartition, ZippedPartitionsRDD2} import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper import org.apache.spark.sql.catalyst.expressions.{Add, And, Attribute, AttributeReference, AttributeSet, BoundReference, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, NamedExpression, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ @@ -203,17 +202,18 @@ object StreamingSymmetricHashJoinHelper extends Logging { /** * A custom RDD that allows partitions to be "zipped" together, while ensuring the tasks' * preferred location is based on which executors have the required join state stores already - * loaded. 
This is class is a modified version of [[ZippedPartitionsRDD2]]. + * loaded. This class is a variant of [[ZippedPartitionsRDD2]] which only changes signature + * of `f`. */ class StateStoreAwareZipPartitionsRDD[A: ClassTag, B: ClassTag, V: ClassTag]( sc: SparkContext, - f: (Iterator[A], Iterator[B]) => Iterator[V], - rdd1: RDD[A], - rdd2: RDD[B], + var f: (Int, Iterator[A], Iterator[B]) => Iterator[V], + var rdd1: RDD[A], + var rdd2: RDD[B], stateInfo: StatefulOperatorStateInfo, stateStoreNames: Seq[String], @transient private val storeCoordinator: Option[StateStoreCoordinatorRef]) - extends ZippedPartitionsRDD2[A, B, V](sc, f, rdd1, rdd2) { + extends ZippedPartitionsBaseRDD[V](sc, List(rdd1, rdd2)) { /** * Set the preferred location of each partition using the executor that has the related @@ -225,6 +225,24 @@ object StreamingSymmetricHashJoinHelper extends Logging { storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)) }.distinct } + + override def compute(s: Partition, context: TaskContext): Iterator[V] = { + val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions + if (partitions(0).index != partitions(1).index) { + throw new IllegalStateException(s"Partition ID should be same in both side: " + + s"left ${partitions(0).index} , right ${partitions(1).index}") + } + + val partitionId = partitions(0).index + f(partitionId, rdd1.iterator(partitions(0), context), rdd2.iterator(partitions(1), context)) + } + + override def clearDependencies(): Unit = { + super.clearDependencies() + rdd1 = null + rdd2 = null + f = null + } } implicit class StateStoreAwareZipPartitionsHelper[T: ClassTag](dataRDD: RDD[T]) { @@ -239,7 +257,7 @@ object StreamingSymmetricHashJoinHelper extends Logging { stateInfo: StatefulOperatorStateInfo, storeNames: Seq[String], storeCoordinator: StateStoreCoordinatorRef - )(f: (Iterator[T], Iterator[U]) => Iterator[V]): RDD[V] = { + )(f: (Int, Iterator[T], Iterator[U]) => Iterator[V]): RDD[V] = { new 
StateStoreAwareZipPartitionsRDD( dataRDD.sparkContext, f, dataRDD, dataRDD2, stateInfo, storeNames, Some(storeCoordinator)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala index 2bdb3402c14b1..1a27fe61d9602 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala @@ -21,8 +21,10 @@ import java.util.concurrent.TimeUnit import scala.concurrent.duration.Duration +import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_DAY +import org.apache.spark.sql.catalyst.util.IntervalUtils import org.apache.spark.sql.streaming.Trigger -import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.unsafe.types.UTF8String private object Triggers { def validate(intervalMs: Long): Unit = { @@ -30,11 +32,11 @@ private object Triggers { } def convert(interval: String): Long = { - val cal = CalendarInterval.fromCaseInsensitiveString(interval) - if (cal.months > 0) { + val cal = IntervalUtils.stringToInterval(UTF8String.fromString(interval)) + if (cal.months != 0) { throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval") } - TimeUnit.MICROSECONDS.toMillis(cal.microseconds) + TimeUnit.MICROSECONDS.toMillis(cal.microseconds + cal.days * MICROS_PER_DAY) } def convert(interval: Duration): Long = interval.toMillis diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala index 76ab1284633b1..b0f8cf9cd1846 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/WatermarkTracker.scala @@ -63,7 +63,7 @@ case object MinWatermark extends 
MultipleWatermarkPolicy { } /** - * Policy to choose the *min* of the operator watermark values as the global watermark value. So the + * Policy to choose the *max* of the operator watermark values as the global watermark value. So the * global watermark will advance if any of the individual operator watermarks has advanced. * In other words, in a streaming query with multiple input streams and watermarks defined on all * of them, the global watermark will advance as fast as the fastest input. So if there is watermark @@ -108,10 +108,9 @@ case class WatermarkTracker(policy: MultipleWatermarkPolicy) extends Logging { } } - // Update the global watermark to the minimum of all watermark nodes. - // This is the safest option, because only the global watermark is fault-tolerant. Making - // it the minimum of all individual watermarks guarantees it will never advance past where - // any individual watermark operator would be if it were in a plan by itself. + // Update the global watermark accordingly to the chosen policy. To find all available policies + // and their semantics, please check the comments of + // `org.apache.spark.sql.execution.streaming.MultipleWatermarkPolicy` implementations. 
val chosenGlobalWatermark = policy.chooseGlobalWatermark(operatorToWatermarkMap.values.toSeq) if (chosenGlobalWatermark > globalWatermarkMs) { logInfo(s"Updating event-time watermark from $globalWatermarkMs to $chosenGlobalWatermark ms") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala index 9ae39c79c5156..e471e6c601d16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/console.scala @@ -22,11 +22,12 @@ import java.util import scala.collection.JavaConverters._ import org.apache.spark.sql._ +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, SupportsTruncate, WriteBuilder} +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.streaming.sources.ConsoleWrite +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.writer.{SupportsTruncate, WriteBuilder} -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -35,7 +36,7 @@ case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) override def schema: StructType = data.schema } -class ConsoleSinkProvider extends TableProvider +class ConsoleSinkProvider extends SimpleTableProvider with DataSourceRegister with CreatableRelationProvider { @@ -71,21 +72,16 @@ object ConsoleTable extends Table with SupportsWrite { Set(TableCapability.STREAMING_WRITE).asJava } - override def newWriteBuilder(options: 
CaseInsensitiveStringMap): WriteBuilder = { + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder with SupportsTruncate { - private var inputSchema: StructType = _ - - override def withInputDataSchema(schema: StructType): WriteBuilder = { - this.inputSchema = schema - this - } + private val inputSchema: StructType = info.schema() // Do nothing for truncate. Console sink is special that it just prints all the records. override def truncate(): WriteBuilder = this override def buildForStreaming(): StreamingWrite = { assert(inputSchema != null) - new ConsoleWrite(inputSchema, options) + new ConsoleWrite(inputSchema, info.options) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala index b68f67e0b22d9..5ee27c71aa731 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDD.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousPartitionReaderFactory +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.streaming.ContinuousPartitionReaderFactory import org.apache.spark.sql.types.StructType import org.apache.spark.util.NextIterator diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index f6d156ded7663..a109c2171f3d2 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -28,12 +28,11 @@ import org.apache.spark.SparkEnv import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.{CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, TableCapability} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, Offset => OffsetV2, PartitionOffset, ReadLimit} import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming.{StreamingRelationV2, _} -import org.apache.spark.sql.sources.v2 -import org.apache.spark.sql.sources.v2.{SupportsRead, SupportsWrite, TableCapability} -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, PartitionOffset} import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.util.Clock @@ -85,7 +84,7 @@ class ContinuousExecution( sources = _logicalPlan.collect { case r: StreamingDataSourceV2Relation => r.stream.asInstanceOf[ContinuousStream] } - uniqueSources = sources.distinct + uniqueSources = sources.distinct.map(s => s -> ReadLimit.allAvailable()).toMap // TODO (SPARK-27484): we should add the writing node before the plan is analyzed. 
WriteToContinuousDataSource( @@ -253,7 +252,7 @@ class ContinuousExecution( updateStatusMessage("Running") reportTimeTaken("runContinuous") { - SQLExecution.withNewExecutionId(sparkSessionForQuery, lastExecution) { + SQLExecution.withNewExecutionId(lastExecution) { lastExecution.executedPlan.execute() } } @@ -340,7 +339,7 @@ class ContinuousExecution( val offset = sources(0).deserializeOffset(offsetLog.get(epoch).get.offsets(0).get.json) committedOffsets ++= Seq(sources(0) -> offset) - sources(0).commit(offset.asInstanceOf[v2.reader.streaming.Offset]) + sources(0).commit(offset.asInstanceOf[OffsetV2]) } else { return } @@ -428,8 +427,7 @@ class ContinuousExecution( if (queryExecutionThread.isAlive) { // The query execution thread will clean itself up in the finally clause of runContinuous. // We just need to interrupt the long running job. - queryExecutionThread.interrupt() - queryExecutionThread.join() + interruptAndAwaitExecutionThreadTermination() } logInfo(s"Query $prettyIdString was stopped") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala index 65c5fc63c2f46..dff2fa69e42fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousQueuedDataReader.scala @@ -26,7 +26,7 @@ import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousPartitionReader, PartitionOffset} +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReader, PartitionOffset} import 
org.apache.spark.sql.types.StructType import org.apache.spark.util.ThreadUtils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index e1b7a8fc283d3..e66a1fe48a2e0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -22,9 +22,9 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReader, ContinuousPartitionReaderFactory, ContinuousStream, Offset, PartitionOffset} import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming._ case class RateStreamPartitionOffset( partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala index 2263b42870a65..fc47c5ed3ac00 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTextSocketSource.scala @@ -32,10 +32,12 @@ import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.InternalRow +import 
org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReader, ContinuousPartitionReaderFactory, ContinuousStream, Offset, PartitionOffset} import org.apache.spark.sql.execution.streaming.{Offset => _, _} import org.apache.spark.sql.execution.streaming.sources.TextSocketReader -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming._ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.RpcUtils @@ -54,6 +56,9 @@ class TextSocketContinuousStream( implicit val defaultFormats: DefaultFormats = DefaultFormats + private val encoder = ExpressionEncoder.tuple(ExpressionEncoder[String], + ExpressionEncoder[Timestamp]) + @GuardedBy("this") private var socket: Socket = _ @@ -61,7 +66,7 @@ class TextSocketContinuousStream( private var readThread: Thread = _ @GuardedBy("this") - private val buckets = Seq.fill(numPartitions)(new ListBuffer[(String, Timestamp)]) + private val buckets = Seq.fill(numPartitions)(new ListBuffer[UnsafeRow]) @GuardedBy("this") private var currentOffset: Int = -1 @@ -182,7 +187,8 @@ class TextSocketContinuousStream( Timestamp.valueOf( TextSocketReader.DATE_FORMAT.format(Calendar.getInstance().getTime())) ) - buckets(currentOffset % numPartitions) += newData + buckets(currentOffset % numPartitions) += encoder.toRow(newData) + .copy().asInstanceOf[UnsafeRow] } } } catch { @@ -240,6 +246,8 @@ class TextSocketContinuousPartitionReader( private var currentOffset = startOffset private var current: Option[InternalRow] = None + private val projectWithoutTimestamp = UnsafeProjection.create(TextSocketReader.SCHEMA_REGULAR) + override def next(): Boolean = { try { current = getRecord @@ -271,8 +279,7 @@ class TextSocketContinuousPartitionReader( if (includeTimestamp) { 
rec } else { - InternalRow(rec.get(0, TextSocketReader.SCHEMA_TIMESTAMP) - .asInstanceOf[(String, Timestamp)]._1) + projectWithoutTimestamp(rec) } ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala index a08411d746abe..909dda57ee586 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousWriteRDD.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.writer.DataWriter -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingDataWriterFactory +import org.apache.spark.sql.connector.write.DataWriter +import org.apache.spark.sql.connector.write.streaming.StreamingDataWriterFactory import org.apache.spark.util.Utils /** @@ -80,13 +80,15 @@ class ContinuousWriteRDD(var prev: RDD[InternalRow], writerFactory: StreamingDat logError(s"Writer for partition ${context.partitionId()} is aborting.") if (dataWriter != null) dataWriter.abort() logError(s"Writer for partition ${context.partitionId()} aborted.") + }, finallyBlock = { + dataWriter.close() }) } Iterator() } - override def clearDependencies() { + override def clearDependencies(): Unit = { super.clearDependencies() prev = null } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index decf524f7167c..dbddab2e9acdd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -23,9 +23,9 @@ import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, PartitionOffset} +import org.apache.spark.sql.connector.write.WriterCommitMessage +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.util.RpcUtils private[continuous] sealed trait EpochCoordinatorMessage extends Serializable diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSource.scala index 54f484c4adae3..cecb2843fc3b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSource.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite +import org.apache.spark.sql.connector.write.streaming.StreamingWrite /** * The logical plan for writing data in a continuous stream. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala index 2f3af6a6544c4..f1898ad3f27ca 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/WriteToContinuousDataSourceExec.scala @@ -24,9 +24,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.write.PhysicalWriteInfoImpl +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite /** * The physical plan for writing data into a continuous processing [[StreamingWrite]]. @@ -38,8 +39,10 @@ case class WriteToContinuousDataSourceExec(write: StreamingWrite, query: SparkPl override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { - val writerFactory = write.createStreamingWriterFactory() - val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) + val queryRdd = query.execute() + val writerFactory = write.createStreamingWriterFactory( + PhysicalWriteInfoImpl(queryRdd.getNumPartitions)) + val rdd = new ContinuousWriteRDD(queryRdd, writerFactory) logInfo(s"Start processing data source write support: $write. 
" + s"The input RDD has ${rdd.partitions.length} partitions.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index df149552dfb30..ea39c549bd072 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -31,10 +31,11 @@ import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.truncatedString +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory, Scan, ScanBuilder} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream, Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, MicroBatchStream, Offset => OffsetV2, SparkDataStream} +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -44,6 +45,9 @@ object MemoryStream { def apply[A : Encoder](implicit sqlContext: SQLContext): MemoryStream[A] = new MemoryStream[A](memoryStreamId.getAndIncrement(), sqlContext) + + def apply[A : Encoder](numPartitions: Int)(implicit sqlContext: SQLContext): MemoryStream[A] = + new MemoryStream[A](memoryStreamId.getAndIncrement(), sqlContext, Some(numPartitions)) } /** @@ -94,7 +98,7 @@ abstract class MemoryStreamBase[A : Encoder](sqlContext: SQLContext) extends Spa // This class is used to indicate the memory stream 
data source. We don't actually use it, as // memory stream is for test only and we never look it up by name. -object MemoryStreamTableProvider extends TableProvider { +object MemoryStreamTableProvider extends SimpleTableProvider { override def getTable(options: CaseInsensitiveStringMap): Table = { throw new IllegalStateException("MemoryStreamTableProvider should not be used.") } @@ -136,9 +140,14 @@ class MemoryStreamScanBuilder(stream: MemoryStreamBase[_]) extends ScanBuilder w * A [[Source]] that produces value stored in memory as they are added by the user. This [[Source]] * is intended for use in unit tests as it can only replay data when the object is still * available. + * + * If numPartitions is provided, the rows will be redistributed to the given number of partitions. */ -case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) - extends MemoryStreamBase[A](sqlContext) with MicroBatchStream with Logging { +case class MemoryStream[A : Encoder]( + id: Int, + sqlContext: SQLContext, + numPartitions: Option[Int] = None) + extends MemoryStreamBase[A](sqlContext) with MicroBatchStream with Logging { protected val output = logicalPlan.output @@ -206,9 +215,23 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) logDebug(generateDebugString(newBlocks.flatten, startOrdinal, endOrdinal)) - newBlocks.map { block => - new MemoryStreamInputPartition(block) - }.toArray + numPartitions match { + case Some(numParts) => + // When the number of partition is provided, we redistribute the rows into + // the given number of partition, via round-robin manner. 
+ val inputRows = newBlocks.flatten.toArray + (0 until numParts).map { newPartIdx => + val records = inputRows.zipWithIndex.filter { case (_, idx) => + idx % numParts == newPartIdx + }.map(_._1) + new MemoryStreamInputPartition(records) + }.toArray + + case _ => + newBlocks.map { block => + new MemoryStreamInputPartition(block) + }.toArray + } } } @@ -237,7 +260,7 @@ case class MemoryStream[A : Encoder](id: Int, sqlContext: SQLContext) lastOffsetCommitted = newOffset } - override def stop() {} + override def stop(): Unit = {} def reset(): Unit = synchronized { batches.clear() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWrite.scala index dbe242784986d..dc25289aa1e2d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWrite.scala @@ -20,12 +20,12 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation -import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage -import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} +import org.apache.spark.sql.connector.write.{PhysicalWriteInfo, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap -/** Common methods used to create writes for the the console sink */ +/** Common methods used to create writes for the console sink */ class ConsoleWrite(schema: StructType, options: CaseInsensitiveStringMap) extends StreamingWrite with Logging { @@ -38,7 +38,8 @@ class 
ConsoleWrite(schema: StructType, options: CaseInsensitiveStringMap) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get - def createStreamingWriterFactory(): StreamingDataWriterFactory = PackedRowWriterFactory + def createStreamingWriterFactory(info: PhysicalWriteInfo): StreamingDataWriterFactory = + PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala index 41eaf84b7f9ea..f94469385b281 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ContinuousMemoryStream.scala @@ -29,9 +29,10 @@ import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.{Encoder, SQLContext} import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReader, ContinuousPartitionReaderFactory, ContinuousStream, Offset, PartitionOffset} import org.apache.spark.sql.execution.streaming.{Offset => _, _} -import org.apache.spark.sql.sources.v2.reader.InputPartition -import org.apache.spark.sql.sources.v2.reader.streaming._ import org.apache.spark.util.RpcUtils /** @@ -50,7 +51,7 @@ class ContinuousMemoryStream[A : Encoder](id: Int, sqlContext: SQLContext, numPa // ContinuousReader implementation @GuardedBy("this") - private val records = Seq.fill(numPartitions)(new ListBuffer[A]) + private val records = 
Seq.fill(numPartitions)(new ListBuffer[UnsafeRow]) private val recordEndpoint = new ContinuousRecordEndpoint(records, this) @volatile private var endpointRef: RpcEndpointRef = _ @@ -58,7 +59,8 @@ class ContinuousMemoryStream[A : Encoder](id: Int, sqlContext: SQLContext, numPa def addData(data: TraversableOnce[A]): Offset = synchronized { // Distribute data evenly among partition lists. data.toSeq.zipWithIndex.map { - case (item, index) => records(index % numPartitions) += item + case (item, index) => + records(index % numPartitions) += encoder.toRow(item).copy().asInstanceOf[UnsafeRow] } // The new target offset is the offset where all records in all partitions have been processed. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala index 838c7d497e35b..6e4f40ad080d4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ForeachWriterTable.scala @@ -26,12 +26,11 @@ import org.apache.spark.sql.{ForeachWriter, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.write.{DataWriter, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.execution.python.PythonForeachWriter -import org.apache.spark.sql.sources.v2.{SupportsWrite, Table, TableCapability} -import org.apache.spark.sql.sources.v2.writer.{DataWriter, SupportsTruncate, WriteBuilder, WriterCommitMessage} 
-import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * A write-only table for forwarding data into the specified [[ForeachWriter]]. @@ -54,14 +53,9 @@ case class ForeachWriterTable[T]( Set(TableCapability.STREAMING_WRITE).asJava } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = { + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder with SupportsTruncate { - private var inputSchema: StructType = _ - - override def withInputDataSchema(schema: StructType): WriteBuilder = { - this.inputSchema = schema - this - } + private var inputSchema: StructType = info.schema() // Do nothing for truncate. Foreach sink is special that it just forwards all the records to // ForeachWriter. @@ -72,7 +66,8 @@ case class ForeachWriterTable[T]( override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} - override def createStreamingWriterFactory(): StreamingDataWriterFactory = { + override def createStreamingWriterFactory( + info: PhysicalWriteInfo): StreamingDataWriterFactory = { val rowConverter: InternalRow => T = converter match { case Left(enc) => val boundEnc = enc.resolveAndBind( @@ -134,7 +129,7 @@ class ForeachDataWriter[T]( // If open returns false, we should skip writing rows. 
private val opened = writer.open(partitionId, epochId) - private var closeCalled: Boolean = false + private var errorOrNull: Throwable = _ override def write(record: InternalRow): Unit = { if (!opened) return @@ -143,25 +138,24 @@ class ForeachDataWriter[T]( writer.process(rowConverter(record)) } catch { case t: Throwable => - closeWriter(t) + errorOrNull = t throw t } + } override def commit(): WriterCommitMessage = { - closeWriter(null) ForeachWriterCommitMessage } override def abort(): Unit = { - closeWriter(new SparkException("Foreach writer has been aborted due to a task failure")) + if (errorOrNull == null) { + errorOrNull = new SparkException("Foreach writer has been aborted due to a task failure") + } } - private def closeWriter(errorOrNull: Throwable): Unit = { - if (!closeCalled) { - closeCalled = true - writer.close(errorOrNull) - } + override def close(): Unit = { + writer.close(errorOrNull) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWrite.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWrite.scala index f3951897ea747..c2adc1dd6742a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWrite.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWrite.scala @@ -18,8 +18,8 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.writer.{BatchWrite, DataWriter, DataWriterFactory, WriterCommitMessage} -import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, PhysicalWriteInfo, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} /** * A [[BatchWrite]] used to hook V2 stream writers into a 
microbatch plan. It implements @@ -36,8 +36,8 @@ class MicroBatchWrite(eppchId: Long, val writeSupport: StreamingWrite) extends B writeSupport.abort(eppchId, messages) } - override def createBatchWriterFactory(): DataWriterFactory = { - new MicroBatchWriterFactory(eppchId, writeSupport.createStreamingWriterFactory()) + override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = { + new MicroBatchWriterFactory(eppchId, writeSupport.createStreamingWriterFactory(info)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala index fd4cb444ce580..507f860e0452a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala @@ -21,8 +21,8 @@ import scala.collection.mutable import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.writer.{BatchWrite, DataWriter, DataWriterFactory, WriterCommitMessage} -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingDataWriterFactory +import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.StreamingDataWriterFactory /** * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery @@ -56,10 +56,12 @@ class PackedRowDataWriter() extends DataWriter[InternalRow] with Logging { override def write(row: InternalRow): Unit = data.append(row.copy()) override def commit(): PackedRowCommitMessage = { - val msg = PackedRowCommitMessage(data.toArray) - data.clear() - msg + PackedRowCommitMessage(data.toArray) } - override def abort(): Unit = data.clear() + override def abort(): 
Unit = {} + + override def close(): Unit = { + data.clear() + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala index 156ba95ab9733..eb6baf698a5b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamMicroBatchStream.scala @@ -27,9 +27,9 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, Offset} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProvider.scala index f61e9dbecd4ea..a093bf54b2107 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProvider.scala @@ -23,11 +23,12 @@ import scala.collection.JavaConverters._ import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.read.{Scan, 
ScanBuilder} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousStream +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.{Scan, ScanBuilder} -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -45,7 +46,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap * generated rows. The source will try its best to reach `rowsPerSecond`, but the query may * be resource constrained, and `numPartitions` can be tweaked to help reach the desired speed. */ -class RateStreamProvider extends TableProvider with DataSourceRegister { +class RateStreamProvider extends SimpleTableProvider with DataSourceRegister { import RateStreamProvider._ override def getTable(options: CaseInsensitiveStringMap): Table = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala index 25e9af2bc2927..97a6576832515 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketMicroBatchStream.scala @@ -28,9 +28,9 @@ import scala.collection.mutable.ListBuffer import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory} +import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, 
Offset} import org.apache.spark.sql.execution.streaming.LongOffset -import org.apache.spark.sql.sources.v2.reader.{InputPartition, PartitionReader, PartitionReaderFactory} -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, Offset} import org.apache.spark.unsafe.types.UTF8String /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketSourceProvider.scala index 0f807e235661a..a4dcb2049eb87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketSourceProvider.scala @@ -26,15 +26,16 @@ import scala.util.{Failure, Success, Try} import org.apache.spark.internal.Logging import org.apache.spark.sql._ +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.read.{Scan, ScanBuilder} +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.execution.streaming.continuous.TextSocketContinuousStream +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.DataSourceRegister -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.{Scan, ScanBuilder} -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, MicroBatchStream} import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap -class TextSocketSourceProvider extends TableProvider with DataSourceRegister with Logging { +class TextSocketSourceProvider extends SimpleTableProvider with DataSourceRegister with Logging { private def checkParameters(params: CaseInsensitiveStringMap): Unit = { 
logWarning("The socket source should not be used for production applications! " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/WriteToMicroBatchDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/WriteToMicroBatchDataSource.scala index a3f58fa966fe8..ef1115e6d9e01 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/WriteToMicroBatchDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/WriteToMicroBatchDataSource.scala @@ -19,8 +19,8 @@ package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2 -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite /** * The logical plan for writing data to a micro-batch stream. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala index de8d00d4ac348..2b674070a70ad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memory.scala @@ -32,12 +32,11 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.write.{DataWriter, DataWriterFactory, LogicalWriteInfo, PhysicalWriteInfo, SupportsTruncate, WriteBuilder, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.sources.v2.{SupportsWrite, Table, TableCapability} -import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.types.StructType -import org.apache.spark.sql.util.CaseInsensitiveStringMap /** * A sink that stores the results in memory. 
This [[Sink]] is primarily intended for use in unit @@ -53,21 +52,16 @@ class MemorySink extends Table with SupportsWrite with Logging { Set(TableCapability.STREAMING_WRITE).asJava } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = { + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new WriteBuilder with SupportsTruncate { private var needTruncate: Boolean = false - private var inputSchema: StructType = _ + private val inputSchema: StructType = info.schema() override def truncate(): WriteBuilder = { this.needTruncate = true this } - override def withInputDataSchema(schema: StructType): WriteBuilder = { - this.inputSchema = schema - this - } - override def buildForStreaming(): StreamingWrite = { new MemoryStreamingWrite(MemorySink.this, inputSchema, needTruncate) } @@ -140,7 +134,7 @@ class MemoryStreamingWrite( val sink: MemorySink, schema: StructType, needTruncate: Boolean) extends StreamingWrite { - override def createStreamingWriterFactory: MemoryWriterFactory = { + override def createStreamingWriterFactory(info: PhysicalWriteInfo): MemoryWriterFactory = { MemoryWriterFactory(schema) } @@ -191,6 +185,8 @@ class MemoryDataWriter(partition: Int, schema: StructType) } override def abort(): Unit = {} + + override def close(): Unit = {} } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala index 6ee54b948a7d4..05c651f9951b9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/HDFSBackedStateStoreProvider.scala @@ -43,7 +43,7 @@ import org.apache.spark.util.{SizeEstimator, Utils} /** * An implementation of [[StateStoreProvider]] and [[StateStore]] in which all the data is backed - * by files in a 
HDFS-compatible file system. All updates to the store has to be done in sets + * by files in an HDFS-compatible file system. All updates to the store has to be done in sets * transactionally, and each set of updates increments the store's version. These versions can * be used to re-execute the updates (by retries in RDD operations) on the correct version of * the store, and regenerate the store version. @@ -79,7 +79,7 @@ private[state] class HDFSBackedStateStoreProvider extends StateStoreProvider wit // java.util.ConcurrentModificationException type MapType = java.util.concurrent.ConcurrentHashMap[UnsafeRow, UnsafeRow] - /** Implementation of [[StateStore]] API which is backed by a HDFS-compatible file system */ + /** Implementation of [[StateStore]] API which is backed by an HDFS-compatible file system */ class HDFSBackedStateStore(val version: Long, mapToUpdate: MapType) extends StateStore { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala index 43f22803e7685..1a0a43c083879 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManager.scala @@ -23,10 +23,12 @@ import org.apache.hadoop.conf.Configuration import org.apache.spark.TaskContext import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, Literal, SpecificInternalRow, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, JoinedRow, Literal, SpecificInternalRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.execution.streaming.{StatefulOperatorStateInfo, 
StreamingSymmetricHashJoinExec} import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper._ -import org.apache.spark.sql.types.{LongType, StructField, StructType} +import org.apache.spark.sql.execution.streaming.state.SymmetricHashJoinStateManager.KeyToValuePair +import org.apache.spark.sql.types.{BooleanType, LongType, StructField, StructType} import org.apache.spark.util.NextIterator /** @@ -42,10 +44,15 @@ import org.apache.spark.util.NextIterator * @param stateInfo Information about how to retrieve the correct version of state * @param storeConf Configuration for the state store. * @param hadoopConf Hadoop configuration for reading state data from storage + * @param partitionId A partition ID of source RDD. + * @param stateFormatVersion The version of format for state. * * Internally, the key -> multiple values is stored in two [[StateStore]]s. * - Store 1 ([[KeyToNumValuesStore]]) maintains mapping between key -> number of values - * - Store 2 ([[KeyWithIndexToValueStore]]) maintains mapping between (key, index) -> value + * - Store 2 ([[KeyWithIndexToValueStore]]) maintains mapping; the mapping depends on the state + * format version: + * - version 1: [(key, index) -> value] + * - version 2: [(key, index) -> (value, matched)] * - Put: update count in KeyToNumValuesStore, * insert new (key, count) -> value in KeyWithIndexToValueStore * - Get: read count from KeyToNumValuesStore, @@ -54,7 +61,7 @@ import org.apache.spark.util.NextIterator * scan all keys in KeyToNumValuesStore to find keys that do match the predicate, * delete from key from KeyToNumValuesStore, delete values in KeyWithIndexToValueStore * - Remove state by condition on values: - * scan all [(key, index) -> value] in KeyWithIndexToValueStore to find values that match + * scan all elements in KeyWithIndexToValueStore to find values that match * the predicate, delete corresponding (key, indexToDelete) from KeyWithIndexToValueStore * by overwriting with the value of (key, 
maxIndex), and removing [(key, maxIndex), * decrement corresponding num values in KeyToNumValuesStore @@ -65,8 +72,9 @@ class SymmetricHashJoinStateManager( joinKeys: Seq[Expression], stateInfo: Option[StatefulOperatorStateInfo], storeConf: StateStoreConf, - hadoopConf: Configuration) extends Logging { - + hadoopConf: Configuration, + partitionId: Int, + stateFormatVersion: Int) extends Logging { import SymmetricHashJoinStateManager._ /* @@ -82,23 +90,46 @@ class SymmetricHashJoinStateManager( } /** Append a new value to the key */ - def append(key: UnsafeRow, value: UnsafeRow): Unit = { + def append(key: UnsafeRow, value: UnsafeRow, matched: Boolean): Unit = { val numExistingValues = keyToNumValues.get(key) - keyWithIndexToValue.put(key, numExistingValues, value) + keyWithIndexToValue.put(key, numExistingValues, value, matched) keyToNumValues.put(key, numExistingValues + 1) } + /** + * Get all the matched values for given join condition, with marking matched. + * This method is designed to mark joined rows properly without exposing internal index of row. + */ + def getJoinedRows( + key: UnsafeRow, + generateJoinedRow: InternalRow => JoinedRow, + predicate: JoinedRow => Boolean): Iterator[JoinedRow] = { + val numValues = keyToNumValues.get(key) + keyWithIndexToValue.getAll(key, numValues).map { keyIdxToValue => + val joinedRow = generateJoinedRow(keyIdxToValue.value) + if (predicate(joinedRow)) { + if (!keyIdxToValue.matched) { + keyWithIndexToValue.put(key, keyIdxToValue.valueIndex, keyIdxToValue.value, + matched = true) + } + joinedRow + } else { + null + } + }.filter(_ != null) + } + /** * Remove using a predicate on keys. * - * This produces an iterator over the (key, value) pairs satisfying condition(key), where the - * underlying store is updated as a side-effect of producing next. + * This produces an iterator over the (key, value, matched) tuples satisfying condition(key), + * where the underlying store is updated as a side-effect of producing next. 
* * This implies the iterator must be consumed fully without any other operations on this manager * or the underlying store being interleaved. */ - def removeByKeyCondition(removalCondition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { - new NextIterator[UnsafeRowPair] { + def removeByKeyCondition(removalCondition: UnsafeRow => Boolean): Iterator[KeyToValuePair] = { + new NextIterator[KeyToValuePair] { private val allKeyToNumValues = keyToNumValues.iterator @@ -107,15 +138,15 @@ class SymmetricHashJoinStateManager( private def currentKey = currentKeyToNumValue.key - private val reusedPair = new UnsafeRowPair() + private val reusedRet = new KeyToValuePair() - private def getAndRemoveValue() = { + private def getAndRemoveValue(): KeyToValuePair = { val keyWithIndexAndValue = currentValues.next() keyWithIndexToValue.remove(currentKey, keyWithIndexAndValue.valueIndex) - reusedPair.withRows(currentKey, keyWithIndexAndValue.value) + reusedRet.withNew(currentKey, keyWithIndexAndValue.value, keyWithIndexAndValue.matched) } - override def getNext(): UnsafeRowPair = { + override def getNext(): KeyToValuePair = { // If there are more values for the current key, remove and return the next one. if (currentValues != null && currentValues.hasNext) { return getAndRemoveValue() @@ -126,8 +157,7 @@ class SymmetricHashJoinStateManager( while (allKeyToNumValues.hasNext) { currentKeyToNumValue = allKeyToNumValues.next() if (removalCondition(currentKey)) { - currentValues = keyWithIndexToValue.getAll( - currentKey, currentKeyToNumValue.numValue) + currentValues = keyWithIndexToValue.getAll(currentKey, currentKeyToNumValue.numValue) keyToNumValues.remove(currentKey) if (currentValues.hasNext) { @@ -148,18 +178,18 @@ class SymmetricHashJoinStateManager( /** * Remove using a predicate on values. 
* - * At a high level, this produces an iterator over the (key, value) pairs such that value - * satisfies the predicate, where producing an element removes the value from the state store - * and producing all elements with a given key updates it accordingly. + * At a high level, this produces an iterator over the (key, value, matched) tuples such that + * value satisfies the predicate, where producing an element removes the value from the + * state store and producing all elements with a given key updates it accordingly. * * This implies the iterator must be consumed fully without any other operations on this manager * or the underlying store being interleaved. */ - def removeByValueCondition(removalCondition: UnsafeRow => Boolean): Iterator[UnsafeRowPair] = { - new NextIterator[UnsafeRowPair] { + def removeByValueCondition(removalCondition: UnsafeRow => Boolean): Iterator[KeyToValuePair] = { + new NextIterator[KeyToValuePair] { // Reuse this object to avoid creation+GC overhead. - private val reusedPair = new UnsafeRowPair() + private val reusedRet = new KeyToValuePair() private val allKeyToNumValues = keyToNumValues.iterator @@ -187,7 +217,7 @@ class SymmetricHashJoinStateManager( // Find the next value satisfying the condition, updating `currentKey` and `numValues` if // needed. Returns null when no value can be found. - private def findNextValueForIndex(): UnsafeRow = { + private def findNextValueForIndex(): ValueAndMatchPair = { // Loop across all values for the current key, and then all other keys, until we find a // value satisfying the removal condition. def hasMoreValuesForCurrentKey = currentKey != null && index < numValues @@ -195,9 +225,9 @@ class SymmetricHashJoinStateManager( while (hasMoreValuesForCurrentKey || hasMoreKeys) { if (hasMoreValuesForCurrentKey) { // First search the values for the current key. 
- val currentValue = keyWithIndexToValue.get(currentKey, index) - if (removalCondition(currentValue)) { - return currentValue + val valuePair = keyWithIndexToValue.get(currentKey, index) + if (removalCondition(valuePair.value)) { + return valuePair } else { index += 1 } @@ -219,7 +249,7 @@ class SymmetricHashJoinStateManager( return null } - override def getNext(): UnsafeRowPair = { + override def getNext(): KeyToValuePair = { val currentValue = findNextValueForIndex() // If there's no value, clean up and finish. There aren't any more available. @@ -233,8 +263,13 @@ class SymmetricHashJoinStateManager( // any hole. So we swap the last element into the hole and decrement numValues to shorten. // clean if (numValues > 1) { - val valueAtMaxIndex = keyWithIndexToValue.get(currentKey, numValues - 1) - keyWithIndexToValue.put(currentKey, index, valueAtMaxIndex) + val valuePairAtMaxIndex = keyWithIndexToValue.get(currentKey, numValues - 1) + if (valuePairAtMaxIndex != null) { + keyWithIndexToValue.put(currentKey, index, valuePairAtMaxIndex.value, + valuePairAtMaxIndex.matched) + } else { + keyWithIndexToValue.put(currentKey, index, null, false) + } keyWithIndexToValue.remove(currentKey, numValues - 1) } else { keyWithIndexToValue.remove(currentKey, 0) @@ -242,7 +277,7 @@ class SymmetricHashJoinStateManager( numValues -= 1 valueRemoved = true - return reusedPair.withRows(currentKey, currentValue) + return reusedRet.withNew(currentKey, currentValue.value, currentValue.matched) } override def close: Unit = {} @@ -294,7 +329,7 @@ class SymmetricHashJoinStateManager( joinKeys.zipWithIndex.map { case (k, i) => StructField(s"field$i", k.dataType, k.nullable) }) private val keyAttributes = keySchema.toAttributes private val keyToNumValues = new KeyToNumValuesStore() - private val keyWithIndexToValue = new KeyWithIndexToValueStore() + private val keyWithIndexToValue = new KeyWithIndexToValueStore(stateFormatVersion) // Clean up any state store resources if necessary at the end of 
the task Option(TaskContext.get()).foreach { _.addTaskCompletionListener[Unit] { _ => abortIfNeeded() } } @@ -322,7 +357,7 @@ class SymmetricHashJoinStateManager( /** Get the StateStore with the given schema */ protected def getStateStore(keySchema: StructType, valueSchema: StructType): StateStore = { val storeProviderId = StateStoreProviderId( - stateInfo.get, TaskContext.getPartitionId(), getStateStoreName(joinSide, stateStoreType)) + stateInfo.get, partitionId, getStateStoreName(joinSide, stateStoreType)) val store = StateStore.get( storeProviderId, keySchema, valueSchema, None, stateInfo.get.storeVersion, storeConf, hadoopConf) @@ -335,7 +370,7 @@ class SymmetricHashJoinStateManager( * Helper class for representing data returned by [[KeyWithIndexToValueStore]]. * Designed for object reuse. */ - private case class KeyAndNumValues(var key: UnsafeRow = null, var numValue: Long = 0) { + private class KeyAndNumValues(var key: UnsafeRow = null, var numValue: Long = 0) { def withNew(newKey: UnsafeRow, newNumValues: Long): this.type = { this.key = newKey this.numValue = newNumValues @@ -380,18 +415,105 @@ class SymmetricHashJoinStateManager( * Helper class for representing data returned by [[KeyWithIndexToValueStore]]. * Designed for object reuse. 
*/ - private case class KeyWithIndexAndValue( - var key: UnsafeRow = null, var valueIndex: Long = -1, var value: UnsafeRow = null) { - def withNew(newKey: UnsafeRow, newIndex: Long, newValue: UnsafeRow): this.type = { + private class KeyWithIndexAndValue( + var key: UnsafeRow = null, + var valueIndex: Long = -1, + var value: UnsafeRow = null, + var matched: Boolean = false) { + + def withNew( + newKey: UnsafeRow, + newIndex: Long, + newValue: UnsafeRow, + newMatched: Boolean): this.type = { this.key = newKey this.valueIndex = newIndex this.value = newValue + this.matched = newMatched this } + + def withNew( + newKey: UnsafeRow, + newIndex: Long, + newValue: ValueAndMatchPair): this.type = { + this.key = newKey + this.valueIndex = newIndex + if (newValue != null) { + this.value = newValue.value + this.matched = newValue.matched + } else { + this.value = null + this.matched = false + } + this + } + } + + private trait KeyWithIndexToValueRowConverter { + def valueAttributes: Seq[Attribute] + + def convertValue(value: UnsafeRow): ValueAndMatchPair + + def convertToValueRow(value: UnsafeRow, matched: Boolean): UnsafeRow + } + + private object KeyWithIndexToValueRowConverter { + def create(version: Int): KeyWithIndexToValueRowConverter = version match { + case 1 => new KeyWithIndexToValueRowConverterFormatV1() + case 2 => new KeyWithIndexToValueRowConverterFormatV2() + case _ => throw new IllegalArgumentException("Incorrect state format version! 
" + + s"version $version") + } + } + + private class KeyWithIndexToValueRowConverterFormatV1 extends KeyWithIndexToValueRowConverter { + override val valueAttributes: Seq[Attribute] = inputValueAttributes + + override def convertValue(value: UnsafeRow): ValueAndMatchPair = { + if (value != null) ValueAndMatchPair(value, false) else null + } + + override def convertToValueRow(value: UnsafeRow, matched: Boolean): UnsafeRow = value + } + + private class KeyWithIndexToValueRowConverterFormatV2 extends KeyWithIndexToValueRowConverter { + private val valueWithMatchedExprs = inputValueAttributes :+ Literal(true) + private val indexOrdinalInValueWithMatchedRow = inputValueAttributes.size + + private val valueWithMatchedRowGenerator = UnsafeProjection.create(valueWithMatchedExprs, + inputValueAttributes) + + override val valueAttributes: Seq[Attribute] = inputValueAttributes :+ + AttributeReference("matched", BooleanType)() + + // Projection to generate key row from (value + matched) row + private val valueRowGenerator = UnsafeProjection.create( + inputValueAttributes, valueAttributes) + + override def convertValue(value: UnsafeRow): ValueAndMatchPair = { + if (value != null) { + ValueAndMatchPair(valueRowGenerator(value), + value.getBoolean(indexOrdinalInValueWithMatchedRow)) + } else { + null + } + } + + override def convertToValueRow(value: UnsafeRow, matched: Boolean): UnsafeRow = { + val row = valueWithMatchedRowGenerator(value) + row.setBoolean(indexOrdinalInValueWithMatchedRow, matched) + row + } } - /** A wrapper around a [[StateStore]] that stores [(key, index) -> value]. */ - private class KeyWithIndexToValueStore extends StateStoreHandler(KeyWithIndexToValueType) { + /** + * A wrapper around a [[StateStore]] that stores the mapping; the mapping depends on the + * state format version - please refer implementations of [[KeyWithIndexToValueRowConverter]]. 
+ */ + private class KeyWithIndexToValueStore(stateFormatVersion: Int) + extends StateStoreHandler(KeyWithIndexToValueType) { + private val keyWithIndexExprs = keyAttributes :+ Literal(1L) private val keyWithIndexSchema = keySchema.add("index", LongType) private val indexOrdinalInKeyWithIndexRow = keyAttributes.size @@ -403,10 +525,13 @@ class SymmetricHashJoinStateManager( private val keyRowGenerator = UnsafeProjection.create( keyAttributes, keyAttributes :+ AttributeReference("index", LongType)()) - protected val stateStore = getStateStore(keyWithIndexSchema, inputValueAttributes.toStructType) + private val valueRowConverter = KeyWithIndexToValueRowConverter.create(stateFormatVersion) + + protected val stateStore = getStateStore(keyWithIndexSchema, + valueRowConverter.valueAttributes.toStructType) - def get(key: UnsafeRow, valueIndex: Long): UnsafeRow = { - stateStore.get(keyWithIndexRow(key, valueIndex)) + def get(key: UnsafeRow, valueIndex: Long): ValueAndMatchPair = { + valueRowConverter.convertValue(stateStore.get(keyWithIndexRow(key, valueIndex))) } /** @@ -423,8 +548,8 @@ class SymmetricHashJoinStateManager( null } else { val keyWithIndex = keyWithIndexRow(key, index) - val value = stateStore.get(keyWithIndex) - keyWithIndexAndValue.withNew(key, index, value) + val valuePair = valueRowConverter.convertValue(stateStore.get(keyWithIndex)) + keyWithIndexAndValue.withNew(key, index, valuePair) index += 1 keyWithIndexAndValue } @@ -435,9 +560,10 @@ class SymmetricHashJoinStateManager( } /** Put new value for key at the given index */ - def put(key: UnsafeRow, valueIndex: Long, value: UnsafeRow): Unit = { + def put(key: UnsafeRow, valueIndex: Long, value: UnsafeRow, matched: Boolean): Unit = { val keyWithIndex = keyWithIndexRow(key, valueIndex) - stateStore.put(keyWithIndex, value) + val valueWithMatched = valueRowConverter.convertToValueRow(value, matched) + stateStore.put(keyWithIndex, valueWithMatched) } /** @@ -460,8 +586,9 @@ class 
SymmetricHashJoinStateManager( def iterator: Iterator[KeyWithIndexAndValue] = { val keyWithIndexAndValue = new KeyWithIndexAndValue() stateStore.getRange(None, None).map { pair => + val valuePair = valueRowConverter.convertValue(pair.value) keyWithIndexAndValue.withNew( - keyRowGenerator(pair.key), pair.key.getLong(indexOrdinalInKeyWithIndexRow), pair.value) + keyRowGenerator(pair.key), pair.key.getLong(indexOrdinalInKeyWithIndexRow), valuePair) keyWithIndexAndValue } } @@ -476,6 +603,8 @@ class SymmetricHashJoinStateManager( } object SymmetricHashJoinStateManager { + val supportedVersions = Seq(1, 2) + val legacyVersion = 1 def allStateStoreNames(joinSides: JoinSide*): Seq[String] = { val allStateStoreTypes: Seq[StateStoreType] = Seq(KeyToNumValuesType, KeyWithIndexToValueType) @@ -497,4 +626,35 @@ object SymmetricHashJoinStateManager { private def getStateStoreName(joinSide: JoinSide, storeType: StateStoreType): String = { s"$joinSide-$storeType" } + + /** Helper class for representing data (value, matched). */ + case class ValueAndMatchPair(value: UnsafeRow, matched: Boolean) + + /** + * Helper class for representing data key to (value, matched). + * Designed for object reuse. 
+ */ + case class KeyToValuePair( + var key: UnsafeRow = null, + var value: UnsafeRow = null, + var matched: Boolean = false) { + def withNew(newKey: UnsafeRow, newValue: UnsafeRow, newMatched: Boolean): this.type = { + this.key = newKey + this.value = newValue + this.matched = newMatched + this + } + + def withNew(newKey: UnsafeRow, newValue: ValueAndMatchPair): this.type = { + this.key = newKey + if (newValue != null) { + this.value = newValue.value + this.matched = newValue.matched + } else { + this.value = null + this.matched = false + } + this + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala index d689a6f3c9819..1bec924ba219a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala @@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateUnsafeProjection, Predicate} +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ @@ -156,17 +156,17 @@ trait WatermarkSupport extends UnaryExecNode { } /** Predicate based on keys that matches data older than the watermark */ - lazy val watermarkPredicateForKeys: Option[Predicate] = watermarkExpression.flatMap { e => + lazy val watermarkPredicateForKeys: Option[BasePredicate] = watermarkExpression.flatMap { e => if 
(keyExpressions.exists(_.metadata.contains(EventTimeWatermark.delayKey))) { - Some(newPredicate(e, keyExpressions)) + Some(Predicate.create(e, keyExpressions)) } else { None } } /** Predicate based on the child output that matches data older than the watermark. */ - lazy val watermarkPredicateForData: Option[Predicate] = - watermarkExpression.map(newPredicate(_, child.output)) + lazy val watermarkPredicateForData: Option[BasePredicate] = + watermarkExpression.map(Predicate.create(_, child.output)) protected def removeKeysOlderThanWatermark(store: StateStore): Unit = { if (watermarkPredicateForKeys.nonEmpty) { @@ -353,6 +353,7 @@ case class StateStoreSaveExec( finished = true null } else { + numOutputRows += 1 removedValueRow } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingGlobalLimitExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala similarity index 68% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingGlobalLimitExec.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala index bf4af60c8cf03..b19540253d7eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingGlobalLimitExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/streamingLimits.scala @@ -20,21 +20,21 @@ import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow, SortOrder, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, 
Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes -import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.{LimitExec, SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} -import org.apache.spark.util.CompletionIterator +import org.apache.spark.util.{CompletionIterator, NextIterator} /** * A physical operator for executing a streaming limit, which makes sure no more than streamLimit - * rows are returned. This operator is meant for streams in Append mode only. + * rows are returned. This physical operator is only meant for logical limit operations that + * will get a input stream of rows that are effectively appends. For example, + * - limit on any query in append mode + * - limit before the aggregation in a streaming aggregation query complete mode */ case class StreamingGlobalLimitExec( streamLimit: Long, @@ -49,9 +49,6 @@ case class StreamingGlobalLimitExec( override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver - assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, - "StreamingGlobalLimitExec is only valid for streams in Append output mode") - child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, @@ -100,3 +97,41 @@ case class StreamingGlobalLimitExec( UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } } + + +/** + * A physical operator for executing limits locally on each partition. The main difference from + * LocalLimitExec is that this will fully consume `child` plan's iterators to ensure that any + * stateful operation within `child` commits all the state changes (many stateful operations + * commit state changes only after the iterator is consumed). 
+ */ +case class StreamingLocalLimitExec(limit: Int, child: SparkPlan) + extends LimitExec { + + override def doExecute(): RDD[InternalRow] = child.execute().mapPartitions { iter => + + var generatedCount = 0 + + new NextIterator[InternalRow]() { + override protected def getNext(): InternalRow = { + if (generatedCount < limit && iter.hasNext) { + generatedCount += 1 + iter.next() + } else { + finished = true + null + } + } + + override protected def close(): Unit = { + while (iter.hasNext) iter.next() // consume the iterator completely + } + } + } + + override def outputOrdering: Seq[SortOrder] = child.outputOrdering + + override def outputPartitioning: Partitioning = child.outputPartitioning + + override def output: Seq[Attribute] = child.output +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala index 22e3f8e035991..c2270c57eb941 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala @@ -172,13 +172,13 @@ case class InSubqueryExec( } /** - * Plans scalar subqueries from that are present in the given [[SparkPlan]]. + * Plans subqueries that are present in the given [[SparkPlan]]. 
*/ case class PlanSubqueries(sparkSession: SparkSession) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { plan.transformAllExpressions { case subquery: expressions.ScalarSubquery => - val executedPlan = new QueryExecution(sparkSession, subquery.plan).executedPlan + val executedPlan = QueryExecution.prepareExecutedPlan(sparkSession, subquery.plan) ScalarSubquery( SubqueryExec(s"scalar-subquery#${subquery.exprId.id}", executedPlan), subquery.exprId) @@ -192,8 +192,8 @@ case class PlanSubqueries(sparkSession: SparkSession) extends Rule[SparkPlan] { } ) } - val executedPlan = new QueryExecution(sparkSession, query).executedPlan - InSubqueryExec(expr, SubqueryExec(s"subquery${exprId.id}", executedPlan), exprId) + val executedPlan = QueryExecution.prepareExecutedPlan(sparkSession, query) + InSubqueryExec(expr, SubqueryExec(s"subquery#${exprId.id}", executedPlan), exprId) } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala index ec0577283265d..e1ff90a2c20e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/AllExecutionsPage.scala @@ -324,7 +324,15 @@ private[ui] class ExecutionPagedTable( - {header} + {if (header == "Duration") { + + {header} + + } else { + {header} + }} } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index 875086cda258d..91360e0e50314 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -116,7 +116,7 @@ class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") with Logging {metadata}
    {planVisualizationResources(request)} - + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 2c4a7eacdf10b..1454cc05ed4da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -16,10 +16,12 @@ */ package org.apache.spark.sql.execution.ui -import java.util.{Date, NoSuchElementException} +import java.util.{Arrays, Date, NoSuchElementException} import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ +import scala.collection.mutable import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.internal.Logging @@ -29,6 +31,7 @@ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric._ import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.status.{ElementTrackingStore, KVUtils, LiveEntity} +import org.apache.spark.util.collection.OpenHashMap class SQLAppStatusListener( conf: SparkConf, @@ -50,7 +53,7 @@ class SQLAppStatusListener( liveExecutions.isEmpty && stageMetrics.isEmpty } - kvstore.addTrigger(classOf[SQLExecutionUIData], conf.get(UI_RETAINED_EXECUTIONS)) { count => + kvstore.addTrigger(classOf[SQLExecutionUIData], conf.get[Int](UI_RETAINED_EXECUTIONS)) { count => cleanupExecutions(count) } @@ -92,7 +95,7 @@ class SQLAppStatusListener( executionData.jobs = sqlStoreData.jobs executionData.stages = sqlStoreData.stages executionData.metricsValues = sqlStoreData.metricValues - executionData.endEvents = sqlStoreData.jobs.size + 1 + executionData.endEvents.set(sqlStoreData.jobs.size + 1) liveExecutions.put(executionId, executionData) Some(executionData) } catch { @@ -100,11 +103,14 @@ class SQLAppStatusListener( } 
}.getOrElse(getOrCreateExecution(executionId)) - // Record the accumulator IDs for the stages of this job, so that the code that keeps - // track of the metrics knows which accumulators to look at. - val accumIds = exec.metrics.map(_.accumulatorId).toSet - event.stageIds.foreach { id => - stageMetrics.put(id, new LiveStageMetrics(id, 0, accumIds, new ConcurrentHashMap())) + // Record the accumulator IDs and metric types for the stages of this job, so that the code + // that keeps track of the metrics knows which accumulators to look at. + val accumIdsAndType = exec.metrics.map { m => (m.accumulatorId, m.metricType) }.toMap + if (accumIdsAndType.nonEmpty) { + event.stageInfos.foreach { stage => + stageMetrics.put(stage.stageId, new LiveStageMetrics(stage.stageId, 0, + stage.numTasks, accumIdsAndType)) + } } exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) @@ -118,9 +124,12 @@ class SQLAppStatusListener( } // Reset the metrics tracking object for the new attempt. - Option(stageMetrics.get(event.stageInfo.stageId)).foreach { metrics => - metrics.taskMetrics.clear() - metrics.attemptId = event.stageInfo.attemptNumber + Option(stageMetrics.get(event.stageInfo.stageId)).foreach { stage => + if (stage.attemptId != event.stageInfo.attemptNumber) { + stageMetrics.put(event.stageInfo.stageId, + new LiveStageMetrics(event.stageInfo.stageId, event.stageInfo.attemptNumber, + stage.numTasks, stage.accumIdsToMetricType)) + } } } @@ -132,7 +141,7 @@ class SQLAppStatusListener( case _ => JobExecutionStatus.FAILED } exec.jobs = exec.jobs + (event.jobId -> result) - exec.endEvents += 1 + exec.endEvents.incrementAndGet() update(exec) } } @@ -140,7 +149,16 @@ class SQLAppStatusListener( override def onExecutorMetricsUpdate(event: SparkListenerExecutorMetricsUpdate): Unit = { event.accumUpdates.foreach { case (taskId, stageId, attemptId, accumUpdates) => - updateStageMetrics(stageId, attemptId, taskId, accumUpdates, false) + updateStageMetrics(stageId, attemptId, taskId, 
SQLAppStatusListener.UNKNOWN_INDEX, + accumUpdates, false) + } + } + + override def onTaskStart(event: SparkListenerTaskStart): Unit = { + Option(stageMetrics.get(event.stageId)).foreach { stage => + if (stage.attemptId == event.stageAttemptId) { + stage.registerTask(event.taskInfo.taskId, event.taskInfo.index) + } } } @@ -165,7 +183,7 @@ class SQLAppStatusListener( } else { info.accumulables } - updateStageMetrics(event.stageId, event.stageAttemptId, info.taskId, accums, + updateStageMetrics(event.stageId, event.stageAttemptId, info.taskId, info.index, accums, info.successful) } @@ -181,17 +199,64 @@ class SQLAppStatusListener( private def aggregateMetrics(exec: LiveExecutionData): Map[Long, String] = { val metricTypes = exec.metrics.map { m => (m.accumulatorId, m.metricType) }.toMap - val metrics = exec.stages.toSeq + + val liveStageMetrics = exec.stages.toSeq .flatMap { stageId => Option(stageMetrics.get(stageId)) } - .flatMap(_.taskMetrics.values().asScala) - .flatMap { metrics => metrics.ids.zip(metrics.values) } - - val aggregatedMetrics = (metrics ++ exec.driverAccumUpdates.toSeq) - .filter { case (id, _) => metricTypes.contains(id) } - .groupBy(_._1) - .map { case (id, values) => - id -> SQLMetrics.stringValue(metricTypes(id), values.map(_._2)) + + val taskMetrics = liveStageMetrics.flatMap(_.metricValues()) + + val maxMetrics = liveStageMetrics.flatMap(_.maxMetricValues()) + + val allMetrics = new mutable.HashMap[Long, Array[Long]]() + + val maxMetricsFromAllStages = new mutable.HashMap[Long, Array[Long]]() + + taskMetrics.foreach { case (id, values) => + val prev = allMetrics.getOrElse(id, null) + val updated = if (prev != null) { + prev ++ values + } else { + values + } + allMetrics(id) = updated + } + + // Find the max for each metric id between all stages. 
+ maxMetrics.foreach { case (id, value, taskId, stageId, attemptId) => + val updated = maxMetricsFromAllStages.getOrElse(id, Array(value, stageId, attemptId, taskId)) + if (value > updated(0)) { + updated(0) = value + updated(1) = stageId + updated(2) = attemptId + updated(3) = taskId } + maxMetricsFromAllStages(id) = updated + } + + exec.driverAccumUpdates.foreach { case (id, value) => + if (metricTypes.contains(id)) { + val prev = allMetrics.getOrElse(id, null) + val updated = if (prev != null) { + // If the driver updates same metrics as tasks and has higher value then remove + // that entry from maxMetricsFromAllStage. This would make stringValue function default + // to "driver" that would be displayed on UI. + if (maxMetricsFromAllStages.contains(id) && value > maxMetricsFromAllStages(id)(0)) { + maxMetricsFromAllStages.remove(id) + } + val _copy = Arrays.copyOf(prev, prev.length + 1) + _copy(prev.length) = value + _copy + } else { + Array(value) + } + allMetrics(id) = updated + } + } + + val aggregatedMetrics = allMetrics.map { case (id, values) => + id -> SQLMetrics.stringValue(metricTypes(id), values, maxMetricsFromAllStages.getOrElse(id, + Array.empty[Long])) + }.toMap // Check the execution again for whether the aggregated metrics data has been calculated. 
// This can happen if the UI is requesting this data, and the onExecutionEnd handler is @@ -208,43 +273,13 @@ class SQLAppStatusListener( stageId: Int, attemptId: Int, taskId: Long, + taskIdx: Int, accumUpdates: Seq[AccumulableInfo], succeeded: Boolean): Unit = { Option(stageMetrics.get(stageId)).foreach { metrics => - if (metrics.attemptId != attemptId || metrics.accumulatorIds.isEmpty) { - return - } - - val oldTaskMetrics = metrics.taskMetrics.get(taskId) - if (oldTaskMetrics != null && oldTaskMetrics.succeeded) { - return - } - - val updates = accumUpdates - .filter { acc => acc.update.isDefined && metrics.accumulatorIds.contains(acc.id) } - .sortBy(_.id) - - if (updates.isEmpty) { - return + if (metrics.attemptId == attemptId) { + metrics.updateTaskMetrics(taskId, taskIdx, succeeded, accumUpdates) } - - val ids = new Array[Long](updates.size) - val values = new Array[Long](updates.size) - updates.zipWithIndex.foreach { case (acc, idx) => - ids(idx) = acc.id - // In a live application, accumulators have Long values, but when reading from event - // logs, they have String values. For now, assume all accumulators are Long and covert - // accordingly. - values(idx) = acc.update.get match { - case s: String => s.toLong - case l: Long => l - case o => throw new IllegalArgumentException(s"Unexpected: $o") - } - } - - // TODO: storing metrics by task ID can cause metrics for the same task index to be - // counted multiple times, for example due to speculation or re-attempts. 
- metrics.taskMetrics.put(taskId, new LiveTaskMetrics(ids, values, succeeded)) } } @@ -309,15 +344,29 @@ class SQLAppStatusListener( update(exec) } + private def onAdaptiveSQLMetricUpdate(event: SparkListenerSQLAdaptiveSQLMetricUpdates): Unit = { + val SparkListenerSQLAdaptiveSQLMetricUpdates(executionId, sqlPlanMetrics) = event + + val exec = getOrCreateExecution(executionId) + exec.metrics = exec.metrics ++ sqlPlanMetrics + update(exec) + } + private def onExecutionEnd(event: SparkListenerSQLExecutionEnd): Unit = { val SparkListenerSQLExecutionEnd(executionId, time) = event Option(liveExecutions.get(executionId)).foreach { exec => - exec.metricsValues = aggregateMetrics(exec) exec.completionTime = Some(new Date(time)) - exec.endEvents += 1 update(exec) - removeStaleMetricsData(exec) + // Aggregating metrics can be expensive for large queries, so do it asynchronously. The end + // event count is updated after the metrics have been aggregated, to prevent a job end event + // arriving during aggregation from cleaning up the metrics data. 
+ kvstore.doAsync { + exec.metricsValues = aggregateMetrics(exec) + removeStaleMetricsData(exec) + exec.endEvents.incrementAndGet() + update(exec, force = true) + } } } @@ -342,6 +391,7 @@ class SQLAppStatusListener( override def onOtherEvent(event: SparkListenerEvent): Unit = event match { case e: SparkListenerSQLExecutionStart => onExecutionStart(e) case e: SparkListenerSQLAdaptiveExecutionUpdate => onAdaptiveExecutionUpdate(e) + case e: SparkListenerSQLAdaptiveSQLMetricUpdates => onAdaptiveSQLMetricUpdate(e) case e: SparkListenerSQLExecutionEnd => onExecutionEnd(e) case e: SparkListenerDriverAccumUpdates => onDriverAccumUpdates(e) case _ => // Ignore @@ -354,7 +404,7 @@ class SQLAppStatusListener( private def update(exec: LiveExecutionData, force: Boolean = false): Unit = { val now = System.nanoTime() - if (exec.endEvents >= exec.jobs.size + 1) { + if (exec.endEvents.get() >= exec.jobs.size + 1) { exec.write(kvstore, now) removeStaleMetricsData(exec) liveExecutions.remove(exec.executionId) @@ -406,7 +456,7 @@ private class LiveExecutionData(val executionId: Long) extends LiveEntity { // Just in case job end and execution end arrive out of order, keep track of how many // end events arrived so that the listener can stop tracking the execution. - var endEvents = 0 + val endEvents = new AtomicInteger() override protected def doUpdate(): Any = { new SQLExecutionUIData( @@ -426,11 +476,94 @@ private class LiveExecutionData(val executionId: Long) extends LiveEntity { private class LiveStageMetrics( val stageId: Int, - var attemptId: Int, - val accumulatorIds: Set[Long], - val taskMetrics: ConcurrentHashMap[Long, LiveTaskMetrics]) - -private class LiveTaskMetrics( - val ids: Array[Long], - val values: Array[Long], - val succeeded: Boolean) + val attemptId: Int, + val numTasks: Int, + val accumIdsToMetricType: Map[Long, String]) { + + /** + * Mapping of task IDs to their respective index. 
Note this may contain more elements than the + * stage's number of tasks, if speculative execution is on. + */ + private val taskIndices = new OpenHashMap[Long, Int]() + + /** Bit set tracking which indices have been successfully computed. */ + private val completedIndices = new mutable.BitSet() + + /** + * Task metrics values for the stage. Maps the metric ID to the metric values for each + * index. For each metric ID, there will be the same number of values as the number + * of indices. This relies on `SQLMetrics.stringValue` treating 0 as a neutral value, + * independent of the actual metric type. + */ + private val taskMetrics = new ConcurrentHashMap[Long, Array[Long]]() + + private val metricsIdToMaxTaskValue = new ConcurrentHashMap[Long, Array[Long]]() + + def registerTask(taskId: Long, taskIdx: Int): Unit = { + taskIndices.update(taskId, taskIdx) + } + + def updateTaskMetrics( + taskId: Long, + eventIdx: Int, + finished: Boolean, + accumUpdates: Seq[AccumulableInfo]): Unit = { + val taskIdx = if (eventIdx == SQLAppStatusListener.UNKNOWN_INDEX) { + if (!taskIndices.contains(taskId)) { + // We probably missed the start event for the task, just ignore it. + return + } + taskIndices(taskId) + } else { + // Here we can recover from a missing task start event. Just register the task again. + registerTask(taskId, eventIdx) + eventIdx + } + + if (completedIndices.contains(taskIdx)) { + return + } + + accumUpdates + .filter { acc => acc.update.isDefined && accumIdsToMetricType.contains(acc.id) } + .foreach { acc => + // In a live application, accumulators have Long values, but when reading from event + // logs, they have String values. For now, assume all accumulators are Long and convert + // accordingly. 
+ val value = acc.update.get match { + case s: String => s.toLong + case l: Long => l + case o => throw new IllegalArgumentException(s"Unexpected: $o") + } + + val metricValues = taskMetrics.computeIfAbsent(acc.id, _ => new Array(numTasks)) + metricValues(taskIdx) = value + + if (SQLMetrics.metricNeedsMax(accumIdsToMetricType(acc.id))) { + val maxMetricsTaskId = metricsIdToMaxTaskValue.computeIfAbsent(acc.id, _ => Array(value, + taskId)) + + if (value > maxMetricsTaskId.head) { + maxMetricsTaskId(0) = value + maxMetricsTaskId(1) = taskId + } + } + } + if (finished) { + completedIndices += taskIdx + } + } + + def metricValues(): Seq[(Long, Array[Long])] = taskMetrics.asScala.toSeq + + // Return Seq of metric id, value, taskId, stageId, attemptId for this stage + def maxMetricValues(): Seq[(Long, Long, Long, Int, Int)] = { + metricsIdToMaxTaskValue.asScala.toSeq.map { case (id, maxMetrics) => (id, maxMetrics(0), + maxMetrics(1), stageId, attemptId) + } + } +} + +private object SQLAppStatusListener { + val UNKNOWN_INDEX = -1 +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index 241001a857c8f..a90f37a80d525 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -42,6 +42,10 @@ class SQLAppStatusStore( store.view(classOf[SQLExecutionUIData]).asScala.toSeq } + def executionsList(offset: Int, length: Int): Seq[SQLExecutionUIData] = { + store.view(classOf[SQLExecutionUIData]).skip(offset).max(length).asScala.toSeq + } + def execution(executionId: Long): Option[SQLExecutionUIData] = { try { Some(store.read(classOf[SQLExecutionUIData], executionId)) @@ -133,7 +137,7 @@ class SparkPlanGraphNodeWrapper( val cluster: SparkPlanGraphClusterWrapper) { def toSparkPlanGraphNode(): SparkPlanGraphNode = { - assert(node == 
null ^ cluster == null, "One and only of of nore or cluster must be set.") + assert(node == null ^ cluster == null, "Exactly one of node, cluster values to be set.") if (node != null) node else cluster.toSparkPlanGraphCluster() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala index 522d0cf79bffa..5bf1ce5eb8a90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala @@ -33,4 +33,7 @@ class SQLHistoryServerPlugin extends AppHistoryServerPlugin { new SQLTab(sqlStatusStore, ui) } } + + override def displayOrder: Int = 0 } + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index 81cbc7f54c7eb..6a6a71c46f213 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -34,6 +34,12 @@ case class SparkListenerSQLAdaptiveExecutionUpdate( sparkPlanInfo: SparkPlanInfo) extends SparkListenerEvent +@DeveloperApi +case class SparkListenerSQLAdaptiveSQLMetricUpdates( + executionId: Long, + sqlPlanMetrics: Seq[SQLPlanMetric]) + extends SparkListenerEvent + @DeveloperApi case class SparkListenerSQLExecutionStart( executionId: Long, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala index f898236c537a8..d31d77840b802 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala @@ -78,7 +78,7 @@ object SparkPlanGraph { subgraph: 
SparkPlanGraphCluster, exchanges: mutable.HashMap[SparkPlanInfo, SparkPlanGraphNode]): Unit = { planInfo.nodeName match { - case "WholeStageCodegen" => + case name if name.startsWith("WholeStageCodegen") => val metrics = planInfo.metrics.map { metric => SQLPlanMetric(metric.name, metric.accumulatorId, metric.metricType) } @@ -175,9 +175,12 @@ private[ui] class SparkPlanGraphNode( // SparkPlan and metrics. If removing it, it won't display the empty line in UI. builder ++= "\n \n" builder ++= values.mkString("\n") + s""" $id [label="${StringEscapeUtils.escapeJava(builder.toString())}"];""" + } else { + // SPARK-30684: when there is no metrics, add empty lines to increase the height of the node, + // so that there won't be gaps between an edge and a small node. + s""" $id [labelType="html" label="
    $name

    "];""" } - - s""" $id [label="${StringEscapeUtils.escapeJava(builder.toString())}"];""" } } @@ -197,8 +200,8 @@ private[ui] class SparkPlanGraphCluster( val labelStr = if (duration.nonEmpty) { require(duration.length == 1) val id = duration(0).accumulatorId - if (metricsValue.contains(duration(0).accumulatorId)) { - name + "\n\n" + metricsValue(id) + if (metricsValue.contains(id)) { + name + "\n \n" + duration(0).name + ": " + metricsValue(id) } else { name } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala index 89f6edda2ef57..d191f3790ffa8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExec.scala @@ -123,7 +123,7 @@ case class WindowExec( var nextRow: UnsafeRow = null var nextGroup: UnsafeRow = null var nextRowAvailable: Boolean = false - private[this] def fetchNextRow() { + private[this] def fetchNextRow(): Unit = { nextRowAvailable = stream.hasNext if (nextRowAvailable) { nextRow = stream.next().asInstanceOf[UnsafeRow] @@ -144,7 +144,7 @@ case class WindowExec( val windowFunctionResult = new SpecificInternalRow(expressions.map(_.dataType)) val frames = factories.map(_(windowFunctionResult)) val numFrames = frames.length - private[this] def fetchNextPartition() { + private[this] def fetchNextPartition(): Unit = { // Collect all the rows in the current partition. // Before we start to fetch new input rows, make a copy of nextGroup. 
val currentGroup = nextGroup.copy() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala index dcb86f48bdf32..d5d11c45f8535 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/window/WindowExecBase.scala @@ -73,7 +73,7 @@ abstract class WindowExecBase( RowBoundOrdering(offset) case (RangeFrame, CurrentRow) => - val ordering = newOrdering(orderSpec, child.output) + val ordering = RowOrdering.create(orderSpec, child.output) RangeBoundOrdering(ordering, IdentityProjection, IdentityProjection) case (RangeFrame, offset: Expression) if orderSpec.size == 1 => @@ -82,7 +82,7 @@ abstract class WindowExecBase( val expr = sortExpr.child // Create the projection which returns the current 'value'. - val current = newMutableProjection(expr :: Nil, child.output) + val current = MutableProjection.create(expr :: Nil, child.output) // Flip the sign of the offset when processing the order is descending val boundOffset = sortExpr.direction match { @@ -97,13 +97,13 @@ abstract class WindowExecBase( TimeAdd(expr, boundOffset, Some(timeZone)) case (a, b) if a == b => Add(expr, boundOffset) } - val bound = newMutableProjection(boundExpr :: Nil, child.output) + val bound = MutableProjection.create(boundExpr :: Nil, child.output) // Construct the ordering. This is used to compare the result of current value projection // to the result of bound value projection. This is done manually because we want to use // Code Generation (if it is enabled). 
val boundSortExprs = sortExpr.copy(BoundReference(0, expr.dataType, expr.nullable)) :: Nil - val ordering = newOrdering(boundSortExprs, Nil) + val ordering = RowOrdering.create(boundSortExprs, Nil) RangeBoundOrdering(ordering, current, bound) case (RangeFrame, _) => @@ -136,7 +136,7 @@ abstract class WindowExecBase( case e @ WindowExpression(function, spec) => val frame = spec.frameSpecification.asInstanceOf[SpecifiedWindowFrame] function match { - case AggregateExpression(f, _, _, _) => collect("AGGREGATE", frame, e, f) + case AggregateExpression(f, _, _, _, _) => collect("AGGREGATE", frame, e, f) case f: AggregateWindowFunction => collect("AGGREGATE", frame, e, f) case f: OffsetWindowFunction => collect("OFFSET", frame, e, f) case f: PythonUDF => collect("AGGREGATE", frame, e, f) @@ -167,7 +167,7 @@ abstract class WindowExecBase( ordinal, child.output, (expressions, schema) => - newMutableProjection(expressions, schema, subexpressionEliminationEnabled)) + MutableProjection.create(expressions, schema)) } // Create the factory @@ -182,7 +182,7 @@ abstract class WindowExecBase( functions.map(_.asInstanceOf[OffsetWindowFunction]), child.output, (expressions, schema) => - newMutableProjection(expressions, schema, subexpressionEliminationEnabled), + MutableProjection.create(expressions, schema), offset) // Entire Partition Frame. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 0c956ecbf936e..85b2cd379ba24 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -17,10 +17,15 @@ package org.apache.spark.sql.expressions -import org.apache.spark.annotation.Stable -import org.apache.spark.sql.Column +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.annotation.{Experimental, Stable} +import org.apache.spark.sql.{Column, Encoder} import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} +import org.apache.spark.sql.execution.aggregate.ScalaAggregator import org.apache.spark.sql.types.{AnyDataType, DataType} /** @@ -136,3 +141,42 @@ private[sql] case class SparkUserDefinedFunction( } } } + +private[sql] case class UserDefinedAggregator[IN, BUF, OUT]( + aggregator: Aggregator[IN, BUF, OUT], + inputEncoder: Encoder[IN], + name: Option[String] = None, + nullable: Boolean = true, + deterministic: Boolean = true) extends UserDefinedFunction { + + @scala.annotation.varargs + def apply(exprs: Column*): Column = { + Column(AggregateExpression(scalaAggregator(exprs.map(_.expr)), Complete, isDistinct = false)) + } + + // This is also used by udf.register(...) 
when it detects a UserDefinedAggregator + def scalaAggregator(exprs: Seq[Expression]): ScalaAggregator[IN, BUF, OUT] = { + val iEncoder = inputEncoder.asInstanceOf[ExpressionEncoder[IN]] + ScalaAggregator(exprs, aggregator, iEncoder, nullable, deterministic) + } + + override def withName(name: String): UserDefinedAggregator[IN, BUF, OUT] = { + copy(name = Option(name)) + } + + override def asNonNullable(): UserDefinedAggregator[IN, BUF, OUT] = { + if (!nullable) { + this + } else { + copy(nullable = false) + } + } + + override def asNondeterministic(): UserDefinedAggregator[IN, BUF, OUT] = { + if (!deterministic) { + this + } else { + copy(deterministic = false) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala index cd1c198ddebf0..d13baaedbaeff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.expressions import org.apache.spark.annotation.Stable import org.apache.spark.sql.Column -import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{WindowSpec => _, _} /** * Utility functions for defining window in DataFrames. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala index 4e8cb3a6ddd66..8407b1419af62 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/udaf.scala @@ -27,8 +27,12 @@ import org.apache.spark.sql.types._ * The base class for implementing user-defined aggregate functions (UDAF). * * @since 1.5.0 + * @deprecated UserDefinedAggregateFunction is deprecated. + * Aggregator[IN, BUF, OUT] should now be registered as a UDF via the functions.udaf(agg) method. 
*/ @Stable +@deprecated("Aggregator[IN, BUF, OUT] should now be registered as a UDF" + + " via the functions.udaf(agg) method.", "3.0.0") abstract class UserDefinedAggregateFunction extends Serializable { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 6b8127bab1cb4..2d5504ac00ffa 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -31,13 +31,13 @@ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, HintInfo, ResolvedHint} +import org.apache.spark.sql.catalyst.util.TimestampFormatter import org.apache.spark.sql.execution.SparkSqlParser -import org.apache.spark.sql.expressions.{SparkUserDefinedFunction, UserDefinedFunction} +import org.apache.spark.sql.expressions.{Aggregator, SparkUserDefinedFunction, UserDefinedAggregator, UserDefinedFunction} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils - /** * Commonly used functions available for DataFrame operations. Using functions defined here provides * a little bit more compile-time safety to make sure the function exists. @@ -69,6 +69,7 @@ import org.apache.spark.util.Utils * @groupname window_funcs Window functions * @groupname string_funcs String functions * @groupname collection_funcs Collection functions + * @groupname partition_transforms Partition transform functions * @groupname Ungrouped Support functions for DataFrames * @since 1.3.0 */ @@ -272,7 +273,7 @@ object functions { * Aggregate function: returns a list of objects with duplicates. 
* * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -283,7 +284,7 @@ object functions { * Aggregate function: returns a list of objects with duplicates. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -294,7 +295,7 @@ object functions { * Aggregate function: returns a set of objects with duplicate elements eliminated. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -305,7 +306,7 @@ object functions { * Aggregate function: returns a set of objects with duplicate elements eliminated. * * @note The function is non-deterministic because the order of collected results depends - * on order of rows which may be non-deterministic after a shuffle. + * on the order of the rows which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.6.0 @@ -423,8 +424,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. 
* * @group agg_funcs * @since 2.0.0 @@ -439,8 +440,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -455,8 +456,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -469,8 +470,8 @@ object functions { * The function by default returns the first values it sees. It will return the first non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -548,8 +549,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. 
* - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -564,8 +565,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 2.0.0 @@ -580,8 +581,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. * * @group agg_funcs * @since 1.3.0 @@ -594,8 +595,8 @@ object functions { * The function by default returns the last values it sees. It will return the last non-null * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. * - * @note The function is non-deterministic because its results depends on order of rows which - * may be non-deterministic after a shuffle. + * @note The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. 
* * @group agg_funcs * @since 1.3.0 @@ -2521,25 +2522,25 @@ object functions { } /** - * Overlay the specified portion of `src` with `replaceString`, - * starting from byte position `pos` of `inputString` and proceeding for `len` bytes. + * Overlay the specified portion of `src` with `replace`, + * starting from byte position `pos` of `src` and proceeding for `len` bytes. * * @group string_funcs * @since 3.0.0 */ - def overlay(src: Column, replaceString: String, pos: Int, len: Int): Column = withExpr { - Overlay(src.expr, lit(replaceString).expr, lit(pos).expr, lit(len).expr) + def overlay(src: Column, replace: Column, pos: Column, len: Column): Column = withExpr { + Overlay(src.expr, replace.expr, pos.expr, len.expr) } /** - * Overlay the specified portion of `src` with `replaceString`, - * starting from byte position `pos` of `inputString`. + * Overlay the specified portion of `src` with `replace`, + * starting from byte position `pos` of `src`. * * @group string_funcs * @since 3.0.0 */ - def overlay(src: Column, replaceString: String, pos: Int): Column = withExpr { - new Overlay(src.expr, lit(replaceString).expr, lit(pos).expr) + def overlay(src: Column, replace: Column, pos: Column): Column = withExpr { + new Overlay(src.expr, replace.expr, pos.expr) } /** @@ -2634,8 +2635,8 @@ object functions { * See [[java.time.format.DateTimeFormatter]] for valid date and time format patterns * * @param dateExpr A date, timestamp or string. 
If a string, the data must be in a format that - * can be cast to a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` - * @param format A pattern `dd.MM.yyyy` would return a string like `18.03.1993` + * can be cast to a timestamp, such as `uuuu-MM-dd` or `uuuu-MM-dd HH:mm:ss.SSSS` + * @param format A pattern `dd.MM.uuuu` would return a string like `18.03.1993` * @return A string, or null if `dateExpr` was a string that could not be cast to a timestamp * @note Use specialized functions like [[year]] whenever possible as they benefit from a * specialized implementation. @@ -2881,7 +2882,7 @@ object functions { * @since 1.5.0 */ def from_unixtime(ut: Column): Column = withExpr { - FromUnixTime(ut.expr, Literal("uuuu-MM-dd HH:mm:ss")) + FromUnixTime(ut.expr, Literal(TimestampFormatter.defaultPattern)) } /** @@ -2913,7 +2914,7 @@ object functions { * @since 1.5.0 */ def unix_timestamp(): Column = withExpr { - UnixTimestamp(CurrentTimestamp(), Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(CurrentTimestamp(), Literal(TimestampFormatter.defaultPattern)) } /** @@ -2927,7 +2928,7 @@ object functions { * @since 1.5.0 */ def unix_timestamp(s: Column): Column = withExpr { - UnixTimestamp(s.expr, Literal("uuuu-MM-dd HH:mm:ss")) + UnixTimestamp(s.expr, Literal(TimestampFormatter.defaultPattern)) } /** @@ -3053,7 +3054,6 @@ object functions { * @group datetime_funcs * @since 1.5.0 */ - @deprecated("This function is deprecated and will be removed in future versions.", "3.0.0") def from_utc_timestamp(ts: Column, tz: String): Column = withExpr { FromUTCTimestamp(ts.expr, Literal(tz)) } @@ -3065,7 +3065,6 @@ object functions { * @group datetime_funcs * @since 2.4.0 */ - @deprecated("This function is deprecated and will be removed in future versions.", "3.0.0") def from_utc_timestamp(ts: Column, tz: Column): Column = withExpr { FromUTCTimestamp(ts.expr, tz.expr) } @@ -3084,7 +3083,6 @@ object functions { * @group datetime_funcs * @since 1.5.0 */ - @deprecated("This 
function is deprecated and will be removed in future versions.", "3.0.0") def to_utc_timestamp(ts: Column, tz: String): Column = withExpr { ToUTCTimestamp(ts.expr, Literal(tz)) } @@ -3096,7 +3094,6 @@ object functions { * @group datetime_funcs * @since 2.4.0 */ - @deprecated("This function is deprecated and will be removed in future versions.", "3.0.0") def to_utc_timestamp(ts: Column, tz: Column): Column = withExpr { ToUTCTimestamp(ts.expr, tz.expr) } @@ -3266,6 +3263,11 @@ object functions { /** * Returns an array containing all the elements in `x` from index `start` (or starting from the * end if `start` is negative) with the specified `length`. + * + * @param x the array column to be sliced + * @param start the starting index + * @param length the length of the slice + * * @group collection_funcs * @since 2.4.0 */ @@ -3334,7 +3336,7 @@ object functions { * @group collection_funcs * @since 2.4.0 */ - def array_sort(e: Column): Column = withExpr { ArraySort(e.expr) } + def array_sort(e: Column): Column = withExpr { new ArraySort(e.expr) } /** * Remove all elements that equal to element from the given array. 
@@ -3385,6 +3387,265 @@ object functions { ArrayExcept(col1.expr, col2.expr) } + private def createLambda(f: Column => Column) = { + val x = UnresolvedNamedLambdaVariable(Seq("x")) + val function = f(Column(x)).expr + LambdaFunction(function, Seq(x)) + } + + private def createLambda(f: (Column, Column) => Column) = { + val x = UnresolvedNamedLambdaVariable(Seq("x")) + val y = UnresolvedNamedLambdaVariable(Seq("y")) + val function = f(Column(x), Column(y)).expr + LambdaFunction(function, Seq(x, y)) + } + + private def createLambda(f: (Column, Column, Column) => Column) = { + val x = UnresolvedNamedLambdaVariable(Seq("x")) + val y = UnresolvedNamedLambdaVariable(Seq("y")) + val z = UnresolvedNamedLambdaVariable(Seq("z")) + val function = f(Column(x), Column(y), Column(z)).expr + LambdaFunction(function, Seq(x, y, z)) + } + + /** + * Returns an array of elements after applying a transformation to each element + * in the input array. + * {{{ + * df.select(transform(col("i"), x => x + 1)) + * }}} + * + * @param column the input array column + * @param f col => transformed_col, the lambda function to transform the input column + * + * @group collection_funcs + * @since 3.0.0 + */ + def transform(column: Column, f: Column => Column): Column = withExpr { + ArrayTransform(column.expr, createLambda(f)) + } + + /** + * Returns an array of elements after applying a transformation to each element + * in the input array. + * {{{ + * df.select(transform(col("i"), (x, i) => x + i)) + * }}} + * + * @param column the input array column + * @param f (col, index) => transformed_col, the lambda function to filter the input column + * given the index. Indices start at 0. + * + * @group collection_funcs + * @since 3.0.0 + */ + def transform(column: Column, f: (Column, Column) => Column): Column = withExpr { + ArrayTransform(column.expr, createLambda(f)) + } + + /** + * Returns whether a predicate holds for one or more elements in the array. 
+ * {{{ + * df.select(exists(col("i"), _ % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f col => predicate, the Boolean predicate to check the input column + * + * @group collection_funcs + * @since 3.0.0 + */ + def exists(column: Column, f: Column => Column): Column = withExpr { + ArrayExists(column.expr, createLambda(f)) + } + + /** + * Returns whether a predicate holds for every element in the array. + * {{{ + * df.select(forall(col("i"), x => x % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f col => predicate, the Boolean predicate to check the input column + * + * @group collection_funcs + * @since 3.0.0 + */ + def forall(column: Column, f: Column => Column): Column = withExpr { + ArrayForAll(column.expr, createLambda(f)) + } + + /** + * Returns an array of elements for which a predicate holds in a given array. + * {{{ + * df.select(filter(col("s"), x => x % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f col => predicate, the Boolean predicate to filter the input column + * + * @group collection_funcs + * @since 3.0.0 + */ + def filter(column: Column, f: Column => Column): Column = withExpr { + ArrayFilter(column.expr, createLambda(f)) + } + + /** + * Returns an array of elements for which a predicate holds in a given array. + * {{{ + * df.select(filter(col("s"), (x, i) => i % 2 === 0)) + * }}} + * + * @param column the input array column + * @param f (col, index) => predicate, the Boolean predicate to filter the input column + * given the index. Indices start at 0. + * + * @group collection_funcs + * @since 3.0.0 + */ + def filter(column: Column, f: (Column, Column) => Column): Column = withExpr { + ArrayFilter(column.expr, createLambda(f)) + } + + /** + * Applies a binary operator to an initial state and all elements in the array, + * and reduces this to a single state. The final state is converted into the final result + * by applying a finish function. 
+ * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x, _ * 10)) + * }}} + * + * @param expr the input array column + * @param initialValue the initial value + * @param merge (combined_value, input_value) => combined_value, the merge function to merge + * an input value to the combined_value + * @param finish combined_value => final_value, the lambda function to convert the combined value + * of all inputs to final result + * + * @group collection_funcs + * @since 3.0.0 + */ + def aggregate( + expr: Column, + initialValue: Column, + merge: (Column, Column) => Column, + finish: Column => Column): Column = withExpr { + ArrayAggregate( + expr.expr, + initialValue.expr, + createLambda(merge), + createLambda(finish) + ) + } + + /** + * Applies a binary operator to an initial state and all elements in the array, + * and reduces this to a single state. + * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x)) + * }}} + * + * @param expr the input array column + * @param initialValue the initial value + * @param merge (combined_value, input_value) => combined_value, the merge function to merge + * an input value to the combined_value + * @group collection_funcs + * @since 3.0.0 + */ + def aggregate(expr: Column, initialValue: Column, merge: (Column, Column) => Column): Column = + aggregate(expr, initialValue, merge, c => c) + + /** + * Merge two given arrays, element-wise, into a single array using a function. + * If one array is shorter, nulls are appended at the end to match the length of the longer + * array, before applying the function. 
+ * {{{ + * df.select(zip_with(df1("val1"), df1("val2"), (x, y) => x + y)) + * }}} + * + * @param left the left input array column + * @param right the right input array column + * @param f (lCol, rCol) => col, the lambda function to merge two input columns into one column + * + * @group collection_funcs + * @since 3.0.0 + */ + def zip_with(left: Column, right: Column, f: (Column, Column) => Column): Column = withExpr { + ZipWith(left.expr, right.expr, createLambda(f)) + } + + /** + * Applies a function to every key-value pair in a map and returns + * a map with the results of those applications as the new keys for the pairs. + * {{{ + * df.select(transform_keys(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => new_key, the lambda function to transform the key of input map column + * + * @group collection_funcs + * @since 3.0.0 + */ + def transform_keys(expr: Column, f: (Column, Column) => Column): Column = withExpr { + TransformKeys(expr.expr, createLambda(f)) + } + + /** + * Applies a function to every key-value pair in a map and returns + * a map with the results of those applications as the new values for the pairs. + * {{{ + * df.select(transform_values(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => new_value, the lambda function to transform the value of input map + * column + * + * @group collection_funcs + * @since 3.0.0 + */ + def transform_values(expr: Column, f: (Column, Column) => Column): Column = withExpr { + TransformValues(expr.expr, createLambda(f)) + } + + /** + * Returns a map whose key-value pairs satisfy a predicate. 
+ * {{{ + * df.select(map_filter(col("m"), (k, v) => k * 10 === v)) + * }}} + * + * @param expr the input map column + * @param f (key, value) => predicate, the Boolean predicate to filter the input map column + * + * @group collection_funcs + * @since 3.0.0 + */ + def map_filter(expr: Column, f: (Column, Column) => Column): Column = withExpr { + MapFilter(expr.expr, createLambda(f)) + } + + /** + * Merge two given maps, key-wise into a single map using a function. + * {{{ + * df.select(map_zip_with(df("m1"), df("m2"), (k, v1, v2) => k === v1 + v2)) + * }}} + * + * @param left the left input map column + * @param right the right input map column + * @param f (key, value1, value2) => new_value, the lambda function to merge the map values + * + * @group collection_funcs + * @since 3.0.0 + */ + def map_zip_with( + left: Column, + right: Column, + f: (Column, Column, Column) => Column): Column = withExpr { + MapZipWith(left.expr, right.expr, createLambda(f)) + } + /** * Creates a new row for each element in the given array or map column. * Uses the default column name `col` for elements in the array and @@ -3942,6 +4203,63 @@ object functions { */ def to_csv(e: Column): Column = to_csv(e, Map.empty[String, String].asJava) + /** + * A transform for timestamps and dates to partition data into years. + * + * @group partition_transforms + * @since 3.0.0 + */ + def years(e: Column): Column = withExpr { Years(e.expr) } + + /** + * A transform for timestamps and dates to partition data into months. + * + * @group partition_transforms + * @since 3.0.0 + */ + def months(e: Column): Column = withExpr { Months(e.expr) } + + /** + * A transform for timestamps and dates to partition data into days. + * + * @group partition_transforms + * @since 3.0.0 + */ + def days(e: Column): Column = withExpr { Days(e.expr) } + + /** + * A transform for timestamps to partition data into hours. 
+ * + * @group partition_transforms + * @since 3.0.0 + */ + def hours(e: Column): Column = withExpr { Hours(e.expr) } + + /** + * A transform for any type that partitions by a hash of the input column. + * + * @group partition_transforms + * @since 3.0.0 + */ + def bucket(numBuckets: Column, e: Column): Column = withExpr { + numBuckets.expr match { + case lit @ Literal(_, IntegerType) => + Bucket(lit, e.expr) + case _ => + throw new AnalysisException(s"Invalid number of buckets: bucket($numBuckets, $e)") + } + } + + /** + * A transform for any type that partitions by a hash of the input column. + * + * @group partition_transforms + * @since 3.0.0 + */ + def bucket(numBuckets: Int, e: Column): Column = withExpr { + Bucket(Literal(numBuckets), e.expr) + } + // scalastyle:off line.size.limit // scalastyle:off parameter.number @@ -3997,6 +4315,67 @@ object functions { // Scala UDF functions ////////////////////////////////////////////////////////////////////////////////////////////// + /** + * Obtains a `UserDefinedFunction` that wraps the given `Aggregator` + * so that it may be used with untyped Data Frames. + * {{{ + * val agg = // Aggregator[IN, BUF, OUT] + * + * // declare a UDF based on agg + * val aggUDF = udaf(agg) + * val aggData = df.agg(aggUDF($"colname")) + * + * // register agg as a named function + * spark.udf.register("myAggName", udaf(agg)) + * }}} + * + * @tparam IN the aggregator input type + * @tparam BUF the aggregating buffer type + * @tparam OUT the finalized output type + * + * @param agg the typed Aggregator + * + * @return a UserDefinedFunction that can be used as an aggregating expression. + * + * @note The input encoder is inferred from the input type IN. + */ + def udaf[IN: TypeTag, BUF, OUT](agg: Aggregator[IN, BUF, OUT]): UserDefinedFunction = { + udaf(agg, ExpressionEncoder[IN]()) + } + + /** + * Obtains a `UserDefinedFunction` that wraps the given `Aggregator` + * so that it may be used with untyped Data Frames. 
+ * {{{ + * Aggregator agg = // custom Aggregator + * Encoder enc = // input encoder + * + * // declare a UDF based on agg + * UserDefinedFunction aggUDF = udaf(agg, enc) + * DataFrame aggData = df.agg(aggUDF($"colname")) + * + * // register agg as a named function + * spark.udf.register("myAggName", udaf(agg, enc)) + * }}} + * + * @tparam IN the aggregator input type + * @tparam BUF the aggregating buffer type + * @tparam OUT the finalized output type + * + * @param agg the typed Aggregator + * @param inputEncoder a specific input encoder to use + * + * @return a UserDefinedFunction that can be used as an aggregating expression + * + * @note This overloading takes an explicit input encoder, to support UDAF + * declarations in Java. + */ + def udaf[IN, BUF, OUT]( + agg: Aggregator[IN, BUF, OUT], + inputEncoder: Encoder[IN]): UserDefinedFunction = { + UserDefinedAggregator(agg, inputEncoder) + } + /** * Defines a Scala closure of 0 arguments as user-defined function (UDF). * The data types are automatically inferred based on the Scala closure's diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala index db4885aa01bad..eb658e2d8850e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/BaseSessionStateBuilder.scala @@ -19,16 +19,17 @@ package org.apache.spark.sql.internal import org.apache.spark.SparkConf import org.apache.spark.annotation.Unstable import org.apache.spark.sql.{ExperimentalMethods, SparkSession, UDFRegistration, _} -import org.apache.spark.sql.catalyst.analysis.{Analyzer, FunctionRegistry} +import org.apache.spark.sql.catalyst.analysis.{Analyzer, FunctionRegistry, ResolveSessionCatalog} import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.catalyst.optimizer.Optimizer import 
org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution.{ColumnarRule, QueryExecution, SparkOptimizer, SparkPlanner, SparkSqlParser} import org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.datasources.v2.TableCapabilityCheck +import org.apache.spark.sql.execution.datasources.v2.{TableCapabilityCheck, V2SessionCatalog} import org.apache.spark.sql.streaming.StreamingQueryManager import org.apache.spark.sql.util.ExecutionListenerManager @@ -151,6 +152,10 @@ abstract class BaseSessionStateBuilder( catalog } + protected lazy val v2SessionCatalog = new V2SessionCatalog(catalog, conf) + + protected lazy val catalogManager = new CatalogManager(conf, v2SessionCatalog, catalog) + /** * Interface exposed to the user for registering user-defined functions. * @@ -164,12 +169,12 @@ abstract class BaseSessionStateBuilder( * * Note: this depends on the `conf` and `catalog` fields. */ - protected def analyzer: Analyzer = new Analyzer(catalog, conf) { + protected def analyzer: Analyzer = new Analyzer(catalogManager, conf) { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: new FallBackFileSourceV2(session) +: - DataSourceResolution(conf, this.catalogManager) +: + new ResolveSessionCatalog(catalogManager, conf, catalog.isView) +: customResolutionRules override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = @@ -223,7 +228,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on `catalog` and `experimentalMethods` fields. 
*/ protected def optimizer: Optimizer = { - new SparkOptimizer(catalog, experimentalMethods) { + new SparkOptimizer(catalogManager, catalog, experimentalMethods) { override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules } @@ -245,7 +250,7 @@ abstract class BaseSessionStateBuilder( * Note: this depends on the `conf` and `experimentalMethods` fields. */ protected def planner: SparkPlanner = { - new SparkPlanner(session.sparkContext, conf, experimentalMethods) { + new SparkPlanner(session, conf, experimentalMethods) { override def extraPlanningStrategies: Seq[Strategy] = super.extraPlanningStrategies ++ customPlanningStrategies } @@ -311,7 +316,7 @@ abstract class BaseSessionStateBuilder( () => analyzer, () => optimizer, planner, - streamingQueryManager, + () => streamingQueryManager, listenerManager, () => resourceLoader, createQueryExecution, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index 3740b56cb9cbb..d3ef03e9b3b74 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -520,7 +520,7 @@ private[sql] object CatalogImpl { val encoded = data.map(d => enc.toRow(d).copy()) val plan = new LocalRelation(enc.schema.toAttributes, encoded) val queryExecution = sparkSession.sessionState.executePlan(plan) - new Dataset[T](sparkSession, queryExecution, enc) + new Dataset[T](queryExecution, enc) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index 4921e3ca903c4..64b7e7fe7923a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -65,6 +65,14 @@ 
object HiveSerDe { outputFormat = Option("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"), serde = Option("org.apache.hadoop.hive.serde2.avro.AvroSerDe"))) + // `HiveSerDe` in `serdeMap` should be distinct. + val serdeInverseMap: Map[HiveSerDe, String] = serdeMap.flatMap { + case ("sequencefile", _) => None + case ("rcfile", _) => None + case ("textfile", serde) => Some((serde, "text")) + case pair => Some(pair.swap) + } + /** * Get the Hive SerDe information from the data source abbreviation string or classname. * @@ -88,6 +96,14 @@ object HiveSerDe { serdeMap.get(key) } + /** + * Get the Spark data source name from the Hive SerDe information. + * + * @param serde Hive SerDe information. + * @return Spark data source name associated with the specified Hive SerDe. + */ + def serdeToSource(serde: HiveSerDe): Option[String] = serdeInverseMap.get(serde) + def getDefaultStorage(conf: SQLConf): CatalogStorageFormat = { // To respect hive-site.xml, it peeks Hadoop configuration from existing Spark session, // as an easy workaround. See SPARK-27555. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala index a83a0f51ecf11..abd1250628539 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala @@ -24,12 +24,12 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Unstable import org.apache.spark.sql._ -import org.apache.spark.sql.catalog.v2.CatalogManager import org.apache.spark.sql.catalyst.analysis.{Analyzer, FunctionRegistry} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.CatalogManager import org.apache.spark.sql.execution._ import org.apache.spark.sql.streaming.StreamingQueryManager import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListener} @@ -49,7 +49,8 @@ import org.apache.spark.sql.util.{ExecutionListenerManager, QueryExecutionListen * unresolved attributes and relations. * @param optimizerBuilder a function to create the logical query plan optimizer. * @param planner Planner that converts optimized logical plans to physical plans. - * @param streamingQueryManager Interface to start and stop streaming queries. + * @param streamingQueryManagerBuilder A function to create a streaming query manager to + * start and stop streaming queries. * @param listenerManager Interface to register custom [[QueryExecutionListener]]s. * @param resourceLoaderBuilder a function to create a session shared resource loader to load JARs, * files, etc. 
@@ -67,7 +68,7 @@ private[sql] class SessionState( analyzerBuilder: () => Analyzer, optimizerBuilder: () => Optimizer, val planner: SparkPlanner, - val streamingQueryManager: StreamingQueryManager, + val streamingQueryManagerBuilder: () => StreamingQueryManager, val listenerManager: ExecutionListenerManager, resourceLoaderBuilder: () => SessionResourceLoader, createQueryExecution: LogicalPlan => QueryExecution, @@ -83,6 +84,10 @@ private[sql] class SessionState( lazy val resourceLoader: SessionResourceLoader = resourceLoaderBuilder() + // The streamingQueryManager is lazy to avoid creating a StreamingQueryManager for each session + // when connecting to ThriftServer. + lazy val streamingQueryManager: StreamingQueryManager = streamingQueryManagerBuilder() + def catalogManager: CatalogManager = analyzer.catalogManager def newHadoopConf(): Configuration = SessionState.newHadoopConf( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index f1a648176c3b3..5347264d7c50a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -18,7 +18,9 @@ package org.apache.spark.sql.internal import java.net.URL -import java.util.Locale +import java.util.UUID +import java.util.concurrent.ConcurrentHashMap +import javax.annotation.concurrent.GuardedBy import scala.reflect.ClassTag import scala.util.control.NonFatal @@ -31,8 +33,11 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.CacheManager +import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab} import org.apache.spark.sql.internal.StaticSQLConf._ +import org.apache.spark.sql.streaming.StreamingQueryListener 
+import org.apache.spark.sql.streaming.ui.{StreamingQueryStatusListener, StreamingQueryTab} import org.apache.spark.status.ElementTrackingStore import org.apache.spark.util.Utils @@ -48,6 +53,8 @@ private[sql] class SharedState( initialConfigs: scala.collection.Map[String, String]) extends Logging { + SharedState.setFsUrlStreamHandlerFactory(sparkContext.conf) + // Load hive-site.xml into hadoopConf and determine the warehouse path we want to use, based on // the config from both hive and Spark SQL. Finally set the warehouse config value to sparkConf. val warehousePath: String = { @@ -110,6 +117,16 @@ private[sql] class SharedState( */ val cacheManager: CacheManager = new CacheManager + /** A global lock for all streaming query lifecycle tracking and management. */ + private[sql] val activeQueriesLock = new Object + + /** + * A map of active streaming queries to the session specific StreamingQueryManager that manages + * the lifecycle of that stream. + */ + @GuardedBy("activeQueriesLock") + private[sql] val activeStreamingQueries = new ConcurrentHashMap[UUID, StreamExecution]() + /** * A status store to query SQL status/metrics of this Spark application, based on SQL-specific * [[org.apache.spark.scheduler.SparkListenerEvent]]s. @@ -123,6 +140,22 @@ private[sql] class SharedState( statusStore } + /** + * A [[StreamingQueryListener]] for structured streaming ui, it contains all streaming query ui + * data to show. + */ + lazy val streamingQueryStatusListener: Option[StreamingQueryStatusListener] = { + sparkContext.ui.flatMap { ui => + if (conf.get(STREAMING_UI_ENABLED)) { + val statusListener = new StreamingQueryStatusListener(conf) + new StreamingQueryTab(statusListener, ui) + Some(statusListener) + } else { + None + } + } + } + /** * A catalog that interacts with external systems. 
*/ @@ -177,11 +210,23 @@ private[sql] class SharedState( } object SharedState extends Logging { - try { - URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()) - } catch { - case e: Error => - logWarning("URL.setURLStreamHandlerFactory failed to set FsUrlStreamHandlerFactory") + @volatile private var fsUrlStreamHandlerFactoryInitialized = false + + private def setFsUrlStreamHandlerFactory(conf: SparkConf): Unit = { + if (!fsUrlStreamHandlerFactoryInitialized && + conf.get(DEFAULT_URL_STREAM_HANDLER_FACTORY_ENABLED)) { + synchronized { + if (!fsUrlStreamHandlerFactoryInitialized) { + try { + URL.setURLStreamHandlerFactory(new FsUrlStreamHandlerFactory()) + fsUrlStreamHandlerFactoryInitialized = true + } catch { + case NonFatal(_) => + logWarning("URL.setURLStreamHandlerFactory failed to set FsUrlStreamHandlerFactory") + } + } + } + } } private val HIVE_EXTERNAL_CATALOG_CLASS_NAME = "org.apache.spark.sql.hive.HiveExternalCatalog" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala index d160ad82888a2..ab574df4557a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DB2Dialect.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql.jdbc import java.sql.Types +import java.util.Locale import org.apache.spark.sql.types._ private object DB2Dialect extends JdbcDialect { - override def canHandle(url: String): Boolean = url.startsWith("jdbc:db2") + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:db2") override def getCatalystType( sqlType: Int, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala index d13c29ed46bd5..d528d5a9fef5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/DerbyDialect.scala @@ -18,13 +18,15 @@ package org.apache.spark.sql.jdbc import java.sql.Types +import java.util.Locale import org.apache.spark.sql.types._ private object DerbyDialect extends JdbcDialect { - override def canHandle(url: String): Boolean = url.startsWith("jdbc:derby") + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:derby") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala index 805f73dee141b..72284b5996201 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala @@ -17,12 +17,16 @@ package org.apache.spark.sql.jdbc +import java.util.Locale + +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ private object MsSqlServerDialect extends JdbcDialect { - override def canHandle(url: String): Boolean = url.startsWith("jdbc:sqlserver") + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:sqlserver") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { @@ -30,10 +34,14 @@ private object MsSqlServerDialect extends JdbcDialect { // String is recommend by Microsoft SQL Server for datetimeoffset types in non-MS clients Option(StringType) } else { - sqlType match { - case java.sql.Types.SMALLINT => Some(ShortType) - case java.sql.Types.REAL => Some(FloatType) - case _ => None + if (SQLConf.get.legacyMsSqlServerNumericMappingEnabled) { + None + } else { + sqlType match { + case java.sql.Types.SMALLINT => Some(ShortType) + case java.sql.Types.REAL => Some(FloatType) + case _ => None + } } } } @@ 
-43,7 +51,8 @@ private object MsSqlServerDialect extends JdbcDialect { case StringType => Some(JdbcType("NVARCHAR(MAX)", java.sql.Types.NVARCHAR)) case BooleanType => Some(JdbcType("BIT", java.sql.Types.BIT)) case BinaryType => Some(JdbcType("VARBINARY(MAX)", java.sql.Types.VARBINARY)) - case ShortType => Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) + case ShortType if !SQLConf.get.legacyMsSqlServerNumericMappingEnabled => + Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) case _ => None } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala index b2cff7877d8b5..24b31b14d9427 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala @@ -18,12 +18,14 @@ package org.apache.spark.sql.jdbc import java.sql.Types +import java.util.Locale import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { - override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") + override def canHandle(url : String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index f4a6d0a4d2e44..4c0623729e00d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.jdbc import java.sql.{Date, Timestamp, Types} -import java.util.TimeZone +import java.util.{Locale, TimeZone} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf 
@@ -30,7 +30,8 @@ private case object OracleDialect extends JdbcDialect { private[jdbc] val BINARY_DOUBLE = 101 private[jdbc] val TIMESTAMPTZ = -101 - override def canHandle(url: String): Boolean = url.startsWith("jdbc:oracle") + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:oracle") private def supportTimeZoneTypes: Boolean = { val timeZone = DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index 2645e4c9d528b..c8d8a3392128e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.jdbc import java.sql.{Connection, Types} +import java.util.Locale import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ @@ -25,7 +26,8 @@ import org.apache.spark.sql.types._ private object PostgresDialect extends JdbcDialect { - override def canHandle(url: String): Boolean = url.startsWith("jdbc:postgresql") + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:postgresql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala index 6c17bd7ed9ec4..552d7a484f3fd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/TeradataDialect.scala @@ -17,14 +17,15 @@ package org.apache.spark.sql.jdbc -import java.sql.Types +import java.util.Locale import org.apache.spark.sql.types._ private case object TeradataDialect extends 
JdbcDialect { - override def canHandle(url: String): Boolean = { url.startsWith("jdbc:teradata") } + override def canHandle(url: String): Boolean = + url.toLowerCase(Locale.ROOT).startsWith("jdbc:teradata") override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { case StringType => Some(JdbcType("VARCHAR(255)", java.sql.Types.VARCHAR)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 23a84cbd0dc02..0eb4776988d9f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -24,13 +24,13 @@ import scala.collection.JavaConverters._ import org.apache.spark.annotation.Evolving import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.connector.catalog.{SupportsRead, TableProvider} +import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Utils, FileDataSourceV2} import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.TableCapability._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -83,9 +83,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
      *
    • `timeZone` (default session local timezone): sets the string that indicates a timezone * to be used to parse timestamps in the JSON/CSV datasources or partition values.
    • - *
    • `pathGlobFilter`: an optional glob pattern to only include files with paths matching - * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. - * It does not change the behavior of partition discovery.
    • *
    * * @since 2.0.0 @@ -123,9 +120,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
      *
    • `timeZone` (default session local timezone): sets the string that indicates a timezone * to be used to parse timestamps in the JSON/CSV data sources or partition values.
    • - *
    • `pathGlobFilter`: an optional glob pattern to only include files with paths matching - * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. - * It does not change the behavior of partition discovery.
    • *
    * * @since 2.0.0 @@ -142,9 +136,6 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
      *
    • `timeZone` (default session local timezone): sets the string that indicates a timezone * to be used to parse timestamps in the JSON/CSV data sources or partition values.
    • - *
    • `pathGlobFilter`: an optional glob pattern to only include files with paths matching - * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. - * It does not change the behavior of partition discovery.
    • *
    * * @since 2.0.0 @@ -182,15 +173,13 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo case _ => None } ds match { - case provider: TableProvider => + // file source v2 does not support streaming yet. + case provider: TableProvider if !provider.isInstanceOf[FileDataSourceV2] => val sessionOptions = DataSourceV2Utils.extractSessionConfigs( source = provider, conf = sparkSession.sessionState.conf) val options = sessionOptions ++ extraOptions val dsOptions = new CaseInsensitiveStringMap(options.asJava) - val table = userSpecifiedSchema match { - case Some(schema) => provider.getTable(dsOptions, schema) - case _ => provider.getTable(dsOptions) - } + val table = DataSourceV2Utils.getTableFromProvider(provider, dsOptions, userSpecifiedSchema) import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ table match { case _: SupportsRead if table.supportsAny(MICRO_BATCH_READ, CONTINUOUS_READ) => @@ -277,6 +266,11 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * empty array/struct during schema inference. *
  • `locale` (default is `en-US`): sets a locale as language tag in IETF BCP 47 format. * For instance, this is used while parsing dates and timestamps.
  • + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * * * @since 2.0.0 @@ -357,6 +351,11 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * For instance, this is used while parsing dates and timestamps. *
  • `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator * that should be used for parsing. Maximum length is 1 character.
  • + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * * * @since 2.0.0 @@ -370,6 +369,14 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo *
      *
    • `maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be * considered in every trigger.
    • + *
    • `mergeSchema` (default is the value specified in `spark.sql.orc.mergeSchema`): sets whether + * we should merge schemas collected from all ORC part-files. This will override + * `spark.sql.orc.mergeSchema`.
    • + *
    • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
    • + *
    • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
    • *
    * * @since 2.3.0 @@ -389,6 +396,11 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * whether we should merge schemas collected from all * Parquet part-files. This will override * `spark.sql.parquet.mergeSchema`. + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * * * @since 2.0.0 @@ -419,6 +431,11 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * *
  • `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator * that should be used for parsing.
  • + *
  • `pathGlobFilter`: an optional glob pattern to only include files with paths matching + * the pattern. The syntax follows org.apache.hadoop.fs.GlobFilter. + * It does not change the behavior of partition discovery.
  • + *
  • `recursiveFileLookup`: recursively scan a directory for files. Using this option + * disables partition discovery
  • * * * @since 2.0.0 @@ -442,15 +459,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * spark.readStream().textFile("/path/to/spark/README.md") * }}} * - * You can set the following text-specific options to deal with text files: - *
      - *
    • `maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be - * considered in every trigger.
    • - *
    • `wholetext` (default `false`): If true, read a file as a single row and not split by "\n". - *
    • - *
    • `lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator - * that should be used for parsing.
    • - *
    + * You can set the text-specific options as specified in `DataStreamReader.text`. * * @param path input path * @since 2.1.0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 36104d7a70443..1c21a30dd5bd6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.streaming import java.util.Locale +import java.util.concurrent.TimeoutException import scala.collection.JavaConverters._ @@ -25,13 +26,13 @@ import org.apache.spark.annotation.Evolving import org.apache.spark.api.java.function.VoidFunction2 import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes +import org.apache.spark.sql.connector.catalog.{SupportsWrite, TableProvider} +import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Utils, FileDataSourceV2} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources._ -import org.apache.spark.sql.sources.v2.{SupportsWrite, TableProvider} -import org.apache.spark.sql.sources.v2.TableCapability._ import org.apache.spark.sql.util.CaseInsensitiveStringMap /** @@ -238,10 +239,18 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { /** * Starts the execution of the streaming query, which will continually output results to the given * path as new data arrives. The returned [[StreamingQuery]] object can be used to interact with - * the stream. + * the stream. 
Throws a `TimeoutException` if the following conditions are met: + * - Another run of the same streaming query, that is a streaming query + * sharing the same checkpoint location, is already active on the same + * Spark Driver + * - The SQL configuration `spark.sql.streaming.stopActiveRunOnRestart` + * is enabled + * - The active run cannot be stopped within the timeout controlled by + * the SQL configuration `spark.sql.streaming.stopTimeout` * * @since 2.0.0 */ + @throws[TimeoutException] def start(): StreamingQuery = { if (source.toLowerCase(Locale.ROOT) == DDLUtils.HIVE_PROVIDER) { throw new AnalysisException("Hive data source can only be used with tables, you can not " + @@ -299,7 +308,9 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { } else { val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) val disabledSources = df.sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") - val useV1Source = disabledSources.contains(cls.getCanonicalName) + val useV1Source = disabledSources.contains(cls.getCanonicalName) || + // file source v2 does not support streaming yet. 
+ classOf[FileDataSourceV2].isAssignableFrom(cls) val sink = if (classOf[TableProvider].isAssignableFrom(cls) && !useV1Source) { val provider = cls.getConstructor().newInstance().asInstanceOf[TableProvider] @@ -307,8 +318,10 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { source = provider, conf = df.sparkSession.sessionState.conf) val options = sessionOptions ++ extraOptions val dsOptions = new CaseInsensitiveStringMap(options.asJava) + val table = DataSourceV2Utils.getTableFromProvider( + provider, dsOptions, userSpecifiedSchema = None) import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ - provider.getTable(dsOptions) match { + table match { case table: SupportsWrite if table.supports(STREAMING_WRITE) => table case _ => createV1Sink() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala index ab68eba81b843..af08a53e465b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/GroupState.scala @@ -93,7 +93,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalGroupState * any trigger and timeout function call will not occur until there is data. * - Since the processing time timeout is based on the clock time, it is affected by the * variations in the system clock (i.e. time zone changes, clock skew, etc.). - * - With `EventTimeTimeout`, the user also has to specify the the the event time watermark in + * - With `EventTimeTimeout`, the user also has to specify the event time watermark in * the query using `Dataset.withWatermark()`. With this setting, data that is older than the * watermark are filtered out. 
The timeout can be set for a group by setting a timeout timestamp * using`GroupState.setTimeoutTimestamp()`, and the timeout would occur when the watermark diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala index 47ddc88e964e8..85d980e5d6733 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQuery.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.streaming import java.util.UUID +import java.util.concurrent.TimeoutException import org.apache.spark.annotation.Evolving import org.apache.spark.sql.SparkSession @@ -142,10 +143,17 @@ trait StreamingQuery { def processAllAvailable(): Unit /** - * Stops the execution of this query if it is running. This method blocks until the threads - * performing execution has stopped. + * Stops the execution of this query if it is running. This waits until the termination of the + * query execution threads or until a timeout is hit. + * + * By default stop will block indefinitely. You can configure a timeout by the configuration + * `spark.sql.streaming.stopTimeout`. A timeout of 0 (or negative) milliseconds will block + * indefinitely. If a `TimeoutException` is thrown, users can retry stopping the stream. If the + * issue persists, it is advisable to kill the Spark application. 
+ * * @since 2.0.0 */ + @throws[TimeoutException] def stop(): Unit /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala index 916d6a0365965..dd842cd1a3e99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryListener.scala @@ -79,16 +79,18 @@ object StreamingQueryListener { /** * Event representing the start of a query - * @param id An unique query id that persists across restarts. See `StreamingQuery.id()`. + * @param id A unique query id that persists across restarts. See `StreamingQuery.id()`. * @param runId A query id that is unique for every start/restart. See `StreamingQuery.runId()`. * @param name User-specified name of the query, null if not specified. + * @param submissionTime The timestamp to start a query. * @since 2.1.0 */ @Evolving class QueryStartedEvent private[sql]( val id: UUID, val runId: UUID, - val name: String) extends Event + val name: String, + val submissionTime: Long) extends Event /** * Event representing any progress updates in a query. @@ -101,7 +103,7 @@ object StreamingQueryListener { /** * Event representing that termination of a query. * - * @param id An unique query id that persists across restarts. See `StreamingQuery.id()`. + * @param id A unique query id that persists across restarts. See `StreamingQuery.id()`. * @param runId A query id that is unique for every start/restart. See `StreamingQuery.runId()`. * @param exception The exception message of the query if the query was terminated * with an exception. Otherwise, it will be `None`. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 976595616bd28..4d0d8ffd959c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.streaming -import java.util.UUID -import java.util.concurrent.TimeUnit +import java.util.{ConcurrentModificationException, UUID} +import java.util.concurrent.{TimeoutException, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ @@ -29,14 +29,15 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.annotation.Evolving import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.STREAMING_QUERY_LISTENERS -import org.apache.spark.sql.sources.v2.{SupportsWrite, Table} import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -51,9 +52,10 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo StateStoreCoordinatorRef.forDriver(sparkSession.sparkContext.env) private val listenerBus = new StreamingQueryListenerBus(sparkSession.sparkContext.listenerBus) - @GuardedBy("activeQueriesLock") + @GuardedBy("activeQueriesSharedLock") private val activeQueries = new 
mutable.HashMap[UUID, StreamingQuery] - private val activeQueriesLock = new Object + // A global lock to keep track of active streaming queries across Spark sessions + private val activeQueriesSharedLock = sparkSession.sharedState.activeQueriesLock private val awaitTerminationLock = new Object @GuardedBy("awaitTerminationLock") @@ -67,6 +69,9 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo logInfo(s"Registered listener ${listener.getClass.getName}") }) } + sparkSession.sharedState.streamingQueryStatusListener.foreach { listener => + addListener(listener) + } } catch { case e: Exception => throw new SparkException("Exception when registering StreamingQueryListener", e) @@ -77,7 +82,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * * @since 2.0.0 */ - def active: Array[StreamingQuery] = activeQueriesLock.synchronized { + def active: Array[StreamingQuery] = activeQueriesSharedLock.synchronized { activeQueries.values.toArray } @@ -86,7 +91,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * * @since 2.1.0 */ - def get(id: UUID): StreamingQuery = activeQueriesLock.synchronized { + def get(id: UUID): StreamingQuery = activeQueriesSharedLock.synchronized { activeQueries.get(id).orNull } @@ -320,6 +325,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo * @param trigger [[Trigger]] for the query. * @param triggerClock [[Clock]] to use for the triggering. */ + @throws[TimeoutException] private[sql] def startQuery( userSpecifiedName: Option[String], userSpecifiedCheckpointLocation: Option[String], @@ -343,25 +349,61 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo trigger, triggerClock) - activeQueriesLock.synchronized { + // The following code block checks if a stream with the same name or id is running. 
Then it + // returns an Option of an already active stream to stop outside of the lock + // to avoid a deadlock. + val activeRunOpt = activeQueriesSharedLock.synchronized { // Make sure no other query with same name is active userSpecifiedName.foreach { name => if (activeQueries.values.exists(_.name == name)) { - throw new IllegalArgumentException( - s"Cannot start query with name $name as a query with that name is already active") + throw new IllegalArgumentException(s"Cannot start query with name $name as a query " + + s"with that name is already active in this SparkSession") } } - // Make sure no other query with same id is active - if (activeQueries.values.exists(_.id == query.id)) { - throw new IllegalStateException( - s"Cannot start query with id ${query.id} as another query with same id is " + - s"already active. Perhaps you are attempting to restart a query from checkpoint " + - s"that is already active.") + // Make sure no other query with same id is active across all sessions + val activeOption = Option(sparkSession.sharedState.activeStreamingQueries.get(query.id)) + .orElse(activeQueries.get(query.id)) // shouldn't be needed but paranoia ... + + val shouldStopActiveRun = + sparkSession.sessionState.conf.getConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART) + if (activeOption.isDefined) { + if (shouldStopActiveRun) { + val oldQuery = activeOption.get + logWarning(s"Stopping existing streaming query [id=${query.id}, " + + s"runId=${oldQuery.runId}], as a new run is being started.") + Some(oldQuery) + } else { + throw new IllegalStateException( + s"Cannot start query with id ${query.id} as another query with same id is " + + s"already active. Perhaps you are attempting to restart a query from checkpoint " + + s"that is already active. 
You may stop the old query by setting the SQL " + + "configuration: " + + s"""spark.conf.set("${SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key}", true) """ + + "and retry.") + } + } else { + // nothing to stop so, no-op + None } + } + // stop() will clear the queryId from activeStreamingQueries as well as activeQueries + activeRunOpt.foreach(_.stop()) + + activeQueriesSharedLock.synchronized { + // We still can have a race condition when two concurrent instances try to start the same + // stream, while a third one was already active and stopped above. In this case, we throw a + // ConcurrentModificationException. + val oldActiveQuery = sparkSession.sharedState.activeStreamingQueries.put( + query.id, query.streamingQuery) // we need to put the StreamExecution, not the wrapper + if (oldActiveQuery != null) { + throw new ConcurrentModificationException( + "Another instance of this query was just started by a concurrent session.") + } activeQueries.put(query.id, query) } + try { // When starting a query, it will call `StreamingQueryListener.onQueryStarted` synchronously. // As it's provided by the user and can run arbitrary codes, we must not hold any lock here. 
@@ -370,9 +412,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo query.streamingQuery.start() } catch { case e: Throwable => - activeQueriesLock.synchronized { - activeQueries -= query.id - } + unregisterTerminatedStream(query) throw e } query @@ -380,9 +420,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo /** Notify (by the StreamingQuery) that the query has been terminated */ private[sql] def notifyQueryTermination(terminatedQuery: StreamingQuery): Unit = { - activeQueriesLock.synchronized { - activeQueries -= terminatedQuery.id - } + unregisterTerminatedStream(terminatedQuery) awaitTerminationLock.synchronized { if (lastTerminatedQuery == null || terminatedQuery.exception.nonEmpty) { lastTerminatedQuery = terminatedQuery @@ -391,4 +429,13 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo } stateStoreCoordinator.deactivateInstances(terminatedQuery.runId) } + + private def unregisterTerminatedStream(terminatedQuery: StreamingQuery): Unit = { + activeQueriesSharedLock.synchronized { + // remove from shared state only if the streaming execution also matches + sparkSession.sharedState.activeStreamingQueries.remove( + terminatedQuery.id, terminatedQuery) + activeQueries -= terminatedQuery.id + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala index 0b3945cbd1323..13b506b60a126 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala @@ -24,12 +24,15 @@ import java.util.UUID import scala.collection.JavaConverters._ import scala.util.control.NonFatal +import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.json4s._ import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import 
org.apache.spark.annotation.Evolving +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.streaming.SinkProgress.DEFAULT_NUM_OUTPUT_ROWS /** @@ -74,7 +77,7 @@ class StateOperatorProgress private[sql]( * a trigger. Each event relates to processing done for a single trigger of the streaming * query. Events are emitted even when no new data is available to be processed. * - * @param id An unique query id that persists across restarts. See `StreamingQuery.id()`. + * @param id A unique query id that persists across restarts. See `StreamingQuery.id()`. * @param runId A query id that is unique for every start/restart. See `StreamingQuery.runId()`. * @param name User-specified name of the query, null if not specified. * @param timestamp Beginning time of the trigger in ISO8601 format, i.e. UTC timestamps. @@ -82,6 +85,7 @@ class StateOperatorProgress private[sql]( * case of retries after a failure a given batchId my be executed more than once. * Similarly, when there is no data to be processed, the batchId will not be * incremented. + * @param batchDuration The process duration of each batch. * @param durationMs The amount of time taken to perform various operations in milliseconds. * @param eventTime Statistics of event time seen in this batch. It may contain the following keys: * {{{ @@ -102,11 +106,14 @@ class StreamingQueryProgress private[sql]( val name: String, val timestamp: String, val batchId: Long, + val batchDuration: Long, val durationMs: ju.Map[String, JLong], val eventTime: ju.Map[String, String], val stateOperators: Array[StateOperatorProgress], val sources: Array[SourceProgress], - val sink: SinkProgress) extends Serializable { + val sink: SinkProgress, + @JsonDeserialize(contentAs = classOf[GenericRowWithSchema]) + val observedMetrics: ju.Map[String, Row]) extends Serializable { /** The aggregate (across all sources) number of records processed in a trigger. 
*/ def numInputRows: Long = sources.map(_.numInputRows).sum @@ -149,7 +156,8 @@ class StreamingQueryProgress private[sql]( ("eventTime" -> safeMapToJValue[String](eventTime, s => JString(s))) ~ ("stateOperators" -> JArray(stateOperators.map(_.jsonValue).toList)) ~ ("sources" -> JArray(sources.map(_.jsonValue).toList)) ~ - ("sink" -> sink.jsonValue) + ("sink" -> sink.jsonValue) ~ + ("observedMetrics" -> safeMapToJValue[Row](observedMetrics, row => row.jsonValue)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala new file mode 100644 index 0000000000000..650f64fe1688c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPage.scala @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming.ui + +import java.text.SimpleDateFormat +import javax.servlet.http.HttpServletRequest + +import scala.xml.Node + +import org.apache.commons.lang3.StringEscapeUtils + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone +import org.apache.spark.sql.streaming.ui.UIUtils._ +import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} + +private[ui] class StreamingQueryPage(parent: StreamingQueryTab) + extends WebUIPage("") with Logging { + val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") + df.setTimeZone(getTimeZone("UTC")) + + override def render(request: HttpServletRequest): Seq[Node] = { + val content = generateStreamingQueryTable(request) + SparkUIUtils.headerSparkPage(request, "Streaming Query", content, parent) + } + + def generateDataRow(request: HttpServletRequest, queryActive: Boolean) + (query: StreamingQueryUIData): Seq[Node] = { + + def details(detail: Any): Seq[Node] = { + if (queryActive) { + return Seq.empty[Node] + } + val detailString = detail.asInstanceOf[String] + val isMultiline = detailString.indexOf('\n') >= 0 + val summary = StringEscapeUtils.escapeHtml4( + if (isMultiline) detailString.substring(0, detailString.indexOf('\n')) else detailString + ) + val details = SparkUIUtils.detailsUINode(isMultiline, detailString) + {summary}{details} + } + + val statisticsLink = "%s/%s/statistics?id=%s" + .format(SparkUIUtils.prependBaseUri(request, parent.basePath), parent.prefix, query.runId) + + val name = UIUtils.getQueryName(query) + val status = UIUtils.getQueryStatus(query) + val duration = if (queryActive) { + SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.submissionTime) + } else { + withNoProgress(query, { + val endTimeMs = query.lastProgress.timestamp + SparkUIUtils.formatDurationVerbose(df.parse(endTimeMs).getTime - query.submissionTime) + }, "-") + } + + + {name} + {status} + {query.id} + {query.runId} + 
{SparkUIUtils.formatDate(query.submissionTime)} + {duration} + {withNoProgress(query, { + (query.recentProgress.map(p => withNumberInvalid(p.inputRowsPerSecond)).sum / + query.recentProgress.length).formatted("%.2f") }, "NaN")} + + {withNoProgress(query, { + (query.recentProgress.map(p => withNumberInvalid(p.processedRowsPerSecond)).sum / + query.recentProgress.length).formatted("%.2f") }, "NaN")} + + {withNoProgress(query, { query.lastProgress.batchId }, "NaN")} + {details(query.exception.getOrElse("-"))} + + } + + private def generateStreamingQueryTable(request: HttpServletRequest): Seq[Node] = { + val (activeQueries, inactiveQueries) = parent.statusListener.allQueryStatus + .partition(_.isActive) + val activeQueryTables = if (activeQueries.nonEmpty) { + val headerRow = Seq( + "Name", "Status", "Id", "Run ID", "Submitted Time", "Duration", "Avg Input /sec", + "Avg Process /sec", "Latest Batch") + + Some(SparkUIUtils.listingTable(headerRow, generateDataRow(request, queryActive = true), + activeQueries, true, None, Seq(null), false)) + } else { + None + } + + val inactiveQueryTables = if (inactiveQueries.nonEmpty) { + val headerRow = Seq( + "Name", "Status", "Id", "Run ID", "Submitted Time", "Duration", "Avg Input /sec", + "Avg Process /sec", "Latest Batch", "Error") + + Some(SparkUIUtils.listingTable(headerRow, generateDataRow(request, queryActive = false), + inactiveQueries, true, None, Seq(null), false)) + } else { + None + } + + // scalastyle:off + val content = + +
    + + Active Streaming Queries ({activeQueries.length}) +
    +
    ++ +
    +
      + {activeQueryTables.getOrElse(Seq.empty[Node])} +
    +
    ++ + +
    + + Completed Streaming Queries ({inactiveQueries.length}) +
    +
    ++ +
    +
      + {inactiveQueryTables.getOrElse(Seq.empty[Node])} +
    +
    + // scalastyle:on + + content + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala new file mode 100644 index 0000000000000..56672ce328bff --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatisticsPage.scala @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming.ui + +import java.{util => ju} +import java.lang.{Long => JLong} +import java.text.SimpleDateFormat +import java.util.UUID +import javax.servlet.http.HttpServletRequest + +import scala.xml.{Node, Unparsed} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.util.DateTimeUtils.getTimeZone +import org.apache.spark.sql.streaming.ui.UIUtils._ +import org.apache.spark.ui.{GraphUIData, JsCollector, UIUtils => SparkUIUtils, WebUIPage} + +private[ui] class StreamingQueryStatisticsPage(parent: StreamingQueryTab) + extends WebUIPage("statistics") with Logging { + val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") + df.setTimeZone(getTimeZone("UTC")) + + def generateLoadResources(request: HttpServletRequest): Seq[Node] = { + // scalastyle:off + + + + + // scalastyle:on + } + + override def render(request: HttpServletRequest): Seq[Node] = { + val parameterId = request.getParameter("id") + require(parameterId != null && parameterId.nonEmpty, "Missing id parameter") + + val query = parent.statusListener.allQueryStatus.find { case q => + q.runId.equals(UUID.fromString(parameterId)) + }.getOrElse(throw new IllegalArgumentException(s"Failed to find streaming query $parameterId")) + + val resources = generateLoadResources(request) + val basicInfo = generateBasicInfo(query) + val content = + resources ++ + basicInfo ++ + generateStatTable(query) + SparkUIUtils.headerSparkPage(request, "Streaming Query Statistics", content, parent) + } + + def generateTimeMap(times: Seq[Long]): Seq[Node] = { + val js = "var timeFormat = {};\n" + times.map { time => + val formattedTime = SparkUIUtils.formatBatchTime(time, 1, showYYYYMMSS = false) + s"timeFormat[$time] = '$formattedTime';" + }.mkString("\n") + + + } + + def generateVar(values: Array[(Long, ju.Map[String, JLong])]): Seq[Node] = { + val durationDataPadding = SparkUIUtils.durationDataPadding(values) + val js = "var timeToValues = {};\n" + 
durationDataPadding.map { case (x, y) => + val s = y.toSeq.sortBy(_._1).map(e => s""""${e._2}"""").mkString("[", ",", "]") + s"""timeToValues["${SparkUIUtils.formatBatchTime(x, 1, showYYYYMMSS = false)}"] = $s;""" + }.mkString("\n") + + + } + + def generateBasicInfo(query: StreamingQueryUIData): Seq[Node] = { + val duration = if (query.isActive) { + SparkUIUtils.formatDurationVerbose(System.currentTimeMillis() - query.submissionTime) + } else { + withNoProgress(query, { + val end = query.lastProgress.timestamp + val start = query.recentProgress.head.timestamp + SparkUIUtils.formatDurationVerbose( + df.parse(end).getTime - df.parse(start).getTime) + }, "-") + } + + val name = UIUtils.getQueryName(query) + val numBatches = withNoProgress(query, { query.lastProgress.batchId + 1L }, 0) +
    Running batches for + + {duration} + + since + + {SparkUIUtils.formatDate(query.submissionTime)} + + ({numBatches} completed batches) +
    +
    +
    Name: {name}
    +
    Id: {query.id}
    +
    RunId: {query.runId}
    +
    + } + + def generateStatTable(query: StreamingQueryUIData): Seq[Node] = { + val batchTimes = withNoProgress(query, + query.recentProgress.map(p => df.parse(p.timestamp).getTime), Array.empty[Long]) + val minBatchTime = + withNoProgress(query, df.parse(query.recentProgress.head.timestamp).getTime, 0L) + val maxBatchTime = + withNoProgress(query, df.parse(query.lastProgress.timestamp).getTime, 0L) + val maxRecordRate = + withNoProgress(query, query.recentProgress.map(_.inputRowsPerSecond).max, 0L) + val minRecordRate = 0L + val maxProcessRate = + withNoProgress(query, query.recentProgress.map(_.processedRowsPerSecond).max, 0L) + + val minProcessRate = 0L + val maxRows = withNoProgress(query, query.recentProgress.map(_.numInputRows).max, 0L) + val minRows = 0L + val maxBatchDuration = withNoProgress(query, query.recentProgress.map(_.batchDuration).max, 0L) + val minBatchDuration = 0L + + val inputRateData = withNoProgress(query, + query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + withNumberInvalid { p.inputRowsPerSecond })), Array.empty[(Long, Double)]) + val processRateData = withNoProgress(query, + query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + withNumberInvalid { p.processedRowsPerSecond })), Array.empty[(Long, Double)]) + val inputRowsData = withNoProgress(query, + query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + withNumberInvalid { p.numInputRows })), Array.empty[(Long, Double)]) + val batchDurations = withNoProgress(query, + query.recentProgress.map(p => (df.parse(p.timestamp).getTime, + withNumberInvalid { p.batchDuration })), Array.empty[(Long, Double)]) + val operationDurationData = withNoProgress(query, query.recentProgress.map { p => + val durationMs = p.durationMs + // remove "triggerExecution" as it count the other operation duration. 
+ durationMs.remove("triggerExecution") + (df.parse(p.timestamp).getTime, durationMs)}, Array.empty[(Long, ju.Map[String, JLong])]) + + val jsCollector = new JsCollector + val graphUIDataForInputRate = + new GraphUIData( + "input-rate-timeline", + "input-rate-histogram", + inputRateData, + minBatchTime, + maxBatchTime, + minRecordRate, + maxRecordRate, + "records/sec") + graphUIDataForInputRate.generateDataJs(jsCollector) + + val graphUIDataForProcessRate = + new GraphUIData( + "process-rate-timeline", + "process-rate-histogram", + processRateData, + minBatchTime, + maxBatchTime, + minProcessRate, + maxProcessRate, + "records/sec") + graphUIDataForProcessRate.generateDataJs(jsCollector) + + val graphUIDataForInputRows = + new GraphUIData( + "input-rows-timeline", + "input-rows-histogram", + inputRowsData, + minBatchTime, + maxBatchTime, + minRows, + maxRows, + "records") + graphUIDataForInputRows.generateDataJs(jsCollector) + + val graphUIDataForBatchDuration = + new GraphUIData( + "batch-duration-timeline", + "batch-duration-histogram", + batchDurations, + minBatchTime, + maxBatchTime, + minBatchDuration, + maxBatchDuration, + "ms") + graphUIDataForBatchDuration.generateDataJs(jsCollector) + + val graphUIDataForDuration = + new GraphUIData( + "duration-area-stack", + "", + Seq.empty[(Long, Double)], + 0L, + 0L, + 0L, + 0L, + "ms") + + val table = + // scalastyle:off + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    TimelinesHistograms
    +
    +
    Input Rate {SparkUIUtils.tooltip("The aggregate (across all sources) rate of data arriving.", "right")}
    +
    +
    {graphUIDataForInputRate.generateTimelineHtml(jsCollector)}{graphUIDataForInputRate.generateHistogramHtml(jsCollector)}
    +
    +
    Process Rate {SparkUIUtils.tooltip("The aggregate (across all sources) rate at which Spark is processing data.", "right")}
    +
    +
    {graphUIDataForProcessRate.generateTimelineHtml(jsCollector)}{graphUIDataForProcessRate.generateHistogramHtml(jsCollector)}
    +
    +
    Input Rows {SparkUIUtils.tooltip("The aggregate (across all sources) number of records processed in a trigger.", "right")}
    +
    +
    {graphUIDataForInputRows.generateTimelineHtml(jsCollector)}{graphUIDataForInputRows.generateHistogramHtml(jsCollector)}
    +
    +
    Batch Duration {SparkUIUtils.tooltip("The process duration of each batch.", "right")}
    +
    +
    {graphUIDataForBatchDuration.generateTimelineHtml(jsCollector)}{graphUIDataForBatchDuration.generateHistogramHtml(jsCollector)}
    +
    +
    Operation Duration {SparkUIUtils.tooltip("The amount of time taken to perform various operations in milliseconds.", "right")}
    +
    +
    {graphUIDataForDuration.generateAreaStackHtmlWithData(jsCollector, operationDurationData)}
    + // scalastyle:on + + generateVar(operationDurationData) ++ generateTimeMap(batchTimes) ++ table ++ jsCollector.toHtml + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala new file mode 100644 index 0000000000000..91815110e0d39 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListener.scala @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.ui + +import java.text.SimpleDateFormat +import java.util.UUID +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.SparkConf +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.StaticSQLConf +import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress} + +/** + * A customized StreamingQueryListener used in structured streaming UI, which contains all + * UI data for both active and inactive query. + * TODO: Add support for history server. 
+ */ +private[sql] class StreamingQueryStatusListener(conf: SparkConf) extends StreamingQueryListener { + + private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 + timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC")) + + /** + * We use runId as the key here instead of id in active query status map, + * because the runId is unique for every started query, even if it is a restart. + */ + private[ui] val activeQueryStatus = new ConcurrentHashMap[UUID, StreamingQueryUIData]() + private[ui] val inactiveQueryStatus = new mutable.Queue[StreamingQueryUIData]() + + private val streamingProgressRetention = + conf.get(StaticSQLConf.STREAMING_UI_RETAINED_PROGRESS_UPDATES) + private val inactiveQueryStatusRetention = conf.get(StaticSQLConf.STREAMING_UI_RETAINED_QUERIES) + + override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = { + activeQueryStatus.putIfAbsent(event.runId, + new StreamingQueryUIData(event.name, event.id, event.runId, event.submissionTime)) + } + + override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { + val batchTimestamp = timestampFormat.parse(event.progress.timestamp).getTime + val queryStatus = activeQueryStatus.getOrDefault( + event.progress.runId, + new StreamingQueryUIData(event.progress.name, event.progress.id, event.progress.runId, + batchTimestamp)) + queryStatus.updateProcess(event.progress, streamingProgressRetention) + } + + override def onQueryTerminated( + event: StreamingQueryListener.QueryTerminatedEvent): Unit = synchronized { + val queryStatus = activeQueryStatus.remove(event.runId) + if (queryStatus != null) { + queryStatus.queryTerminated(event) + inactiveQueryStatus += queryStatus + while (inactiveQueryStatus.length >= inactiveQueryStatusRetention) { + inactiveQueryStatus.dequeue() + } + } + } + + def allQueryStatus: Seq[StreamingQueryUIData] = synchronized { + activeQueryStatus.values().asScala.toSeq ++ inactiveQueryStatus + } 
+} + +/** + * This class contains all message related to UI display, each instance corresponds to a single + * [[org.apache.spark.sql.streaming.StreamingQuery]]. + */ +private[ui] class StreamingQueryUIData( + val name: String, + val id: UUID, + val runId: UUID, + val submissionTime: Long) { + + /** Holds the most recent query progress updates. */ + private val progressBuffer = new mutable.Queue[StreamingQueryProgress]() + + private var _isActive = true + private var _exception: Option[String] = None + + def isActive: Boolean = synchronized { _isActive } + + def exception: Option[String] = synchronized { _exception } + + def queryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = synchronized { + _isActive = false + _exception = event.exception + } + + def updateProcess( + newProgress: StreamingQueryProgress, retentionNum: Int): Unit = progressBuffer.synchronized { + progressBuffer += newProgress + while (progressBuffer.length >= retentionNum) { + progressBuffer.dequeue() + } + } + + def recentProgress: Array[StreamingQueryProgress] = progressBuffer.synchronized { + progressBuffer.toArray + } + + def lastProgress: StreamingQueryProgress = progressBuffer.synchronized { + progressBuffer.lastOption.orNull + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala new file mode 100644 index 0000000000000..bb097ffc06912 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/StreamingQueryTab.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.streaming.ui + +import org.apache.spark.internal.Logging +import org.apache.spark.ui.{SparkUI, SparkUITab} + +private[sql] class StreamingQueryTab( + val statusListener: StreamingQueryStatusListener, + sparkUI: SparkUI) extends SparkUITab(sparkUI, "StreamingQuery") with Logging { + + override val name = "Structured Streaming" + + val parent = sparkUI + + attachPage(new StreamingQueryPage(this)) + attachPage(new StreamingQueryStatisticsPage(this)) + parent.attachTab(this) + + parent.addStaticHandler(StreamingQueryTab.STATIC_RESOURCE_DIR, "/static/sql") +} + +private[sql] object StreamingQueryTab { + private val STATIC_RESOURCE_DIR = "org/apache/spark/sql/execution/ui/static" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala new file mode 100644 index 0000000000000..57b9dec81f28a --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/ui/UIUtils.scala @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.ui + +private[ui] object UIUtils { + + /** + * Check whether `number` is valid, if not return 0.0d + */ + def withNumberInvalid(number: => Double): Double = { + if (number.isNaN || number.isInfinite) { + 0.0d + } else { + number + } + } + + /** + * Execute a block of code when there is already one completed batch in streaming query, + * otherwise return `default` value. + */ + def withNoProgress[T](query: StreamingQueryUIData, body: => T, default: T): T = { + if (query.lastProgress != null) { + body + } else { + default + } + } + + def getQueryName(query: StreamingQueryUIData): String = { + if (query.name == null || query.name.isEmpty) { + "" + } else { + query.name + } + } + + def getQueryStatus(query: StreamingQueryUIData): String = { + if (query.isActive) { + "RUNNING" + } else { + query.exception.map(_ => "FAILED").getOrElse("FINISHED") + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index f1fe472afdc2a..01f81825f6bfd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -108,6 +108,11 @@ class ExecutionListenerManager private[sql](session: SparkSession, loadExtension listenerBus.removeAllListeners() } + /** Only exposed for testing. 
*/ + private[sql] def listListeners(): Array[QueryExecutionListener] = { + listenerBus.listeners.asScala.toArray + } + /** * Get an identical copy of this listener manager. */ diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala new file mode 100644 index 0000000000000..5fc7123c9097b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/ApiSqlRootResource.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.status.api.v1.sql + +import javax.ws.rs.Path + +import org.apache.spark.status.api.v1.ApiRequestContext + +@Path("/v1") +private[v1] class ApiSqlRootResource extends ApiRequestContext { + + @Path("applications/{appId}/sql") + def sqlList(): Class[SqlResource] = classOf[SqlResource] +} diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala new file mode 100644 index 0000000000000..346e07f2bef15 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/SqlResource.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.status.api.v1.sql + +import java.util.Date +import javax.ws.rs._ +import javax.ws.rs.core.MediaType + +import org.apache.spark.JobExecutionStatus +import org.apache.spark.sql.execution.ui.{SQLAppStatusStore, SQLExecutionUIData, SQLPlanMetric} +import org.apache.spark.status.api.v1.{BaseAppResource, NotFoundException} + +@Produces(Array(MediaType.APPLICATION_JSON)) +private[v1] class SqlResource extends BaseAppResource { + + @GET + def sqlList( + @DefaultValue("false") @QueryParam("details") details: Boolean, + @DefaultValue("0") @QueryParam("offset") offset: Int, + @DefaultValue("20") @QueryParam("length") length: Int): Seq[ExecutionData] = { + withUI { ui => + val sqlStore = new SQLAppStatusStore(ui.store.store) + sqlStore.executionsList(offset, length).map(prepareExecutionData(_, details)) + } + } + + @GET + @Path("{executionId:\\d+}") + def sql( + @PathParam("executionId") execId: Long, + @DefaultValue("false") @QueryParam("details") details: Boolean): ExecutionData = { + withUI { ui => + val sqlStore = new SQLAppStatusStore(ui.store.store) + sqlStore + .execution(execId) + .map(prepareExecutionData(_, details)) + .getOrElse(throw new NotFoundException("unknown id: " + execId)) + } + } + + private def printableMetrics( + metrics: Seq[SQLPlanMetric], + metricValues: Map[Long, String]): Seq[Metrics] = { + metrics.map(metric => + Metrics(metric.name, metricValues.get(metric.accumulatorId).getOrElse(""))) + } + + private def prepareExecutionData(exec: SQLExecutionUIData, details: Boolean): ExecutionData = { + var running = Seq[Int]() + var completed = Seq[Int]() + var failed = Seq[Int]() + + exec.jobs.foreach { + case (id, JobExecutionStatus.RUNNING) => + running = running :+ id + case (id, JobExecutionStatus.SUCCEEDED) => + completed = completed :+ id + case (id, JobExecutionStatus.FAILED) => + failed = failed :+ id + case _ => + } + + val status = if (exec.jobs.size == completed.size) { + "COMPLETED" + } else if (failed.nonEmpty) { 
+ "FAILED" + } else { + "RUNNING" + } + + val duration = exec.completionTime.getOrElse(new Date()).getTime - exec.submissionTime + val planDetails = if (details) exec.physicalPlanDescription else "" + val metrics = if (details) printableMetrics(exec.metrics, exec.metricValues) else Seq.empty + new ExecutionData( + exec.executionId, + status, + exec.description, + planDetails, + metrics, + new Date(exec.submissionTime), + duration, + running, + completed, + failed) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala new file mode 100644 index 0000000000000..7ace66ffb06e1 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/status/api/v1/sql/api.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.status.api.v1.sql + +import java.util.Date + +class ExecutionData private[spark] ( + val id: Long, + val status: String, + val description: String, + val planDescription: String, + val metrics: Seq[Metrics], + val submissionTime: Date, + val duration: Long, + val runningJobIds: Seq[Int], + val successJobIds: Seq[Int], + val failedJobIds: Seq[Int]) + +case class Metrics private[spark] (metricName: String, metricValue: String) diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java index 6ffccee52c0fe..dd3755d3f904e 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/Java8DatasetAggregatorSuite.java @@ -25,43 +25,50 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.KeyValueGroupedDataset; -import org.apache.spark.sql.expressions.javalang.typed; /** * Suite that replicates tests in JavaDatasetAggregatorSuite using lambda syntax. 
*/ public class Java8DatasetAggregatorSuite extends JavaDatasetAggregatorSuiteBase { + @SuppressWarnings("deprecation") @Test public void testTypedAggregationAverage() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.avg(v -> (double)(v._2() * 2))); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.avg(v -> (double)(v._2() * 2))); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 6.0)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationCount() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.count(v -> v)); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.count(v -> v)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 2L), new Tuple2<>("b", 1L)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumDouble() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.sum(v -> (double)v._2())); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.sum(v -> (double)v._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 3.0)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumLong() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.sumLong(v -> (long)v._2())); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.sumLong(v -> (long)v._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3L), new Tuple2<>("b", 3L)), agged.collectAsList()); diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java index 
7bf0789b43d63..5603cb988b8e7 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaBeanDeserializationSuite.java @@ -22,6 +22,10 @@ import java.time.LocalDate; import java.util.*; +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; +import org.junit.*; + import org.apache.spark.sql.*; import org.apache.spark.sql.catalyst.expressions.GenericRow; import org.apache.spark.sql.catalyst.util.DateTimeUtils; @@ -29,7 +33,6 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; -import org.junit.*; import org.apache.spark.sql.test.TestSparkSession; @@ -78,7 +81,7 @@ public void testBeanWithArrayFieldDeserialization() { .as(encoder); List records = dataset.collectAsList(); - Assert.assertEquals(records, ARRAY_RECORDS); + Assert.assertEquals(ARRAY_RECORDS, records); } private static final List MAP_RECORDS = new ArrayList<>(); @@ -121,7 +124,7 @@ public void testBeanWithMapFieldsDeserialization() { List records = dataset.collectAsList(); - Assert.assertEquals(records, MAP_RECORDS); + Assert.assertEquals(MAP_RECORDS, records); } @Test @@ -486,17 +489,17 @@ public int hashCode() { @Override public String toString() { - return com.google.common.base.Objects.toStringHelper(this) - .add("shortField", shortField) - .add("intField", intField) - .add("longField", longField) - .add("floatField", floatField) - .add("doubleField", doubleField) - .add("stringField", stringField) - .add("booleanField", booleanField) - .add("timestampField", timestampField) - .add("nullIntField", nullIntField) - .toString(); + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("shortField", shortField) + .append("intField", intField) + .append("longField", longField) + .append("floatField", floatField) + .append("doubleField", 
doubleField) + .append("stringField", stringField) + .append("booleanField", booleanField) + .append("timestampField", timestampField) + .append("nullIntField", nullIntField) + .toString(); } } @@ -584,11 +587,12 @@ public int hashCode() { @Override public String toString() { - return com.google.common.base.Objects.toStringHelper(this) - .add("localDateField", localDateField) - .add("instantField", instantField) - .toString(); + return new ToStringBuilder(this, ToStringStyle.SHORT_PREFIX_STYLE) + .append("localDateField", localDateField) + .append("instantField", instantField) + .toString(); } + } private static Row createLocalDateInstantRow(Long index) { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index a05afa4f6ba30..f4bffd9d79828 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -467,10 +467,11 @@ public void testBeanWithoutGetter() { BeanWithoutGetter bean = new BeanWithoutGetter(); List data = Arrays.asList(bean); Dataset df = spark.createDataFrame(data, BeanWithoutGetter.class); - Assert.assertEquals(df.schema().length(), 0); - Assert.assertEquals(df.collectAsList().size(), 1); + Assert.assertEquals(0, df.schema().length()); + Assert.assertEquals(1, df.collectAsList().size()); } + @SuppressWarnings("deprecation") @Test public void testJsonRDDToDataFrame() { // This is a test for the deprecated API in SPARK-15615. 
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameWriterV2Suite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameWriterV2Suite.java new file mode 100644 index 0000000000000..e418958bef94d --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameWriterV2Suite.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException; +import org.apache.spark.sql.connector.InMemoryTableCatalog; +import org.apache.spark.sql.test.TestSparkSession; +import org.apache.spark.sql.types.StructType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.apache.spark.sql.functions.*; + +public class JavaDataFrameWriterV2Suite { + private static StructType schema = new StructType().add("s", "string"); + private SparkSession spark = null; + + public Dataset df() { + return spark.read().schema(schema).text(); + } + + @Before + public void createTestTable() { + this.spark = new TestSparkSession(); + spark.conf().set("spark.sql.catalog.testcat", InMemoryTableCatalog.class.getName()); + spark.sql("CREATE TABLE testcat.t (s string) USING foo"); + } + + @After + public void dropTestTable() { + spark.sql("DROP TABLE testcat.t"); + spark.stop(); + } + + @Test + public void testAppendAPI() throws NoSuchTableException { + df().writeTo("testcat.t").append(); + df().writeTo("testcat.t").option("property", "value").append(); + } + + @Test + public void testOverwritePartitionsAPI() throws NoSuchTableException { + df().writeTo("testcat.t").overwritePartitions(); + df().writeTo("testcat.t").option("property", "value").overwritePartitions(); + } + + @Test + public void testOverwriteAPI() throws NoSuchTableException { + df().writeTo("testcat.t").overwrite(lit(true)); + df().writeTo("testcat.t").option("property", "value").overwrite(lit(true)); + } + + @Test + public void testCreateAPI() throws TableAlreadyExistsException { + df().writeTo("testcat.t2").create(); + spark.sql("DROP TABLE testcat.t2"); + + 
df().writeTo("testcat.t2").option("property", "value").create(); + spark.sql("DROP TABLE testcat.t2"); + + df().writeTo("testcat.t2").tableProperty("property", "value").create(); + spark.sql("DROP TABLE testcat.t2"); + + df().writeTo("testcat.t2").using("v2format").create(); + spark.sql("DROP TABLE testcat.t2"); + + df().writeTo("testcat.t2").partitionedBy(col("s")).create(); + spark.sql("DROP TABLE testcat.t2"); + } + + @Test + public void testReplaceAPI() throws CannotReplaceMissingTableException { + df().writeTo("testcat.t").replace(); + df().writeTo("testcat.t").option("property", "value").replace(); + df().writeTo("testcat.t").tableProperty("property", "value").replace(); + df().writeTo("testcat.t").using("v2format").replace(); + df().writeTo("testcat.t").partitionedBy(col("s")).replace(); + } + + @Test + public void testCreateOrReplaceAPI() { + df().writeTo("testcat.t").createOrReplace(); + df().writeTo("testcat.t").option("property", "value").createOrReplace(); + df().writeTo("testcat.t").tableProperty("property", "value").createOrReplace(); + df().writeTo("testcat.t").using("v2format").createOrReplace(); + df().writeTo("testcat.t").partitionedBy(col("s")).createOrReplace(); + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java index 539976d5af469..8a90624f2070b 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetAggregatorSuite.java @@ -29,7 +29,6 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.KeyValueGroupedDataset; import org.apache.spark.sql.expressions.Aggregator; -import org.apache.spark.sql.expressions.javalang.typed; /** * Suite for testing the aggregate functionality of Datasets in Java. 
@@ -85,37 +84,45 @@ public Encoder outputEncoder() { } } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationAverage() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.avg(value -> value._2() * 2.0)); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.avg(value -> value._2() * 2.0)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 6.0)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationCount() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.count(value -> value)); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.count(value -> value)); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 2L), new Tuple2<>("b", 1L)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumDouble() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.sum(value -> (double) value._2())); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.sum(value -> (double) value._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3.0), new Tuple2<>("b", 3.0)), agged.collectAsList()); } + @SuppressWarnings("deprecation") @Test public void testTypedAggregationSumLong() { KeyValueGroupedDataset> grouped = generateGroupedDataset(); - Dataset> agged = grouped.agg(typed.sumLong(value -> (long) value._2())); + Dataset> agged = grouped.agg( + org.apache.spark.sql.expressions.javalang.typed.sumLong(value -> (long) value._2())); Assert.assertEquals( Arrays.asList(new Tuple2<>("a", 3L), new Tuple2<>("b", 3L)), agged.collectAsList()); diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java index 
1e5f55e494b70..d8462ae064dcf 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java @@ -853,7 +853,7 @@ public void testRuntimeNullabilityCheck() { NestedSmallBean nestedSmallBean = new NestedSmallBean(); nestedSmallBean.setF(smallBean); - Assert.assertEquals(ds.collectAsList(), Collections.singletonList(nestedSmallBean)); + Assert.assertEquals(Collections.singletonList(nestedSmallBean), ds.collectAsList()); } // Shouldn't throw runtime exception when parent object (`ClassData`) is null @@ -864,7 +864,7 @@ public void testRuntimeNullabilityCheck() { Dataset ds = df.as(Encoders.bean(NestedSmallBean.class)); NestedSmallBean nestedSmallBean = new NestedSmallBean(); - Assert.assertEquals(ds.collectAsList(), Collections.singletonList(nestedSmallBean)); + Assert.assertEquals(Collections.singletonList(nestedSmallBean), ds.collectAsList()); } nullabilityCheck.expect(RuntimeException.class); @@ -1384,7 +1384,7 @@ public void testBeanWithEnum() { new BeanWithEnum(MyEnum.B, "flower boulevard")); Encoder encoder = Encoders.bean(BeanWithEnum.class); Dataset ds = spark.createDataset(data, encoder); - Assert.assertEquals(ds.collectAsList(), data); + Assert.assertEquals(data, ds.collectAsList()); } public static class EmptyBean implements Serializable {} @@ -1394,8 +1394,8 @@ public void testEmptyBean() { EmptyBean bean = new EmptyBean(); List data = Arrays.asList(bean); Dataset df = spark.createDataset(data, Encoders.bean(EmptyBean.class)); - Assert.assertEquals(df.schema().length(), 0); - Assert.assertEquals(df.collectAsList().size(), 1); + Assert.assertEquals(0, df.schema().length()); + Assert.assertEquals(1, df.collectAsList().size()); } public class CircularReference1Bean implements Serializable { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaHigherOrderFunctionsSuite.java 
b/sql/core/src/test/java/test/org/apache/spark/sql/JavaHigherOrderFunctionsSuite.java new file mode 100644 index 0000000000000..de0acc295b5ea --- /dev/null +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaHigherOrderFunctionsSuite.java @@ -0,0 +1,272 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package test.org.apache.spark.sql; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import static java.util.stream.Collectors.toList; + +import static scala.collection.JavaConverters.mapAsScalaMap; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import static org.apache.spark.sql.functions.*; +import org.apache.spark.sql.test.TestSparkSession; +import org.apache.spark.sql.types.*; +import static org.apache.spark.sql.types.DataTypes.*; + +public class JavaHigherOrderFunctionsSuite { + private transient TestSparkSession spark; + private Dataset arrDf; + private Dataset mapDf; + + private void checkAnswer(Dataset actualDS, List expected) throws Exception { + List actual = actualDS.collectAsList(); + Assert.assertEquals(expected.size(), actual.size()); + for (int i = 0; i < expected.size(); i++) { + Row expectedRow = expected.get(i); + Row actualRow = actual.get(i); + Assert.assertEquals(expectedRow.size(), actualRow.size()); + for (int j = 0; j < expectedRow.size(); j++) { + Object expectedValue = expectedRow.get(j); + Object actualValue = actualRow.get(j); + if (expectedValue != null && expectedValue.getClass().isArray()) { + actualValue = actualValue.getClass().getMethod("array").invoke(actualValue); + Assert.assertArrayEquals((Object[]) expectedValue, (Object[]) actualValue); + } else { + Assert.assertEquals(expectedValue, actualValue); + } + } + } + } + + @SafeVarargs + private static List toRows(T... objs) { + return Arrays.stream(objs) + .map(RowFactory::create) + .collect(toList()); + } + + @SafeVarargs + private static T[] makeArray(T... 
ts) { + return ts; + } + + private void setUpArrDf() { + List data = toRows( + makeArray(1, 9, 8, 7), + makeArray(5, 8, 9, 7, 2), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ); + StructType schema = new StructType() + .add("x", new ArrayType(IntegerType, true), true); + arrDf = spark.createDataFrame(data, schema); + } + + private void setUpMapDf() { + List data = toRows( + new HashMap() {{ + put(1, 1); + put(2, 2); + }}, + null + ); + StructType schema = new StructType() + .add("x", new MapType(IntegerType, IntegerType, true)); + mapDf = spark.createDataFrame(data, schema); + } + + @Before + public void setUp() { + spark = new TestSparkSession(); + setUpArrDf(); + setUpMapDf(); + } + + @After + public void tearDown() { + spark.stop(); + spark = null; + } + + @Test + public void testTransform() throws Exception { + checkAnswer( + arrDf.select(transform(col("x"), x -> x.plus(1))), + toRows( + makeArray(2, 10, 9, 8), + makeArray(6, 9, 10, 8, 3), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ) + ); + checkAnswer( + arrDf.select(transform(col("x"), (x, i) -> x.plus(i))), + toRows( + makeArray(1, 10, 10, 10), + makeArray(5, 9, 11, 10, 6), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ) + ); + } + + @Test + public void testFilter() throws Exception { + checkAnswer( + arrDf.select(filter(col("x"), x -> x.plus(1).equalTo(10))), + toRows( + makeArray(9), + makeArray(9), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ) + ); + checkAnswer( + arrDf.select(filter(col("x"), (x, i) -> x.plus(i).equalTo(10))), + toRows( + makeArray(9, 8, 7), + makeArray(7), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ) + ); + } + + @Test + public void testExists() throws Exception { + checkAnswer( + arrDf.select(exists(col("x"), x -> x.plus(1).equalTo(10))), + toRows( + true, + true, + false, + null + ) + ); + } + + @Test + public void testForall() throws Exception { + checkAnswer( + arrDf.select(forall(col("x"), x -> x.plus(1).equalTo(10))), + toRows( + 
false, + false, + true, + null + ) + ); + } + + @Test + public void testAggregate() throws Exception { + checkAnswer( + arrDf.select(aggregate(col("x"), lit(0), (acc, x) -> acc.plus(x))), + toRows( + 25, + 31, + 0, + null + ) + ); + checkAnswer( + arrDf.select(aggregate(col("x"), lit(0), (acc, x) -> acc.plus(x), x -> x)), + toRows( + 25, + 31, + 0, + null + ) + ); + } + + @Test + public void testZipWith() throws Exception { + checkAnswer( + arrDf.select(zip_with(col("x"), col("x"), (a, b) -> lit(42))), + toRows( + makeArray(42, 42, 42, 42), + makeArray(42, 42, 42, 42, 42), + JavaHigherOrderFunctionsSuite.makeArray(), + null + ) + ); + } + + @Test + public void testTransformKeys() throws Exception { + checkAnswer( + mapDf.select(transform_keys(col("x"), (k, v) -> k.plus(v))), + toRows( + mapAsScalaMap(new HashMap() {{ + put(2, 1); + put(4, 2); + }}), + null + ) + ); + } + + @Test + public void testTransformValues() throws Exception { + checkAnswer( + mapDf.select(transform_values(col("x"), (k, v) -> k.plus(v))), + toRows( + mapAsScalaMap(new HashMap() {{ + put(1, 2); + put(2, 4); + }}), + null + ) + ); + } + + @Test + public void testMapFilter() throws Exception { + checkAnswer( + mapDf.select(map_filter(col("x"), (k, v) -> lit(false))), + toRows( + mapAsScalaMap(new HashMap()), + null + ) + ); + } + + @Test + public void testMapZipWith() throws Exception { + checkAnswer( + mapDf.select(map_zip_with(col("x"), col("x"), (k, v1, v2) -> lit(false))), + toRows( + mapAsScalaMap(new HashMap() {{ + put(1, false); + put(2, false); + }}), + null + ) + ); + } +} diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaSaveLoadSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaSaveLoadSuite.java index 127d272579a62..e2a69d55337bc 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaSaveLoadSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaSaveLoadSuite.java @@ -25,7 +25,6 @@ import java.util.Map; import org.junit.After; 
-import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -43,10 +42,7 @@ public class JavaSaveLoadSuite { Dataset df; private static void checkAnswer(Dataset actual, List expected) { - String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); - if (errorMessage != null) { - Assert.fail(errorMessage); - } + QueryTest$.MODULE$.checkAnswer(actual, expected); } @Before diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2.java similarity index 93% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2.java index 255a9f887878b..1a55d198361ee 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaAdvancedDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaAdvancedDataSourceV2.java @@ -15,22 +15,22 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import java.io.IOException; import java.util.*; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.read.*; import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.sources.GreaterThan; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class JavaAdvancedDataSourceV2 implements TableProvider { +public class JavaAdvancedDataSourceV2 implements TestingV2Source { @Override public Table getTable(CaseInsensitiveStringMap options) { @@ -45,7 +45,7 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { static class AdvancedScanBuilder implements ScanBuilder, Scan, SupportsPushDownFilters, SupportsPushDownRequiredColumns { - private StructType requiredSchema = new StructType().add("i", "int").add("j", "int"); + private StructType requiredSchema = TestingV2Source.schema(); private Filter[] filters = new Filter[0]; @Override diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaColumnarDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaColumnarDataSourceV2.java similarity index 88% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaColumnarDataSourceV2.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaColumnarDataSourceV2.java index 699859cfaebe1..2f10c84c999f9 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaColumnarDataSourceV2.java +++ 
b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaColumnarDataSourceV2.java @@ -15,22 +15,25 @@ * limitations under the License. */ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import java.io.IOException; import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.PartitionReader; +import org.apache.spark.sql.connector.read.PartitionReaderFactory; +import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.*; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.sql.vectorized.ColumnarBatch; -public class JavaColumnarDataSourceV2 implements TableProvider { +public class JavaColumnarDataSourceV2 implements TestingV2Source { class MyScanBuilder extends JavaSimpleScanBuilder { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaPartitionAwareDataSource.java similarity index 84% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaPartitionAwareDataSource.java index 391af5a306a16..9c1db7a379602 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaPartitionAwareDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaPartitionAwareDataSource.java @@ -15,24 +15,24 @@ * limitations 
under the License. */ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import java.io.IOException; import java.util.Arrays; -import org.apache.spark.sql.catalog.v2.expressions.Expressions; -import org.apache.spark.sql.catalog.v2.expressions.Transform; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.*; -import org.apache.spark.sql.sources.v2.reader.partitioning.ClusteredDistribution; -import org.apache.spark.sql.sources.v2.reader.partitioning.Distribution; -import org.apache.spark.sql.sources.v2.reader.partitioning.Partitioning; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.expressions.Expressions; +import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.read.*; +import org.apache.spark.sql.connector.read.partitioning.ClusteredDistribution; +import org.apache.spark.sql.connector.read.partitioning.Distribution; +import org.apache.spark.sql.connector.read.partitioning.Partitioning; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class JavaPartitionAwareDataSource implements TableProvider { +public class JavaPartitionAwareDataSource implements TestingV2Source { class MyScanBuilder extends JavaSimpleScanBuilder implements SupportsReportPartitioning { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaRangeInputPartition.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaRangeInputPartition.java similarity index 90% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaRangeInputPartition.java rename to 
sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaRangeInputPartition.java index 438f489a3eea7..d612441201e64 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaRangeInputPartition.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaRangeInputPartition.java @@ -15,9 +15,9 @@ * limitations under the License. */ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; -import org.apache.spark.sql.sources.v2.reader.InputPartition; +import org.apache.spark.sql.connector.read.InputPartition; class JavaRangeInputPartition implements InputPartition { int start; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaReportStatisticsDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaReportStatisticsDataSource.java similarity index 79% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaReportStatisticsDataSource.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaReportStatisticsDataSource.java index f3755e18b58d5..9a787c3d2d92c 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaReportStatisticsDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaReportStatisticsDataSource.java @@ -15,19 +15,19 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import java.util.OptionalLong; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.InputPartition; -import org.apache.spark.sql.sources.v2.reader.ScanBuilder; -import org.apache.spark.sql.sources.v2.reader.Statistics; -import org.apache.spark.sql.sources.v2.reader.SupportsReportStatistics; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.ScanBuilder; +import org.apache.spark.sql.connector.read.Statistics; +import org.apache.spark.sql.connector.read.SupportsReportStatistics; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class JavaReportStatisticsDataSource implements TableProvider { +public class JavaReportStatisticsDataSource implements TestingV2Source { class MyScanBuilder extends JavaSimpleScanBuilder implements SupportsReportStatistics { @Override public Statistics estimateStatistics() { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSchemaRequiredDataSource.java similarity index 73% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSchemaRequiredDataSource.java index 3800a94f88898..5f73567ade025 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSchemaRequiredDataSource.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSchemaRequiredDataSource.java @@ -15,11 +15,15 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.*; +import java.util.Map; + +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.catalog.TableProvider; +import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -45,7 +49,18 @@ public InputPartition[] planInputPartitions() { } @Override - public Table getTable(CaseInsensitiveStringMap options, StructType schema) { + public boolean supportsExternalMetadata() { + return true; + } + + @Override + public StructType inferSchema(CaseInsensitiveStringMap options) { + throw new IllegalArgumentException("requires a user-supplied schema"); + } + + @Override + public Table getTable( + StructType schema, Transform[] partitioning, Map properties) { return new JavaSimpleBatchTable() { @Override @@ -59,9 +74,4 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) { } }; } - - @Override - public Table getTable(CaseInsensitiveStringMap options) { - throw new IllegalArgumentException("requires a user-supplied schema"); - } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleBatchTable.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleBatchTable.java similarity index 81% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleBatchTable.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleBatchTable.java index 64663d5db4bed..71cf97b56fe54 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleBatchTable.java +++ 
b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleBatchTable.java @@ -15,15 +15,16 @@ * limitations under the License. */ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import java.util.Arrays; import java.util.HashSet; import java.util.Set; -import org.apache.spark.sql.sources.v2.SupportsRead; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableCapability; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.catalog.SupportsRead; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.types.StructType; abstract class JavaSimpleBatchTable implements Table, SupportsRead { @@ -34,7 +35,7 @@ abstract class JavaSimpleBatchTable implements Table, SupportsRead { @Override public StructType schema() { - return new StructType().add("i", "int").add("j", "int"); + return TestingV2Source.schema(); } @Override diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleDataSourceV2.java similarity index 81% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleDataSourceV2.java index 7474f36c97f75..8852249d8a01f 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleDataSourceV2.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleDataSourceV2.java @@ -15,14 +15,15 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; -import org.apache.spark.sql.sources.v2.Table; -import org.apache.spark.sql.sources.v2.TableProvider; -import org.apache.spark.sql.sources.v2.reader.*; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.catalog.Table; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.util.CaseInsensitiveStringMap; -public class JavaSimpleDataSourceV2 implements TableProvider { +public class JavaSimpleDataSourceV2 implements TestingV2Source { class MyScanBuilder extends JavaSimpleScanBuilder { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleReaderFactory.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleReaderFactory.java similarity index 86% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleReaderFactory.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleReaderFactory.java index 740279033c416..0c702031a939b 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleReaderFactory.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleReaderFactory.java @@ -15,13 +15,13 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; -import org.apache.spark.sql.sources.v2.reader.InputPartition; -import org.apache.spark.sql.sources.v2.reader.PartitionReader; -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.PartitionReader; +import org.apache.spark.sql.connector.read.PartitionReaderFactory; class JavaSimpleReaderFactory implements PartitionReaderFactory { diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleScanBuilder.java b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleScanBuilder.java similarity index 77% rename from sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleScanBuilder.java rename to sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleScanBuilder.java index 217e66950d146..bdd9dd3ea0ce0 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/sources/v2/JavaSimpleScanBuilder.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/connector/JavaSimpleScanBuilder.java @@ -15,12 +15,13 @@ * limitations under the License. 
*/ -package test.org.apache.spark.sql.sources.v2; +package test.org.apache.spark.sql.connector; -import org.apache.spark.sql.sources.v2.reader.Batch; -import org.apache.spark.sql.sources.v2.reader.PartitionReaderFactory; -import org.apache.spark.sql.sources.v2.reader.Scan; -import org.apache.spark.sql.sources.v2.reader.ScanBuilder; +import org.apache.spark.sql.connector.TestingV2Source; +import org.apache.spark.sql.connector.read.Batch; +import org.apache.spark.sql.connector.read.PartitionReaderFactory; +import org.apache.spark.sql.connector.read.Scan; +import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.types.StructType; abstract class JavaSimpleScanBuilder implements ScanBuilder, Scan, Batch { @@ -37,7 +38,7 @@ public Batch toBatch() { @Override public StructType readSchema() { - return new StructType().add("i", "int").add("j", "int"); + return TestingV2Source.schema(); } @Override diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java index 92dabc79d2bff..564e76737ecde 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/execution/sort/RecordBinaryComparatorSuite.java @@ -33,6 +33,7 @@ import org.apache.spark.util.collection.unsafe.sort.*; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -81,14 +82,14 @@ private void insertRow(UnsafeRow row) { int recordLength = row.getSizeInBytes(); Object baseObject = dataPage.getBaseObject(); - assert(pageCursor + recordLength <= dataPage.getBaseOffset() + dataPage.size()); + Assert.assertTrue(pageCursor + recordLength <= dataPage.getBaseOffset() + dataPage.size()); long recordAddress = memoryManager.encodePageNumberAndOffset(dataPage, pageCursor); 
UnsafeAlignedOffset.putSize(baseObject, pageCursor, recordLength); pageCursor += uaoSize; Platform.copyMemory(recordBase, recordOffset, baseObject, pageCursor, recordLength); pageCursor += recordLength; - assert(pos < 2); + Assert.assertTrue(pos < 2); array.set(pos, recordAddress); pos++; } @@ -141,8 +142,8 @@ public void testBinaryComparatorForSingleColumnRow() throws Exception { insertRow(row1); insertRow(row2); - assert(compare(0, 0) == 0); - assert(compare(0, 1) < 0); + Assert.assertEquals(0, compare(0, 0)); + Assert.assertTrue(compare(0, 1) < 0); } @Test @@ -166,8 +167,8 @@ public void testBinaryComparatorForMultipleColumnRow() throws Exception { insertRow(row1); insertRow(row2); - assert(compare(0, 0) == 0); - assert(compare(0, 1) < 0); + Assert.assertEquals(0, compare(0, 0)); + Assert.assertTrue(compare(0, 1) < 0); } @Test @@ -193,8 +194,8 @@ public void testBinaryComparatorForArrayColumn() throws Exception { insertRow(row1); insertRow(row2); - assert(compare(0, 0) == 0); - assert(compare(0, 1) > 0); + Assert.assertEquals(0, compare(0, 0)); + Assert.assertTrue(compare(0, 1) > 0); } @Test @@ -226,8 +227,8 @@ public void testBinaryComparatorForMixedColumns() throws Exception { insertRow(row1); insertRow(row2); - assert(compare(0, 0) == 0); - assert(compare(0, 1) > 0); + Assert.assertEquals(0, compare(0, 0)); + Assert.assertTrue(compare(0, 1) > 0); } @Test @@ -252,8 +253,8 @@ public void testBinaryComparatorForNullColumns() throws Exception { insertRow(row1); insertRow(row2); - assert(compare(0, 0) == 0); - assert(compare(0, 1) > 0); + Assert.assertEquals(0, compare(0, 0)); + Assert.assertTrue(compare(0, 1) > 0); } @Test @@ -273,7 +274,7 @@ public void testBinaryComparatorWhenSubtractionIsDivisibleByMaxIntValue() throws insertRow(row1); insertRow(row2); - assert(compare(0, 1) < 0); + Assert.assertTrue(compare(0, 1) > 0); } @Test @@ -293,7 +294,7 @@ public void testBinaryComparatorWhenSubtractionCanOverflowLongValue() throws Exc insertRow(row1); insertRow(row2); 
- assert(compare(0, 1) < 0); + Assert.assertTrue(compare(0, 1) < 0); } @Test @@ -319,6 +320,50 @@ public void testBinaryComparatorWhenOnlyTheLastColumnDiffers() throws Exception insertRow(row1); insertRow(row2); - assert(compare(0, 1) < 0); + Assert.assertTrue(compare(0, 1) < 0); + } + + @Test + public void testCompareLongsAsLittleEndian() { + long arrayOffset = Platform.LONG_ARRAY_OFFSET + 4; + + long[] arr1 = new long[2]; + Platform.putLong(arr1, arrayOffset, 0x0100000000000000L); + long[] arr2 = new long[2]; + Platform.putLong(arr2, arrayOffset + 4, 0x0000000000000001L); + // leftBaseOffset is not aligned while rightBaseOffset is aligned, + // it will start by comparing long + int result1 = binaryComparator.compare(arr1, arrayOffset, 8, arr2, arrayOffset + 4, 8); + + long[] arr3 = new long[2]; + Platform.putLong(arr3, arrayOffset, 0x0100000000000000L); + long[] arr4 = new long[2]; + Platform.putLong(arr4, arrayOffset, 0x0000000000000001L); + // both left and right offset is not aligned, it will start with byte-by-byte comparison + int result2 = binaryComparator.compare(arr3, arrayOffset, 8, arr4, arrayOffset, 8); + + Assert.assertEquals(result1, result2); + } + + @Test + public void testCompareLongsAsUnsigned() { + long arrayOffset = Platform.LONG_ARRAY_OFFSET + 4; + + long[] arr1 = new long[2]; + Platform.putLong(arr1, arrayOffset + 4, 0xa000000000000000L); + long[] arr2 = new long[2]; + Platform.putLong(arr2, arrayOffset + 4, 0x0000000000000000L); + // both leftBaseOffset and rightBaseOffset are aligned, so it will start by comparing long + int result1 = binaryComparator.compare(arr1, arrayOffset + 4, 8, arr2, arrayOffset + 4, 8); + + long[] arr3 = new long[2]; + Platform.putLong(arr3, arrayOffset, 0xa000000000000000L); + long[] arr4 = new long[2]; + Platform.putLong(arr4, arrayOffset, 0x0000000000000000L); + // both leftBaseOffset and rightBaseOffset are not aligned, + // so it will start with byte-by-byte comparison + int result2 = 
binaryComparator.compare(arr3, arrayOffset, 8, arr4, arrayOffset, 8); + + Assert.assertEquals(result1, result2); } } diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java index 48cdb2642d830..5903623847f52 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/streaming/JavaDataStreamReaderWriterSuite.java @@ -18,6 +18,7 @@ package test.org.apache.spark.sql.streaming; import java.io.File; +import java.util.concurrent.TimeoutException; import org.junit.After; import org.junit.Before; @@ -52,7 +53,7 @@ public void tearDown() { } @Test - public void testForeachBatchAPI() { + public void testForeachBatchAPI() throws TimeoutException { StreamingQuery query = spark .readStream() .textFile(input) @@ -66,7 +67,7 @@ public void call(Dataset v1, Long v2) throws Exception {} } @Test - public void testForeachAPI() { + public void testForeachAPI() throws TimeoutException { StreamingQuery query = spark .readStream() .textFile(input) diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql new file mode 100644 index 0000000000000..d190f38345d6b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/decimalArithmeticOperations.sql @@ -0,0 +1,32 @@ +-- SPARK-23179: SQL ANSI 2011 states that in case of overflow during arithmetic operations, +-- an exception should be thrown instead of returning NULL. +-- This is what most of the SQL DBs do (eg. SQLServer, DB2). 
+ +-- tests for decimals handling in operations +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet; + +insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789); + +-- test operations between decimals and constants +select id, a*10, b/10 from decimals_test order by id; + +-- test operations on constants +select 10.3 * 3.0; +select 10.3000 * 3.0; +select 10.30000 * 30.0; +select 10.300000000000000000 * 3.000000000000000000; +select 10.300000000000000000 * 3.0000000000000000000; + +-- arithmetic operations causing an overflow throw exception +select (5e36BD + 0.1) + 5e36BD; +select (-4e36BD - 0.1) - 7e36BD; +select 12345678901234567890.0 * 12345678901234567890.0; +select 1e35BD / 0.1; + +-- arithmetic operations causing a precision loss throw exception +select 123456789123456789.1234567890 * 1.123456789123456789; +select 123456789123456789.1234567890 * 1.123456789123456789; +select 12345678912345.123456789123 / 0.000000012345678; + +drop table decimals_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/higher-order-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/higher-order-functions.sql new file mode 100644 index 0000000000000..1e2424fe47cad --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/higher-order-functions.sql @@ -0,0 +1 @@ +--IMPORT higher-order-functions.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/interval.sql index f2f4b02c8634b..215ce9658e1ad 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/ansi/interval.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/interval.sql @@ -1,188 +1 @@ --- Turns on ANSI mode -SET spark.sql.parser.ansi.enabled=true; - -select - '1' second, - 2 seconds, - '1' minute, - 2 minutes, - '1' hour, - 2 hours, - '1' day, - 2 days, - '1' 
month, - 2 months, - '1' year, - 2 years; - -select - interval '10-11' year to month, - interval '10' year, - interval '11' month; - -select - '10-11' year to month, - '10' year, - '11' month; - -select - interval '10 9:8:7.987654321' day to second, - interval '10' day, - interval '11' hour, - interval '12' minute, - interval '13' second, - interval '13.123456789' second; - -select - '10 9:8:7.987654321' day to second, - '10' day, - '11' hour, - '12' minute, - '13' second, - '13.123456789' second; - -select map(1, interval 1 day, 2, interval 3 week); - -select map(1, 1 day, 2, 3 week); - --- Interval year-month arithmetic - -create temporary view interval_arithmetic as - select CAST(dateval AS date), CAST(tsval AS timestamp) from values - ('2012-01-01', '2012-01-01') - as interval_arithmetic(dateval, tsval); - -select - dateval, - dateval - interval '2-2' year to month, - dateval - interval '-2-2' year to month, - dateval + interval '2-2' year to month, - dateval + interval '-2-2' year to month, - - interval '2-2' year to month + dateval, - interval '2-2' year to month + dateval -from interval_arithmetic; - -select - dateval, - dateval - '2-2' year to month, - dateval - '-2-2' year to month, - dateval + '2-2' year to month, - dateval + '-2-2' year to month, - - '2-2' year to month + dateval, - '2-2' year to month + dateval -from interval_arithmetic; - -select - tsval, - tsval - interval '2-2' year to month, - tsval - interval '-2-2' year to month, - tsval + interval '2-2' year to month, - tsval + interval '-2-2' year to month, - - interval '2-2' year to month + tsval, - interval '2-2' year to month + tsval -from interval_arithmetic; - -select - tsval, - tsval - '2-2' year to month, - tsval - '-2-2' year to month, - tsval + '2-2' year to month, - tsval + '-2-2' year to month, - - '2-2' year to month + tsval, - '2-2' year to month + tsval -from interval_arithmetic; - -select - interval '2-2' year to month + interval '3-3' year to month, - interval '2-2' year to month 
- interval '3-3' year to month -from interval_arithmetic; - -select - '2-2' year to month + '3-3' year to month, - '2-2' year to month - '3-3' year to month -from interval_arithmetic; - --- Interval day-time arithmetic - -select - dateval, - dateval - interval '99 11:22:33.123456789' day to second, - dateval - interval '-99 11:22:33.123456789' day to second, - dateval + interval '99 11:22:33.123456789' day to second, - dateval + interval '-99 11:22:33.123456789' day to second, - -interval '99 11:22:33.123456789' day to second + dateval, - interval '99 11:22:33.123456789' day to second + dateval -from interval_arithmetic; - -select - dateval, - dateval - '99 11:22:33.123456789' day to second, - dateval - '-99 11:22:33.123456789' day to second, - dateval + '99 11:22:33.123456789' day to second, - dateval + '-99 11:22:33.123456789' day to second, - - '99 11:22:33.123456789' day to second + dateval, - '99 11:22:33.123456789' day to second + dateval -from interval_arithmetic; - -select - tsval, - tsval - interval '99 11:22:33.123456789' day to second, - tsval - interval '-99 11:22:33.123456789' day to second, - tsval + interval '99 11:22:33.123456789' day to second, - tsval + interval '-99 11:22:33.123456789' day to second, - -interval '99 11:22:33.123456789' day to second + tsval, - interval '99 11:22:33.123456789' day to second + tsval -from interval_arithmetic; - -select - tsval, - tsval - '99 11:22:33.123456789' day to second, - tsval - '-99 11:22:33.123456789' day to second, - tsval + '99 11:22:33.123456789' day to second, - tsval + '-99 11:22:33.123456789' day to second, - - '99 11:22:33.123456789' day to second + tsval, - '99 11:22:33.123456789' day to second + tsval -from interval_arithmetic; - -select - interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, - interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second -from interval_arithmetic; - -select - '99 11:22:33.123456789' day to 
second + '10 9:8:7.123456789' day to second, - '99 11:22:33.123456789' day to second - '10 9:8:7.123456789' day to second -from interval_arithmetic; - --- More tests for interval syntax alternatives - -select 30 day; - -select 30 day day; - -select 30 day day day; - -select date '2012-01-01' - 30 day; - -select date '2012-01-01' - 30 day day; - -select date '2012-01-01' - 30 day day day; - -select date '2012-01-01' + '-30' day; - -select date '2012-01-01' + interval '-30' day; - --- Unsupported syntax for intervals - -select date '2012-01-01' + interval (-30) day; - -select date '2012-01-01' + (-30) day; - -create temporary view t as select * from values (1), (2) as t(a); - -select date '2012-01-01' + interval (a + 1) day from t; - -select date '2012-01-01' + (a + 1) day from t; - --- Turns off ANSI mode -SET spark.sql.parser.ansi.enabled=false; +--IMPORT interval.sql \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/ansi/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/ansi/literals.sql new file mode 100644 index 0000000000000..698e8fa886307 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/ansi/literals.sql @@ -0,0 +1,2 @@ +--- malformed interval literal with ansi mode +--IMPORT literals.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql b/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql new file mode 100644 index 0000000000000..5e665e4c0c384 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/bitwise.sql @@ -0,0 +1,70 @@ +-- test cases for bitwise functions + +-- null +select bit_count(null); + +-- boolean +select bit_count(true); +select bit_count(false); + +-- byte/tinyint +select bit_count(cast(1 as tinyint)); +select bit_count(cast(2 as tinyint)); +select bit_count(cast(3 as tinyint)); + +-- short/smallint +select bit_count(1S); +select bit_count(2S); +select bit_count(3S); + +-- int +select bit_count(1); +select bit_count(2); +select bit_count(3); + +-- 
long/bigint +select bit_count(1L); +select bit_count(2L); +select bit_count(3L); + +-- negative num +select bit_count(-1L); + +-- edge value +select bit_count(9223372036854775807L); +select bit_count(-9223372036854775808L); + +-- other illegal arguments +select bit_count("bit count"); +select bit_count('a'); + +-- test for bit_xor +-- +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (2, 3, 4, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4); + +-- empty case +SELECT BIT_XOR(b3) AS n1 FROM bitwise_test where 1 = 0; + +-- null case +SELECT BIT_XOR(b4) AS n1 FROM bitwise_test where b4 is null; + +-- the suffix numbers show the expected answer +SELECT + BIT_XOR(cast(b1 as tinyint)) AS a4, + BIT_XOR(cast(b2 as smallint)) AS b5, + BIT_XOR(b3) AS c2, + BIT_XOR(b4) AS d2, + BIT_XOR(distinct b4) AS e2 +FROM bitwise_test; + +-- group by +SELECT bit_xor(b3) FROM bitwise_test GROUP BY b1 & 1; + +--having +SELECT b1, bit_xor(b2) FROM bitwise_test GROUP BY b1 HAVING bit_and(b2) < 7; + +-- window +SELECT b1, b2, bit_xor(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/cast.sql b/sql/core/src/test/resources/sql-tests/inputs/cast.sql index 8a035f594be54..972ebdd01f61e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cast.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cast.sql @@ -60,3 +60,18 @@ DESC FUNCTION EXTENDED boolean; -- cast string to interval and interval to string SELECT CAST('interval 3 month 1 hour' AS interval); SELECT CAST(interval 3 month 1 hour AS string); + +-- trim string before cast to numeric +select cast(' 1' as tinyint); +select cast(' 1\t' as tinyint); +select cast(' 1' as smallint); +select cast(' 1' as INT); +select cast(' 1' as bigint); +select cast(' 1' as float); +select cast(' 1 ' as DOUBLE); +select cast('1.0 ' as DEC); + +-- trim string before cast to boolean +select cast('\t\t true \n\r ' as boolean); +select cast('\t\n 
false \t\r' as boolean); +select cast('\t\n xyz \t\r' as boolean); diff --git a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql index 6f5ac221ce79c..2b57891cfcbc5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/change-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/change-column.sql @@ -2,56 +2,48 @@ CREATE TABLE test_change(a INT, b STRING, c INT) using parquet; DESC test_change; --- Change column name (not supported yet) -ALTER TABLE test_change CHANGE a a1 INT; +-- ALTER TABLE CHANGE COLUMN must change either type or comment +ALTER TABLE test_change CHANGE a; +DESC test_change; + +-- Change column name (not supported on v1 table) +ALTER TABLE test_change RENAME COLUMN a TO a1; DESC test_change; -- Change column dataType (not supported yet) -ALTER TABLE test_change CHANGE a a STRING; +ALTER TABLE test_change CHANGE a TYPE STRING; DESC test_change; -- Change column position (not supported yet) -ALTER TABLE test_change CHANGE a a INT AFTER b; -ALTER TABLE test_change CHANGE b b STRING FIRST; +ALTER TABLE test_change CHANGE a AFTER b; +ALTER TABLE test_change CHANGE b FIRST; DESC test_change; -- Change column comment -ALTER TABLE test_change CHANGE a a INT COMMENT 'this is column a'; -ALTER TABLE test_change CHANGE b b STRING COMMENT '#*02?`'; -ALTER TABLE test_change CHANGE c c INT COMMENT ''; +ALTER TABLE test_change CHANGE a COMMENT 'this is column a'; +ALTER TABLE test_change CHANGE b COMMENT '#*02?`'; +ALTER TABLE test_change CHANGE c COMMENT ''; DESC test_change; -- Don't change anything. 
-ALTER TABLE test_change CHANGE a a INT COMMENT 'this is column a'; +ALTER TABLE test_change CHANGE a TYPE INT; +ALTER TABLE test_change CHANGE a COMMENT 'this is column a'; DESC test_change; -- Change a invalid column -ALTER TABLE test_change CHANGE invalid_col invalid_col INT; +ALTER TABLE test_change CHANGE invalid_col TYPE INT; DESC test_change; --- Change column name/dataType/position/comment together (not supported yet) -ALTER TABLE test_change CHANGE a a1 STRING COMMENT 'this is column a1' AFTER b; -DESC test_change; - --- Check the behavior with different values of CASE_SENSITIVE -SET spark.sql.caseSensitive=false; -ALTER TABLE test_change CHANGE a A INT COMMENT 'this is column A'; -SET spark.sql.caseSensitive=true; -ALTER TABLE test_change CHANGE a A INT COMMENT 'this is column A1'; +-- Check case insensitivity. +ALTER TABLE test_change CHANGE A COMMENT 'case insensitivity'; DESC test_change; -- Change column can't apply to a temporary/global_temporary view CREATE TEMPORARY VIEW temp_view(a, b) AS SELECT 1, "one"; -ALTER TABLE temp_view CHANGE a a INT COMMENT 'this is column a'; +ALTER TABLE temp_view CHANGE a TYPE INT; CREATE GLOBAL TEMPORARY VIEW global_temp_view(a, b) AS SELECT 1, "one"; -ALTER TABLE global_temp.global_temp_view CHANGE a a INT COMMENT 'this is column a'; - --- Change column in partition spec (not supported yet) -CREATE TABLE partition_table(a INT, b STRING, c INT, d STRING) USING parquet PARTITIONED BY (c, d); -ALTER TABLE partition_table PARTITION (c = 1) CHANGE COLUMN a new_a INT; -ALTER TABLE partition_table CHANGE COLUMN c c INT COMMENT 'this is column C'; +ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT; -- DROP TEST TABLE DROP TABLE test_change; -DROP TABLE partition_table; DROP VIEW global_temp.global_temp_view; diff --git a/sql/core/src/test/resources/sql-tests/inputs/comparator.sql b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql index 3e2447723e576..70af4f75ac431 100644 --- 
a/sql/core/src/test/resources/sql-tests/inputs/comparator.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/comparator.sql @@ -1,3 +1,13 @@ -- binary type select x'00' < x'0f'; select x'00' < x'ff'; + +-- trim string to numeric +select '1 ' = 1Y; +select '\t1 ' = 1Y; +select '1 ' = 1S; +select '1 ' = 1; +select ' 1' = 1L; +select ' 1' = cast(1.0 as float); +select ' 1.0 ' = 1.0D; +select ' 1.0 ' = 1.0BD; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql new file mode 100644 index 0000000000000..b711bf338ab08 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -0,0 +1,2 @@ +--SET spark.sql.legacy.ctePrecedence.enabled = false +--IMPORT cte.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/date_part.sql b/sql/core/src/test/resources/sql-tests/inputs/date_part.sql new file mode 100644 index 0000000000000..a63cdafb745a0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/date_part.sql @@ -0,0 +1,145 @@ +CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c; + +select date_part('millennium', c) from t; +select date_part('millennia', c) from t; +select date_part('mil', c) from t; +select date_part('mils', c) from t; + +select date_part('century', c) from t; +select date_part('centuries', c) from t; +select date_part('c', c) from t; +select date_part('cent', c) from t; + +select date_part('decade', c) from t; +select date_part('decades', c) from t; +select date_part('dec', c) from t; +select date_part('decs', c) from t; + +select date_part('year', c) from t; +select date_part('y', c) from t; +select date_part('years', c) from t; +select date_part('yr', c) from t; +select date_part('yrs', c) from t; + +select date_part('quarter', c) from t; +select date_part('qtr', c) from t; + +select date_part('month', c) from t; +select date_part('mon', c) from t; +select date_part('mons', c) 
from t; +select date_part('months', c) from t; + +select date_part('week', c) from t; +select date_part('w', c) from t; +select date_part('weeks', c) from t; + +select date_part('day', c) from t; +select date_part('d', c) from t; +select date_part('days', c) from t; + +select date_part('dayofweek', c) from t; + +select date_part('dow', c) from t; + +select date_part('isodow', c) from t; + +select date_part('doy', c) from t; + +select date_part('hour', c) from t; +select date_part('h', c) from t; +select date_part('hours', c) from t; +select date_part('hr', c) from t; +select date_part('hrs', c) from t; + +select date_part('minute', c) from t; +select date_part('m', c) from t; +select date_part('min', c) from t; +select date_part('mins', c) from t; +select date_part('minutes', c) from t; + +select date_part('second', c) from t; +select date_part('s', c) from t; +select date_part('sec', c) from t; +select date_part('seconds', c) from t; +select date_part('secs', c) from t; + +select date_part('not_supported', c) from t; + +select date_part(c, c) from t; + +select date_part(null, c) from t; + +CREATE TEMPORARY VIEW t2 AS select interval 1010 year 9 month 8 day 7 hour 6 minute 5 second 4 millisecond 3 microsecond as c; + +select date_part('millennium', c) from t2; +select date_part('millennia', c) from t2; +select date_part('mil', c) from t2; +select date_part('mils', c) from t2; + +select date_part('century', c) from t2; +select date_part('centuries', c) from t2; +select date_part('c', c) from t2; +select date_part('cent', c) from t2; + +select date_part('decade', c) from t2; +select date_part('decades', c) from t2; +select date_part('dec', c) from t2; +select date_part('decs', c) from t2; + +select date_part('year', c) from t2; +select date_part('y', c) from t2; +select date_part('years', c) from t2; +select date_part('yr', c) from t2; +select date_part('yrs', c) from t2; + +select date_part('quarter', c) from t2; +select date_part('qtr', c) from t2; + +select 
date_part('month', c) from t2; +select date_part('mon', c) from t2; +select date_part('mons', c) from t2; +select date_part('months', c) from t2; + +select date_part('day', c) from t2; +select date_part('d', c) from t2; +select date_part('days', c) from t2; + +select date_part('hour', c) from t2; +select date_part('h', c) from t2; +select date_part('hours', c) from t2; +select date_part('hr', c) from t2; +select date_part('hrs', c) from t2; + +select date_part('minute', c) from t2; +select date_part('m', c) from t2; +select date_part('min', c) from t2; +select date_part('mins', c) from t2; +select date_part('minutes', c) from t2; + +select date_part('second', c) from t2; +select date_part('s', c) from t2; +select date_part('sec', c) from t2; +select date_part('seconds', c) from t2; +select date_part('secs', c) from t2; + +select date_part('milliseconds', c) from t2; +select date_part('msec', c) from t2; +select date_part('msecs', c) from t2; +select date_part('millisecon', c) from t2; +select date_part('mseconds', c) from t2; +select date_part('ms', c) from t2; + +select date_part('microseconds', c) from t2; +select date_part('usec', c) from t2; +select date_part('usecs', c) from t2; +select date_part('useconds', c) from t2; +select date_part('microsecon', c) from t2; +select date_part('us', c) from t2; + +select date_part('epoch', c) from t2; + +select date_part('not_supported', c) from t2; + +select date_part(c, c) from t2; + +select date_part(null, c) from t2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql index 2f7ffb73e86b8..b14778b91510e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql @@ -30,7 +30,48 @@ select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), week select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01'); -select date '2001-09-28' + 7; 
-select 7 + date '2001-09-28'; + +select date '2019-01-01\t'; +select timestamp '2019-01-01\t'; + +-- time add/sub +select timestamp'2011-11-11 11:11:11' + interval '2' day; +select timestamp'2011-11-11 11:11:11' - interval '2' day; +select date'2011-11-11 11:11:11' + interval '2' second; +select date'2011-11-11 11:11:11' - interval '2' second; +select '2011-11-11' - interval '2' day; +select '2011-11-11 11:11:11' - interval '2' second; +select '1' - interval '2' second; +select 1 - interval '2' second; + +-- subtract timestamps +select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678'; +select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01'; +select timestamp'2019-10-06 10:11:12.345678' - null; +select null - timestamp'2019-10-06 10:11:12.345678'; + +-- date add/sub +select date_add('2011-11-11', 1Y); +select date_add('2011-11-11', 1S); +select date_add('2011-11-11', 1); +select date_add('2011-11-11', 1L); +select date_add('2011-11-11', 1.0); +select date_add('2011-11-11', 1E1); +select date_add('2011-11-11', '1'); +select date_add(date'2011-11-11', 1); +select date_add(timestamp'2011-11-11', 1); +select date_sub(date'2011-11-11', 1); +select date_sub(timestamp'2011-11-11', 1); +select date_sub(null, 1); +select date_sub(date'2011-11-11', null); +select date'2011-11-11' + 1E1; +select null + date '2001-09-28'; +select date '2001-09-28' + 7Y; +select 7S + date '2001-09-28'; select date '2001-10-01' - 7; +select date '2001-09-28' + null; +select date '2001-09-28' - null; + +-- subtract dates +select null - date '2019-10-06'; select date '2001-10-01' - date '2001-09-28'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql index 35f2be46cd130..a3bc282cd6ae8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/decimalArithmeticOperations.sql @@ -43,11 
+43,11 @@ select 10.300000000000000000 * 3.0000000000000000000; select 2.35E10 * 1.0; -- arithmetic operations causing an overflow return NULL -select (5e36 + 0.1) + 5e36; -select (-4e36 - 0.1) - 7e36; +select (5e36BD + 0.1) + 5e36BD; +select (-4e36BD - 0.1) - 7e36BD; select 12345678901234567890.0 * 12345678901234567890.0; -select 1e35 / 0.1; -select 1.2345678901234567890E30 * 1.2345678901234567890E25; +select 1e35BD / 0.1; +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD; -- arithmetic operations causing a precision loss are truncated select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; @@ -72,39 +72,15 @@ select 10.300000000000000000 * 3.0000000000000000000; select 2.35E10 * 1.0; -- arithmetic operations causing an overflow return NULL -select (5e36 + 0.1) + 5e36; -select (-4e36 - 0.1) - 7e36; +select (5e36BD + 0.1) + 5e36BD; +select (-4e36BD - 0.1) - 7e36BD; select 12345678901234567890.0 * 12345678901234567890.0; -select 1e35 / 0.1; -select 1.2345678901234567890E30 * 1.2345678901234567890E25; +select 1e35BD / 0.1; +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD; -- arithmetic operations causing a precision loss return NULL select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345; select 123456789123456789.1234567890 * 1.123456789123456789; select 12345678912345.123456789123 / 0.000000012345678; --- throw an exception instead of returning NULL, according to SQL ANSI 2011 -set spark.sql.decimalOperations.nullOnOverflow=false; - --- test operations between decimals and constants -select id, a*10, b/10 from decimals_test order by id; - --- test operations on constants -select 10.3 * 3.0; -select 10.3000 * 3.0; -select 10.30000 * 30.0; -select 10.300000000000000000 * 3.000000000000000000; -select 10.300000000000000000 * 3.0000000000000000000; - --- arithmetic operations causing an overflow throw exception -select (5e36 + 0.1) + 5e36; -select (-4e36 - 0.1) - 7e36; -select 
12345678901234567890.0 * 12345678901234567890.0; -select 1e35 / 0.1; - --- arithmetic operations causing a precision loss throw exception -select 123456789123456789.1234567890 * 1.123456789123456789; -select 123456789123456789.1234567890 * 1.123456789123456789; -select 12345678912345.123456789123 / 0.000000012345678; - drop table decimals_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql index 2d180d118da7a..821cb473751eb 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/describe-table-column.sql @@ -49,3 +49,16 @@ DROP VIEW desc_col_temp_view; DROP TABLE desc_col_table; DROP TABLE desc_complex_col_table; + +--Test case insensitive + +CREATE TABLE customer(CName STRING); + +INSERT INTO customer VALUES('Maria'); + +ANALYZE TABLE customer COMPUTE STATISTICS FOR COLUMNS cname; + +DESC EXTENDED customer cname; + +DROP TABLE customer; + diff --git a/sql/core/src/test/resources/sql-tests/inputs/explain.sql b/sql/core/src/test/resources/sql-tests/inputs/explain.sql index 773c123992f71..497b61c6134a2 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/explain.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/explain.sql @@ -1,7 +1,11 @@ +--SET spark.sql.codegen.wholeStage = true +--SET spark.sql.adaptive.enabled = false + -- Test tables CREATE table explain_temp1 (key int, val int) USING PARQUET; CREATE table explain_temp2 (key int, val int) USING PARQUET; CREATE table explain_temp3 (key int, val int) USING PARQUET; +CREATE table explain_temp4 (key int, val string) USING PARQUET; SET spark.sql.codegen.wholeStage = true; @@ -58,7 +62,7 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0); @@ -90,6 +94,25 @@ EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM 
explain_temp1; +-- HashAggregate +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1; + +-- ObjectHashAggregate +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key; + +-- SortAggregate +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key; + -- cleanup DROP TABLE explain_temp1; DROP TABLE explain_temp2; diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql new file mode 100644 index 0000000000000..beb5b9e5fe516 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-filter.sql @@ -0,0 +1,132 @@ +-- Test filter clause for aggregate expression. + +-- Test data. +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b); + +CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES + (100, "emp 1", date "2005-01-01", 100.00D, 10), + (100, "emp 1", date "2005-01-01", 100.00D, 10), + (200, "emp 2", date "2003-01-01", 200.00D, 10), + (300, "emp 3", date "2002-01-01", 300.00D, 20), + (400, "emp 4", date "2005-01-01", 400.00D, 30), + (500, "emp 5", date "2001-01-01", 400.00D, NULL), + (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), + (700, "emp 7", date "2010-01-01", 400.00D, 100), + (800, "emp 8", date "2016-01-01", 150.00D, 70) +AS EMP(id, emp_name, hiredate, salary, dept_id); + +CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES + (10, "dept 1", "CA"), + (20, "dept 2", "NY"), + (30, "dept 3", "TX"), + (40, "dept 4 - unassigned", "OR"), + (50, "dept 5 - unassigned", "NJ"), + (70, "dept 7", "FL") +AS DEPT(dept_id, dept_name, state); + +-- Aggregate with filter and empty GroupBy expressions. 
+SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData; +SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData; +SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp; +SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp; +SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp; +SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT COUNT(DISTINCT id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") = "2001-01-01 00:00:00") FROM emp; + +-- Aggregate with filter and non-empty GroupBy expressions. +SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a; +SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b; +SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a; +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp GROUP BY dept_id; +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id; +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id; +SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT dept_id, SUM(DISTINCT salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd HH:mm:ss") > "2001-01-01 00:00:00") FROM emp GROUP BY dept_id; + +-- Aggregate with filter and grouped by literals. 
+SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1; +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM emp GROUP BY 1; +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) FROM emp GROUP BY 1; +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_timestamp("2003-01-01")) FROM emp GROUP BY 1; + +-- Aggregate with filter, more than one aggregate function goes with distinct. +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id; +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id; +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id; +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct hiredate) filter (where hiredate > date "2003-01-01"), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id; +-- select dept_id, count(distinct 
emp_name) filter (where id > 200), count(distinct emp_name), sum(salary) from emp group by dept_id; +-- select dept_id, count(distinct emp_name) filter (where id > 200), count(distinct emp_name) filter (where hiredate > date "2003-01-01"), sum(salary) from emp group by dept_id; + +-- Aggregate with filter and grouped by literals (hash aggregate), here the input table is filtered using WHERE. +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate with filter and grouped by literals (sort aggregate), here the input table is filtered using WHERE. +SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 GROUP BY 1; + +-- Aggregate with filter and complex GroupBy expressions. +SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b; +SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1; +SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1; + +-- Aggregate with filter, foldable input and multiple distinct groups. 
+-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- SELECT COUNT(DISTINCT b) FILTER (WHERE b > 0), COUNT(DISTINCT b, c) FILTER (WHERE b > 0 AND c > 2) +-- FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a; + +-- Check analysis exceptions +SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k; + +-- Aggregate with filter contains exists subquery +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE id > (SELECT 200)) +FROM emp +GROUP BY dept_id; + +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE emp.dept_id = (SELECT dept_id FROM dept LIMIT 1)) +FROM emp +GROUP BY dept_id; + +-- [SPARK-30220] Support Filter expression uses IN/EXISTS predicate sub-queries +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE EXISTS (SELECT state + FROM dept + WHERE dept.dept_id = emp.dept_id)) +FROM emp +GROUP BY dept_id; + +SELECT emp.dept_id, + Sum(salary), + Sum(salary) FILTER (WHERE NOT EXISTS (SELECT state + FROM dept + WHERE dept.dept_id = emp.dept_id)) +FROM emp +GROUP BY dept_id; + +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE emp.dept_id IN (SELECT DISTINCT dept_id + FROM dept)) +FROM emp +GROUP BY dept_id; +SELECT emp.dept_id, + Sum(salary), + Sum(salary) FILTER (WHERE emp.dept_id NOT IN (SELECT DISTINCT dept_id + FROM dept)) +FROM emp +GROUP BY dept_id; + +-- Aggregate with filter is subquery +SELECT t1.b FROM (SELECT COUNT(b) FILTER (WHERE a >= 2) AS b FROM testData) t1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 66bc90914e0d4..fedf03d774e42 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -1,3 +1,8 @@ +-- Test aggregate operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + -- Test data. CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) @@ -90,16 +95,16 @@ CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (5, null), (5, true), (5, false) AS test_agg(k, v); -- empty table -SELECT every(v), some(v), any(v) FROM test_agg WHERE 1 = 0; +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0; -- all null values -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 4; +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4; -- aggregates are null Filtering -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 5; +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5; -- group by -SELECT k, every(v), some(v), any(v) FROM test_agg GROUP BY k; +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k; -- having SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false; @@ -137,10 +142,18 @@ SELECT any(1L); -- input type checking String SELECT every("true"); --- every/some/any aggregates are supported as windows expression. +-- input type checking Decimal +SELECT bool_and(1.0); + +-- input type checking double +SELECT bool_or(1.0D); + +-- every/some/any aggregates/bool_and/bool_or are supported as windows expression. 
SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; SELECT k, v, some(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; SELECT k, v, any(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg; -- Having referencing aggregate expressions is ok. SELECT count(*) FROM test_agg HAVING count(*) > 1L; @@ -153,4 +166,3 @@ SELECT * FROM (SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L; SELECT count(*) FROM test_agg WHERE count(*) > 1L; SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L; SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1; - diff --git a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql index 6bbde9f38d657..d30914fdd92df 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/grouping_set.sql @@ -51,3 +51,9 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE; SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (()); +-- duplicate entries in grouping sets +SELECT k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)); + +SELECT grouping__id, k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)); + +SELECT grouping(k1), k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)); diff --git a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql index 02ad5e3538689..cfa06aea82b04 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/higher-order-functions.sql @@ -1,3 +1,8 @@ 
+-- Test higher order functions with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + create or replace temporary view nested as values (1, array(32, 97), array(array(12, 99), array(123, 42), array(1))), (2, array(77, -76), array(array(6, 96, 65), array(-1, -2))), @@ -83,3 +88,7 @@ select transform_values(ys, (k, v) -> v + 1) as v from nested; -- Transform values in a map using values select transform_values(ys, (k, v) -> k + v) as v from nested; + +-- use non reversed keywords: all is non reversed only if !ansi +select transform(ys, all -> all * all) as v from values (array(32, 97)) as t(ys); +select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys); diff --git a/sql/core/src/test/resources/sql-tests/inputs/inner-join.sql b/sql/core/src/test/resources/sql-tests/inputs/inner-join.sql index 38739cb950582..5623161839331 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/inner-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/inner-join.sql @@ -1,3 +1,15 @@ +-- There are 2 dimensions we want to test +-- 1. run with broadcast hash join, sort merge join or shuffle hash join. +-- 2. run with whole-stage-codegen, operator codegen or no codegen. 
+ +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760 +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +--CONFIG_DIM2 spark.sql.codegen.wholeStage=true +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a); CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a); CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a); diff --git a/sql/core/src/test/resources/sql-tests/inputs/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/interval.sql new file mode 100644 index 0000000000000..a4e621e9639d4 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/interval.sql @@ -0,0 +1,234 @@ +-- test for intervals + +-- multiply and divide an interval by a number +select 3 * (timestamp'2019-10-15 10:11:12.001002' - date'2019-10-15'); +select interval 4 month 2 weeks 3 microseconds * 1.5; +select (timestamp'2019-10-15' - timestamp'2019-10-14') / 1.5; + +-- interval operation with null and zero case +select interval '2 seconds' / 0; +select interval '2 seconds' / null; +select interval '2 seconds' * null; +select null * interval '2 seconds'; + +-- interval with a positive/negative sign +select -interval '-1 month 1 day -1 second'; +select -interval -1 month 1 day -1 second; +select +interval '-1 month 1 day -1 second'; +select +interval -1 month 1 day -1 second; + +-- make intervals +select make_interval(1); +select make_interval(1, 2); +select make_interval(1, 2, 3); +select make_interval(1, 2, 3, 4); +select make_interval(1, 2, 3, 4, 5); +select make_interval(1, 2, 3, 4, 5, 6); +select make_interval(1, 2, 3, 4, 5, 6, 7.008009); + +-- cast string to intervals +select cast('1 second' as 
interval); +select cast('+1 second' as interval); +select cast('-1 second' as interval); +select cast('+ 1 second' as interval); +select cast('- 1 second' as interval); +select cast('- -1 second' as interval); +select cast('- +1 second' as interval); + +-- interval literal +select interval 13.123456789 seconds, interval -13.123456789 second; +select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond 9 microsecond; +select interval '30' year '25' month '-100' day '40' hour '80' minute '299.889987299' second; +select interval '0 0:0:0.1' day to second; +select interval '10-9' year to month; +select interval '20 15' day to hour; +select interval '20 15:40' day to minute; +select interval '20 15:40:32.99899999' day to second; +select interval '15:40' hour to minute; +select interval '15:40:32.99899999' hour to second; +select interval '40:32.99899999' minute to second; +select interval '40:32' minute to second; +select interval 30 day day; + +-- invalid day-time string intervals +select interval '20 15:40:32.99899999' day to hour; +select interval '20 15:40:32.99899999' day to minute; +select interval '15:40:32.99899999' hour to minute; +select interval '15:40.99899999' hour to second; +select interval '15:40' hour to second; +select interval '20 40:32.99899999' minute to second; + +-- ns is not supported +select interval 10 nanoseconds; + +-- map + interval test +select map(1, interval 1 day, 2, interval 3 week); + +-- typed interval expression +select interval 'interval 3 year 1 hour'; +select interval '3 year 1 hour'; + +-- malformed interval literal +select interval; +select interval 1 fake_unit; +select interval 1 year to month; +select interval '1' year to second; +select interval '10-9' year to month '2-1' year to month; +select interval '10-9' year to month '12:11:10' hour to second; +select interval '1 15:11' day to minute '12:11:10' hour to second; +select interval 1 year '2-1' year to month; +select interval 1 year '12:11:10' hour 
to second; +select interval '10-9' year to month '1' year; +select interval '12:11:10' hour to second '1' year; +select interval (-30) day; +select interval (a + 1) day; +select interval 30 day day day; + +-- sum interval values +-- null +select sum(cast(null as interval)); + +-- empty set +select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0; + +-- basic interval sum +select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v); +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v); +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v); +select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v); + +-- group by +select + i, + sum(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i; + +-- having +select + sum(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null; + +-- window +SELECT + i, + sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v); + +-- average with interval type +-- null +select avg(cast(v as interval)) from VALUES (null) t(v); + +-- empty set +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0; + +-- basic interval avg +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v); +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v); +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v); +select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v); + +-- group by +select + i, + avg(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i; + +-- 
having +select + avg(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null; + +-- window +SELECT + i, + avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v); + +-- Interval year-month arithmetic + +create temporary view interval_arithmetic as + select CAST(dateval AS date), CAST(tsval AS timestamp) from values + ('2012-01-01', '2012-01-01') + as interval_arithmetic(dateval, tsval); + +select + dateval, + dateval - interval '2-2' year to month, + dateval - interval '-2-2' year to month, + dateval + interval '2-2' year to month, + dateval + interval '-2-2' year to month, + - interval '2-2' year to month + dateval, + interval '2-2' year to month + dateval +from interval_arithmetic; + +select + tsval, + tsval - interval '2-2' year to month, + tsval - interval '-2-2' year to month, + tsval + interval '2-2' year to month, + tsval + interval '-2-2' year to month, + - interval '2-2' year to month + tsval, + interval '2-2' year to month + tsval +from interval_arithmetic; + +select + interval '2-2' year to month + interval '3-3' year to month, + interval '2-2' year to month - interval '3-3' year to month +from interval_arithmetic; + +-- Interval day-time arithmetic + +select + dateval, + dateval - interval '99 11:22:33.123456789' day to second, + dateval - interval '-99 11:22:33.123456789' day to second, + dateval + interval '99 11:22:33.123456789' day to second, + dateval + interval '-99 11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + dateval, + interval '99 11:22:33.123456789' day to second + dateval +from interval_arithmetic; + +select + tsval, + tsval - interval '99 11:22:33.123456789' day to second, + tsval - interval '-99 11:22:33.123456789' day to second, + tsval + interval '99 11:22:33.123456789' day to second, + tsval + interval '-99 
11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + tsval, + interval '99 11:22:33.123456789' day to second + tsval +from interval_arithmetic; + +select + interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, + interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second +from interval_arithmetic; + +-- control characters as white spaces +select interval '\t interval 1 day'; +select interval 'interval \t 1\tday'; +select interval 'interval\t1\tday'; +select interval '1\t' day; +select interval '1 ' day; + +-- interval overflow if (ansi) exception else NULL +select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); +select a - b from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); +select b + interval '1 month' from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); +select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); +select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b); + +-- interval support for csv and json functions +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval'); +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql b/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql index 2e6a5f362a8fa..8afa3270f4de4 100644 --- 
a/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql @@ -1,8 +1,3 @@ --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false - CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a); CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a); diff --git a/sql/core/src/test/resources/sql-tests/inputs/literals.sql b/sql/core/src/test/resources/sql-tests/inputs/literals.sql index 816386c483209..108cfd766af2c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/literals.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/literals.sql @@ -82,12 +82,6 @@ select tImEstAmp '2016-03-11 20:54:00.000'; -- invalid timestamp select timestamp '2016-33-11 20:54:00.000'; --- interval -select interval 13.123456789 seconds, interval -13.123456789 second; -select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond, 9 microsecond; --- ns is not supported -select interval 10 nanoseconds; - -- unsupported data type select GEO '(10,-6)'; @@ -106,9 +100,15 @@ select X'XuZ'; -- Hive literal_double test. 
SELECT 3.14, -3.14, 3.14e8, 3.14e-8, -3.14e8, -3.14e-8, 3.14e+8, 3.14E8, 3.14E-8; --- map + interval test -select map(1, interval 1 day, 2, interval 3 week); - --- typed interval expression -select interval 'interval 3 year 1 hour'; -select interval '3 year 1 hour'; +-- awareness of the negative/positive sign before type +select +date '1999-01-01'; +select +timestamp '1999-01-01'; +select +interval '1 day'; +select +map(1, 2); +select +array(1,2); +select +named_struct('a', 1, 'b', 'spark'); +select +X'1'; +-- can't negate date/timestamp/binary +select -date '1999-01-01'; +select -timestamp '1999-01-01'; +select -x'2379ACFe'; diff --git a/sql/core/src/test/resources/sql-tests/inputs/misc-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/misc-functions.sql new file mode 100644 index 0000000000000..95f71925e9294 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/misc-functions.sql @@ -0,0 +1,10 @@ +-- test for misc functions + +-- typeof +select typeof(null); +select typeof(true); +select typeof(1Y), typeof(1S), typeof(1), typeof(1L); +select typeof(cast(1.0 as float)), typeof(1.0D), typeof(1.2); +select typeof(date '1986-05-23'), typeof(timestamp '1986-05-23'), typeof(interval '23 days'); +select typeof(x'ABCD'), typeof('SPARK'); +select typeof(array(1, 2)), typeof(map(1, 2)), typeof(named_struct('a', 1, 'b', 'spark')); diff --git a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql index e0abeda3eb44f..71a50157b766c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/natural-join.sql @@ -1,8 +1,3 @@ --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false - create 
temporary view nt1 as select * from values ("one", 1), ("two", 2), diff --git a/sql/core/src/test/resources/sql-tests/inputs/order-by-nulls-ordering.sql b/sql/core/src/test/resources/sql-tests/inputs/order-by-nulls-ordering.sql index f7637b444b9fe..ad3977465c835 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/order-by-nulls-ordering.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/order-by-nulls-ordering.sql @@ -1,3 +1,8 @@ +-- Test sort operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + -- Q1. testing window functions with order by create table spark_10747(col1 int, col2 int, col3 int) using parquet; diff --git a/sql/core/src/test/resources/sql-tests/inputs/outer-join.sql b/sql/core/src/test/resources/sql-tests/inputs/outer-join.sql index ce09c21568f13..ceb438ec34b2d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/outer-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/outer-join.sql @@ -1,7 +1,14 @@ --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false +-- There are 2 dimensions we want to test +-- 1. run with broadcast hash join, sort merge join or shuffle hash join. +-- 2. run with whole-stage-codegen, operator codegen or no codegen. 
+ +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760 +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +--CONFIG_DIM2 spark.sql.codegen.wholeStage=true +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN -- SPARK-17099: Incorrect result when HAVING clause is added to group by query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES @@ -29,9 +36,6 @@ CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1); --- Set the cross join enabled flag for the LEFT JOIN test since there's no join condition. --- Ultimately the join should be optimized away. -set spark.sql.crossJoin.enabled = true; SELECT * FROM ( SELECT @@ -39,6 +43,3 @@ SELECT FROM t1 LEFT JOIN t2 ON false ) t where (t.int_col) is not null; -set spark.sql.crossJoin.enabled = false; - - diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/numeric.sql b/sql/core/src/test/resources/sql-tests/inputs/pgSQL/numeric.sql deleted file mode 100644 index c447a0dc2c7f2..0000000000000 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/numeric.sql +++ /dev/null @@ -1,1096 +0,0 @@ --- --- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group --- --- --- NUMERIC --- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/numeric.sql --- - --- [SPARK-28318] Decimal can only support precision up to 38. We rewrite numeric(210,10) to decimal(38,10). 
-CREATE TABLE num_data (id int, val decimal(38,10)) USING parquet; -CREATE TABLE num_exp_add (id1 int, id2 int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_sub (id1 int, id2 int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_div (id1 int, id2 int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_mul (id1 int, id2 int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_sqrt (id int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_ln (id int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_log10 (id int, expected decimal(38,10)) USING parquet; -CREATE TABLE num_exp_power_10_ln (id int, expected decimal(38,10)) USING parquet; - -CREATE TABLE num_result (id1 int, id2 int, result decimal(38,10)) USING parquet; - - --- ****************************** --- * The following EXPECTED results are computed by bc(1) --- * with a scale of 200 --- ****************************** - --- BEGIN TRANSACTION; -INSERT INTO num_exp_add VALUES (0,0,'0'); -INSERT INTO num_exp_sub VALUES (0,0,'0'); -INSERT INTO num_exp_mul VALUES (0,0,'0'); -INSERT INTO num_exp_div VALUES (0,0,'NaN'); -INSERT INTO num_exp_add VALUES (0,1,'0'); -INSERT INTO num_exp_sub VALUES (0,1,'0'); -INSERT INTO num_exp_mul VALUES (0,1,'0'); -INSERT INTO num_exp_div VALUES (0,1,'NaN'); -INSERT INTO num_exp_add VALUES (0,2,'-34338492.215397047'); -INSERT INTO num_exp_sub VALUES (0,2,'34338492.215397047'); -INSERT INTO num_exp_mul VALUES (0,2,'0'); -INSERT INTO num_exp_div VALUES (0,2,'0'); -INSERT INTO num_exp_add VALUES (0,3,'4.31'); -INSERT INTO num_exp_sub VALUES (0,3,'-4.31'); -INSERT INTO num_exp_mul VALUES (0,3,'0'); -INSERT INTO num_exp_div VALUES (0,3,'0'); -INSERT INTO num_exp_add VALUES (0,4,'7799461.4119'); -INSERT INTO num_exp_sub VALUES (0,4,'-7799461.4119'); -INSERT INTO num_exp_mul VALUES (0,4,'0'); -INSERT INTO num_exp_div VALUES (0,4,'0'); -INSERT INTO num_exp_add VALUES (0,5,'16397.038491'); -INSERT INTO num_exp_sub VALUES 
(0,5,'-16397.038491'); -INSERT INTO num_exp_mul VALUES (0,5,'0'); -INSERT INTO num_exp_div VALUES (0,5,'0'); -INSERT INTO num_exp_add VALUES (0,6,'93901.57763026'); -INSERT INTO num_exp_sub VALUES (0,6,'-93901.57763026'); -INSERT INTO num_exp_mul VALUES (0,6,'0'); -INSERT INTO num_exp_div VALUES (0,6,'0'); -INSERT INTO num_exp_add VALUES (0,7,'-83028485'); -INSERT INTO num_exp_sub VALUES (0,7,'83028485'); -INSERT INTO num_exp_mul VALUES (0,7,'0'); -INSERT INTO num_exp_div VALUES (0,7,'0'); -INSERT INTO num_exp_add VALUES (0,8,'74881'); -INSERT INTO num_exp_sub VALUES (0,8,'-74881'); -INSERT INTO num_exp_mul VALUES (0,8,'0'); -INSERT INTO num_exp_div VALUES (0,8,'0'); -INSERT INTO num_exp_add VALUES (0,9,'-24926804.045047420'); -INSERT INTO num_exp_sub VALUES (0,9,'24926804.045047420'); -INSERT INTO num_exp_mul VALUES (0,9,'0'); -INSERT INTO num_exp_div VALUES (0,9,'0'); -INSERT INTO num_exp_add VALUES (1,0,'0'); -INSERT INTO num_exp_sub VALUES (1,0,'0'); -INSERT INTO num_exp_mul VALUES (1,0,'0'); -INSERT INTO num_exp_div VALUES (1,0,'NaN'); -INSERT INTO num_exp_add VALUES (1,1,'0'); -INSERT INTO num_exp_sub VALUES (1,1,'0'); -INSERT INTO num_exp_mul VALUES (1,1,'0'); -INSERT INTO num_exp_div VALUES (1,1,'NaN'); -INSERT INTO num_exp_add VALUES (1,2,'-34338492.215397047'); -INSERT INTO num_exp_sub VALUES (1,2,'34338492.215397047'); -INSERT INTO num_exp_mul VALUES (1,2,'0'); -INSERT INTO num_exp_div VALUES (1,2,'0'); -INSERT INTO num_exp_add VALUES (1,3,'4.31'); -INSERT INTO num_exp_sub VALUES (1,3,'-4.31'); -INSERT INTO num_exp_mul VALUES (1,3,'0'); -INSERT INTO num_exp_div VALUES (1,3,'0'); -INSERT INTO num_exp_add VALUES (1,4,'7799461.4119'); -INSERT INTO num_exp_sub VALUES (1,4,'-7799461.4119'); -INSERT INTO num_exp_mul VALUES (1,4,'0'); -INSERT INTO num_exp_div VALUES (1,4,'0'); -INSERT INTO num_exp_add VALUES (1,5,'16397.038491'); -INSERT INTO num_exp_sub VALUES (1,5,'-16397.038491'); -INSERT INTO num_exp_mul VALUES (1,5,'0'); -INSERT INTO num_exp_div VALUES 
(1,5,'0'); -INSERT INTO num_exp_add VALUES (1,6,'93901.57763026'); -INSERT INTO num_exp_sub VALUES (1,6,'-93901.57763026'); -INSERT INTO num_exp_mul VALUES (1,6,'0'); -INSERT INTO num_exp_div VALUES (1,6,'0'); -INSERT INTO num_exp_add VALUES (1,7,'-83028485'); -INSERT INTO num_exp_sub VALUES (1,7,'83028485'); -INSERT INTO num_exp_mul VALUES (1,7,'0'); -INSERT INTO num_exp_div VALUES (1,7,'0'); -INSERT INTO num_exp_add VALUES (1,8,'74881'); -INSERT INTO num_exp_sub VALUES (1,8,'-74881'); -INSERT INTO num_exp_mul VALUES (1,8,'0'); -INSERT INTO num_exp_div VALUES (1,8,'0'); -INSERT INTO num_exp_add VALUES (1,9,'-24926804.045047420'); -INSERT INTO num_exp_sub VALUES (1,9,'24926804.045047420'); -INSERT INTO num_exp_mul VALUES (1,9,'0'); -INSERT INTO num_exp_div VALUES (1,9,'0'); -INSERT INTO num_exp_add VALUES (2,0,'-34338492.215397047'); -INSERT INTO num_exp_sub VALUES (2,0,'-34338492.215397047'); -INSERT INTO num_exp_mul VALUES (2,0,'0'); -INSERT INTO num_exp_div VALUES (2,0,'NaN'); -INSERT INTO num_exp_add VALUES (2,1,'-34338492.215397047'); -INSERT INTO num_exp_sub VALUES (2,1,'-34338492.215397047'); -INSERT INTO num_exp_mul VALUES (2,1,'0'); -INSERT INTO num_exp_div VALUES (2,1,'NaN'); -INSERT INTO num_exp_add VALUES (2,2,'-68676984.430794094'); -INSERT INTO num_exp_sub VALUES (2,2,'0'); -INSERT INTO num_exp_mul VALUES (2,2,'1179132047626883.596862135856320209'); -INSERT INTO num_exp_div VALUES (2,2,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (2,3,'-34338487.905397047'); -INSERT INTO num_exp_sub VALUES (2,3,'-34338496.525397047'); -INSERT INTO num_exp_mul VALUES (2,3,'-147998901.44836127257'); -INSERT INTO num_exp_div VALUES (2,3,'-7967167.56737750510440835266'); -INSERT INTO num_exp_add VALUES (2,4,'-26539030.803497047'); -INSERT INTO num_exp_sub VALUES (2,4,'-42137953.627297047'); -INSERT INTO num_exp_mul VALUES (2,4,'-267821744976817.8111137106593'); -INSERT INTO num_exp_div VALUES (2,4,'-4.40267480046830116685'); -INSERT INTO num_exp_add VALUES 
(2,5,'-34322095.176906047'); -INSERT INTO num_exp_sub VALUES (2,5,'-34354889.253888047'); -INSERT INTO num_exp_mul VALUES (2,5,'-563049578578.769242506736077'); -INSERT INTO num_exp_div VALUES (2,5,'-2094.18866914563535496429'); -INSERT INTO num_exp_add VALUES (2,6,'-34244590.637766787'); -INSERT INTO num_exp_sub VALUES (2,6,'-34432393.793027307'); -INSERT INTO num_exp_mul VALUES (2,6,'-3224438592470.18449811926184222'); -INSERT INTO num_exp_div VALUES (2,6,'-365.68599891479766440940'); -INSERT INTO num_exp_add VALUES (2,7,'-117366977.215397047'); -INSERT INTO num_exp_sub VALUES (2,7,'48689992.784602953'); -INSERT INTO num_exp_mul VALUES (2,7,'2851072985828710.485883795'); -INSERT INTO num_exp_div VALUES (2,7,'.41357483778485235518'); -INSERT INTO num_exp_add VALUES (2,8,'-34263611.215397047'); -INSERT INTO num_exp_sub VALUES (2,8,'-34413373.215397047'); -INSERT INTO num_exp_mul VALUES (2,8,'-2571300635581.146276407'); -INSERT INTO num_exp_div VALUES (2,8,'-458.57416721727870888476'); -INSERT INTO num_exp_add VALUES (2,9,'-59265296.260444467'); -INSERT INTO num_exp_sub VALUES (2,9,'-9411688.170349627'); -INSERT INTO num_exp_mul VALUES (2,9,'855948866655588.453741509242968740'); -INSERT INTO num_exp_div VALUES (2,9,'1.37757299946438931811'); -INSERT INTO num_exp_add VALUES (3,0,'4.31'); -INSERT INTO num_exp_sub VALUES (3,0,'4.31'); -INSERT INTO num_exp_mul VALUES (3,0,'0'); -INSERT INTO num_exp_div VALUES (3,0,'NaN'); -INSERT INTO num_exp_add VALUES (3,1,'4.31'); -INSERT INTO num_exp_sub VALUES (3,1,'4.31'); -INSERT INTO num_exp_mul VALUES (3,1,'0'); -INSERT INTO num_exp_div VALUES (3,1,'NaN'); -INSERT INTO num_exp_add VALUES (3,2,'-34338487.905397047'); -INSERT INTO num_exp_sub VALUES (3,2,'34338496.525397047'); -INSERT INTO num_exp_mul VALUES (3,2,'-147998901.44836127257'); -INSERT INTO num_exp_div VALUES (3,2,'-.00000012551512084352'); -INSERT INTO num_exp_add VALUES (3,3,'8.62'); -INSERT INTO num_exp_sub VALUES (3,3,'0'); -INSERT INTO num_exp_mul VALUES 
(3,3,'18.5761'); -INSERT INTO num_exp_div VALUES (3,3,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (3,4,'7799465.7219'); -INSERT INTO num_exp_sub VALUES (3,4,'-7799457.1019'); -INSERT INTO num_exp_mul VALUES (3,4,'33615678.685289'); -INSERT INTO num_exp_div VALUES (3,4,'.00000055260225961552'); -INSERT INTO num_exp_add VALUES (3,5,'16401.348491'); -INSERT INTO num_exp_sub VALUES (3,5,'-16392.728491'); -INSERT INTO num_exp_mul VALUES (3,5,'70671.23589621'); -INSERT INTO num_exp_div VALUES (3,5,'.00026285234387695504'); -INSERT INTO num_exp_add VALUES (3,6,'93905.88763026'); -INSERT INTO num_exp_sub VALUES (3,6,'-93897.26763026'); -INSERT INTO num_exp_mul VALUES (3,6,'404715.7995864206'); -INSERT INTO num_exp_div VALUES (3,6,'.00004589912234457595'); -INSERT INTO num_exp_add VALUES (3,7,'-83028480.69'); -INSERT INTO num_exp_sub VALUES (3,7,'83028489.31'); -INSERT INTO num_exp_mul VALUES (3,7,'-357852770.35'); -INSERT INTO num_exp_div VALUES (3,7,'-.00000005190989574240'); -INSERT INTO num_exp_add VALUES (3,8,'74885.31'); -INSERT INTO num_exp_sub VALUES (3,8,'-74876.69'); -INSERT INTO num_exp_mul VALUES (3,8,'322737.11'); -INSERT INTO num_exp_div VALUES (3,8,'.00005755799201399553'); -INSERT INTO num_exp_add VALUES (3,9,'-24926799.735047420'); -INSERT INTO num_exp_sub VALUES (3,9,'24926808.355047420'); -INSERT INTO num_exp_mul VALUES (3,9,'-107434525.43415438020'); -INSERT INTO num_exp_div VALUES (3,9,'-.00000017290624149854'); -INSERT INTO num_exp_add VALUES (4,0,'7799461.4119'); -INSERT INTO num_exp_sub VALUES (4,0,'7799461.4119'); -INSERT INTO num_exp_mul VALUES (4,0,'0'); -INSERT INTO num_exp_div VALUES (4,0,'NaN'); -INSERT INTO num_exp_add VALUES (4,1,'7799461.4119'); -INSERT INTO num_exp_sub VALUES (4,1,'7799461.4119'); -INSERT INTO num_exp_mul VALUES (4,1,'0'); -INSERT INTO num_exp_div VALUES (4,1,'NaN'); -INSERT INTO num_exp_add VALUES (4,2,'-26539030.803497047'); -INSERT INTO num_exp_sub VALUES (4,2,'42137953.627297047'); -INSERT INTO 
num_exp_mul VALUES (4,2,'-267821744976817.8111137106593'); -INSERT INTO num_exp_div VALUES (4,2,'-.22713465002993920385'); -INSERT INTO num_exp_add VALUES (4,3,'7799465.7219'); -INSERT INTO num_exp_sub VALUES (4,3,'7799457.1019'); -INSERT INTO num_exp_mul VALUES (4,3,'33615678.685289'); -INSERT INTO num_exp_div VALUES (4,3,'1809619.81714617169373549883'); -INSERT INTO num_exp_add VALUES (4,4,'15598922.8238'); -INSERT INTO num_exp_sub VALUES (4,4,'0'); -INSERT INTO num_exp_mul VALUES (4,4,'60831598315717.14146161'); -INSERT INTO num_exp_div VALUES (4,4,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (4,5,'7815858.450391'); -INSERT INTO num_exp_sub VALUES (4,5,'7783064.373409'); -INSERT INTO num_exp_mul VALUES (4,5,'127888068979.9935054429'); -INSERT INTO num_exp_div VALUES (4,5,'475.66281046305802686061'); -INSERT INTO num_exp_add VALUES (4,6,'7893362.98953026'); -INSERT INTO num_exp_sub VALUES (4,6,'7705559.83426974'); -INSERT INTO num_exp_mul VALUES (4,6,'732381731243.745115764094'); -INSERT INTO num_exp_div VALUES (4,6,'83.05996138436129499606'); -INSERT INTO num_exp_add VALUES (4,7,'-75229023.5881'); -INSERT INTO num_exp_sub VALUES (4,7,'90827946.4119'); -INSERT INTO num_exp_mul VALUES (4,7,'-647577464846017.9715'); -INSERT INTO num_exp_div VALUES (4,7,'-.09393717604145131637'); -INSERT INTO num_exp_add VALUES (4,8,'7874342.4119'); -INSERT INTO num_exp_sub VALUES (4,8,'7724580.4119'); -INSERT INTO num_exp_mul VALUES (4,8,'584031469984.4839'); -INSERT INTO num_exp_div VALUES (4,8,'104.15808298366741897143'); -INSERT INTO num_exp_add VALUES (4,9,'-17127342.633147420'); -INSERT INTO num_exp_sub VALUES (4,9,'32726265.456947420'); -INSERT INTO num_exp_mul VALUES (4,9,'-194415646271340.1815956522980'); -INSERT INTO num_exp_div VALUES (4,9,'-.31289456112403769409'); -INSERT INTO num_exp_add VALUES (5,0,'16397.038491'); -INSERT INTO num_exp_sub VALUES (5,0,'16397.038491'); -INSERT INTO num_exp_mul VALUES (5,0,'0'); -INSERT INTO num_exp_div VALUES (5,0,'NaN'); 
-INSERT INTO num_exp_add VALUES (5,1,'16397.038491'); -INSERT INTO num_exp_sub VALUES (5,1,'16397.038491'); -INSERT INTO num_exp_mul VALUES (5,1,'0'); -INSERT INTO num_exp_div VALUES (5,1,'NaN'); -INSERT INTO num_exp_add VALUES (5,2,'-34322095.176906047'); -INSERT INTO num_exp_sub VALUES (5,2,'34354889.253888047'); -INSERT INTO num_exp_mul VALUES (5,2,'-563049578578.769242506736077'); -INSERT INTO num_exp_div VALUES (5,2,'-.00047751189505192446'); -INSERT INTO num_exp_add VALUES (5,3,'16401.348491'); -INSERT INTO num_exp_sub VALUES (5,3,'16392.728491'); -INSERT INTO num_exp_mul VALUES (5,3,'70671.23589621'); -INSERT INTO num_exp_div VALUES (5,3,'3804.41728329466357308584'); -INSERT INTO num_exp_add VALUES (5,4,'7815858.450391'); -INSERT INTO num_exp_sub VALUES (5,4,'-7783064.373409'); -INSERT INTO num_exp_mul VALUES (5,4,'127888068979.9935054429'); -INSERT INTO num_exp_div VALUES (5,4,'.00210232958726897192'); -INSERT INTO num_exp_add VALUES (5,5,'32794.076982'); -INSERT INTO num_exp_sub VALUES (5,5,'0'); -INSERT INTO num_exp_mul VALUES (5,5,'268862871.275335557081'); -INSERT INTO num_exp_div VALUES (5,5,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (5,6,'110298.61612126'); -INSERT INTO num_exp_sub VALUES (5,6,'-77504.53913926'); -INSERT INTO num_exp_mul VALUES (5,6,'1539707782.76899778633766'); -INSERT INTO num_exp_div VALUES (5,6,'.17461941433576102689'); -INSERT INTO num_exp_add VALUES (5,7,'-83012087.961509'); -INSERT INTO num_exp_sub VALUES (5,7,'83044882.038491'); -INSERT INTO num_exp_mul VALUES (5,7,'-1361421264394.416135'); -INSERT INTO num_exp_div VALUES (5,7,'-.00019748690453643710'); -INSERT INTO num_exp_add VALUES (5,8,'91278.038491'); -INSERT INTO num_exp_sub VALUES (5,8,'-58483.961509'); -INSERT INTO num_exp_mul VALUES (5,8,'1227826639.244571'); -INSERT INTO num_exp_div VALUES (5,8,'.21897461960978085228'); -INSERT INTO num_exp_add VALUES (5,9,'-24910407.006556420'); -INSERT INTO num_exp_sub VALUES (5,9,'24943201.083538420'); -INSERT INTO 
num_exp_mul VALUES (5,9,'-408725765384.257043660243220'); -INSERT INTO num_exp_div VALUES (5,9,'-.00065780749354660427'); -INSERT INTO num_exp_add VALUES (6,0,'93901.57763026'); -INSERT INTO num_exp_sub VALUES (6,0,'93901.57763026'); -INSERT INTO num_exp_mul VALUES (6,0,'0'); -INSERT INTO num_exp_div VALUES (6,0,'NaN'); -INSERT INTO num_exp_add VALUES (6,1,'93901.57763026'); -INSERT INTO num_exp_sub VALUES (6,1,'93901.57763026'); -INSERT INTO num_exp_mul VALUES (6,1,'0'); -INSERT INTO num_exp_div VALUES (6,1,'NaN'); -INSERT INTO num_exp_add VALUES (6,2,'-34244590.637766787'); -INSERT INTO num_exp_sub VALUES (6,2,'34432393.793027307'); -INSERT INTO num_exp_mul VALUES (6,2,'-3224438592470.18449811926184222'); -INSERT INTO num_exp_div VALUES (6,2,'-.00273458651128995823'); -INSERT INTO num_exp_add VALUES (6,3,'93905.88763026'); -INSERT INTO num_exp_sub VALUES (6,3,'93897.26763026'); -INSERT INTO num_exp_mul VALUES (6,3,'404715.7995864206'); -INSERT INTO num_exp_div VALUES (6,3,'21786.90896293735498839907'); -INSERT INTO num_exp_add VALUES (6,4,'7893362.98953026'); -INSERT INTO num_exp_sub VALUES (6,4,'-7705559.83426974'); -INSERT INTO num_exp_mul VALUES (6,4,'732381731243.745115764094'); -INSERT INTO num_exp_div VALUES (6,4,'.01203949512295682469'); -INSERT INTO num_exp_add VALUES (6,5,'110298.61612126'); -INSERT INTO num_exp_sub VALUES (6,5,'77504.53913926'); -INSERT INTO num_exp_mul VALUES (6,5,'1539707782.76899778633766'); -INSERT INTO num_exp_div VALUES (6,5,'5.72674008674192359679'); -INSERT INTO num_exp_add VALUES (6,6,'187803.15526052'); -INSERT INTO num_exp_sub VALUES (6,6,'0'); -INSERT INTO num_exp_mul VALUES (6,6,'8817506281.4517452372676676'); -INSERT INTO num_exp_div VALUES (6,6,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (6,7,'-82934583.42236974'); -INSERT INTO num_exp_sub VALUES (6,7,'83122386.57763026'); -INSERT INTO num_exp_mul VALUES (6,7,'-7796505729750.37795610'); -INSERT INTO num_exp_div VALUES (6,7,'-.00113095617281538980'); -INSERT 
INTO num_exp_add VALUES (6,8,'168782.57763026'); -INSERT INTO num_exp_sub VALUES (6,8,'19020.57763026'); -INSERT INTO num_exp_mul VALUES (6,8,'7031444034.53149906'); -INSERT INTO num_exp_div VALUES (6,8,'1.25401073209839612184'); -INSERT INTO num_exp_add VALUES (6,9,'-24832902.467417160'); -INSERT INTO num_exp_sub VALUES (6,9,'25020705.622677680'); -INSERT INTO num_exp_mul VALUES (6,9,'-2340666225110.29929521292692920'); -INSERT INTO num_exp_div VALUES (6,9,'-.00376709254265256789'); -INSERT INTO num_exp_add VALUES (7,0,'-83028485'); -INSERT INTO num_exp_sub VALUES (7,0,'-83028485'); -INSERT INTO num_exp_mul VALUES (7,0,'0'); -INSERT INTO num_exp_div VALUES (7,0,'NaN'); -INSERT INTO num_exp_add VALUES (7,1,'-83028485'); -INSERT INTO num_exp_sub VALUES (7,1,'-83028485'); -INSERT INTO num_exp_mul VALUES (7,1,'0'); -INSERT INTO num_exp_div VALUES (7,1,'NaN'); -INSERT INTO num_exp_add VALUES (7,2,'-117366977.215397047'); -INSERT INTO num_exp_sub VALUES (7,2,'-48689992.784602953'); -INSERT INTO num_exp_mul VALUES (7,2,'2851072985828710.485883795'); -INSERT INTO num_exp_div VALUES (7,2,'2.41794207151503385700'); -INSERT INTO num_exp_add VALUES (7,3,'-83028480.69'); -INSERT INTO num_exp_sub VALUES (7,3,'-83028489.31'); -INSERT INTO num_exp_mul VALUES (7,3,'-357852770.35'); -INSERT INTO num_exp_div VALUES (7,3,'-19264149.65197215777262180974'); -INSERT INTO num_exp_add VALUES (7,4,'-75229023.5881'); -INSERT INTO num_exp_sub VALUES (7,4,'-90827946.4119'); -INSERT INTO num_exp_mul VALUES (7,4,'-647577464846017.9715'); -INSERT INTO num_exp_div VALUES (7,4,'-10.64541262725136247686'); -INSERT INTO num_exp_add VALUES (7,5,'-83012087.961509'); -INSERT INTO num_exp_sub VALUES (7,5,'-83044882.038491'); -INSERT INTO num_exp_mul VALUES (7,5,'-1361421264394.416135'); -INSERT INTO num_exp_div VALUES (7,5,'-5063.62688881730941836574'); -INSERT INTO num_exp_add VALUES (7,6,'-82934583.42236974'); -INSERT INTO num_exp_sub VALUES (7,6,'-83122386.57763026'); -INSERT INTO num_exp_mul VALUES 
(7,6,'-7796505729750.37795610'); -INSERT INTO num_exp_div VALUES (7,6,'-884.20756174009028770294'); -INSERT INTO num_exp_add VALUES (7,7,'-166056970'); -INSERT INTO num_exp_sub VALUES (7,7,'0'); -INSERT INTO num_exp_mul VALUES (7,7,'6893729321395225'); -INSERT INTO num_exp_div VALUES (7,7,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (7,8,'-82953604'); -INSERT INTO num_exp_sub VALUES (7,8,'-83103366'); -INSERT INTO num_exp_mul VALUES (7,8,'-6217255985285'); -INSERT INTO num_exp_div VALUES (7,8,'-1108.80577182462841041118'); -INSERT INTO num_exp_add VALUES (7,9,'-107955289.045047420'); -INSERT INTO num_exp_sub VALUES (7,9,'-58101680.954952580'); -INSERT INTO num_exp_mul VALUES (7,9,'2069634775752159.035758700'); -INSERT INTO num_exp_div VALUES (7,9,'3.33089171198810413382'); -INSERT INTO num_exp_add VALUES (8,0,'74881'); -INSERT INTO num_exp_sub VALUES (8,0,'74881'); -INSERT INTO num_exp_mul VALUES (8,0,'0'); -INSERT INTO num_exp_div VALUES (8,0,'NaN'); -INSERT INTO num_exp_add VALUES (8,1,'74881'); -INSERT INTO num_exp_sub VALUES (8,1,'74881'); -INSERT INTO num_exp_mul VALUES (8,1,'0'); -INSERT INTO num_exp_div VALUES (8,1,'NaN'); -INSERT INTO num_exp_add VALUES (8,2,'-34263611.215397047'); -INSERT INTO num_exp_sub VALUES (8,2,'34413373.215397047'); -INSERT INTO num_exp_mul VALUES (8,2,'-2571300635581.146276407'); -INSERT INTO num_exp_div VALUES (8,2,'-.00218067233500788615'); -INSERT INTO num_exp_add VALUES (8,3,'74885.31'); -INSERT INTO num_exp_sub VALUES (8,3,'74876.69'); -INSERT INTO num_exp_mul VALUES (8,3,'322737.11'); -INSERT INTO num_exp_div VALUES (8,3,'17373.78190255220417633410'); -INSERT INTO num_exp_add VALUES (8,4,'7874342.4119'); -INSERT INTO num_exp_sub VALUES (8,4,'-7724580.4119'); -INSERT INTO num_exp_mul VALUES (8,4,'584031469984.4839'); -INSERT INTO num_exp_div VALUES (8,4,'.00960079113741758956'); -INSERT INTO num_exp_add VALUES (8,5,'91278.038491'); -INSERT INTO num_exp_sub VALUES (8,5,'58483.961509'); -INSERT INTO num_exp_mul 
VALUES (8,5,'1227826639.244571'); -INSERT INTO num_exp_div VALUES (8,5,'4.56673929509287019456'); -INSERT INTO num_exp_add VALUES (8,6,'168782.57763026'); -INSERT INTO num_exp_sub VALUES (8,6,'-19020.57763026'); -INSERT INTO num_exp_mul VALUES (8,6,'7031444034.53149906'); -INSERT INTO num_exp_div VALUES (8,6,'.79744134113322314424'); -INSERT INTO num_exp_add VALUES (8,7,'-82953604'); -INSERT INTO num_exp_sub VALUES (8,7,'83103366'); -INSERT INTO num_exp_mul VALUES (8,7,'-6217255985285'); -INSERT INTO num_exp_div VALUES (8,7,'-.00090187120721280172'); -INSERT INTO num_exp_add VALUES (8,8,'149762'); -INSERT INTO num_exp_sub VALUES (8,8,'0'); -INSERT INTO num_exp_mul VALUES (8,8,'5607164161'); -INSERT INTO num_exp_div VALUES (8,8,'1.00000000000000000000'); -INSERT INTO num_exp_add VALUES (8,9,'-24851923.045047420'); -INSERT INTO num_exp_sub VALUES (8,9,'25001685.045047420'); -INSERT INTO num_exp_mul VALUES (8,9,'-1866544013697.195857020'); -INSERT INTO num_exp_div VALUES (8,9,'-.00300403532938582735'); -INSERT INTO num_exp_add VALUES (9,0,'-24926804.045047420'); -INSERT INTO num_exp_sub VALUES (9,0,'-24926804.045047420'); -INSERT INTO num_exp_mul VALUES (9,0,'0'); -INSERT INTO num_exp_div VALUES (9,0,'NaN'); -INSERT INTO num_exp_add VALUES (9,1,'-24926804.045047420'); -INSERT INTO num_exp_sub VALUES (9,1,'-24926804.045047420'); -INSERT INTO num_exp_mul VALUES (9,1,'0'); -INSERT INTO num_exp_div VALUES (9,1,'NaN'); -INSERT INTO num_exp_add VALUES (9,2,'-59265296.260444467'); -INSERT INTO num_exp_sub VALUES (9,2,'9411688.170349627'); -INSERT INTO num_exp_mul VALUES (9,2,'855948866655588.453741509242968740'); -INSERT INTO num_exp_div VALUES (9,2,'.72591434384152961526'); -INSERT INTO num_exp_add VALUES (9,3,'-24926799.735047420'); -INSERT INTO num_exp_sub VALUES (9,3,'-24926808.355047420'); -INSERT INTO num_exp_mul VALUES (9,3,'-107434525.43415438020'); -INSERT INTO num_exp_div VALUES (9,3,'-5783481.21694835730858468677'); -INSERT INTO num_exp_add VALUES 
(9,4,'-17127342.633147420'); -INSERT INTO num_exp_sub VALUES (9,4,'-32726265.456947420'); -INSERT INTO num_exp_mul VALUES (9,4,'-194415646271340.1815956522980'); -INSERT INTO num_exp_div VALUES (9,4,'-3.19596478892958416484'); -INSERT INTO num_exp_add VALUES (9,5,'-24910407.006556420'); -INSERT INTO num_exp_sub VALUES (9,5,'-24943201.083538420'); -INSERT INTO num_exp_mul VALUES (9,5,'-408725765384.257043660243220'); -INSERT INTO num_exp_div VALUES (9,5,'-1520.20159364322004505807'); -INSERT INTO num_exp_add VALUES (9,6,'-24832902.467417160'); -INSERT INTO num_exp_sub VALUES (9,6,'-25020705.622677680'); -INSERT INTO num_exp_mul VALUES (9,6,'-2340666225110.29929521292692920'); -INSERT INTO num_exp_div VALUES (9,6,'-265.45671195426965751280'); -INSERT INTO num_exp_add VALUES (9,7,'-107955289.045047420'); -INSERT INTO num_exp_sub VALUES (9,7,'58101680.954952580'); -INSERT INTO num_exp_mul VALUES (9,7,'2069634775752159.035758700'); -INSERT INTO num_exp_div VALUES (9,7,'.30021990699995814689'); -INSERT INTO num_exp_add VALUES (9,8,'-24851923.045047420'); -INSERT INTO num_exp_sub VALUES (9,8,'-25001685.045047420'); -INSERT INTO num_exp_mul VALUES (9,8,'-1866544013697.195857020'); -INSERT INTO num_exp_div VALUES (9,8,'-332.88556569820675471748'); -INSERT INTO num_exp_add VALUES (9,9,'-49853608.090094840'); -INSERT INTO num_exp_sub VALUES (9,9,'0'); -INSERT INTO num_exp_mul VALUES (9,9,'621345559900192.420120630048656400'); -INSERT INTO num_exp_div VALUES (9,9,'1.00000000000000000000'); --- COMMIT TRANSACTION; --- BEGIN TRANSACTION; -INSERT INTO num_exp_sqrt VALUES (0,'0'); -INSERT INTO num_exp_sqrt VALUES (1,'0'); -INSERT INTO num_exp_sqrt VALUES (2,'5859.90547836712524903505'); -INSERT INTO num_exp_sqrt VALUES (3,'2.07605394920266944396'); -INSERT INTO num_exp_sqrt VALUES (4,'2792.75158435189147418923'); -INSERT INTO num_exp_sqrt VALUES (5,'128.05092147657509145473'); -INSERT INTO num_exp_sqrt VALUES (6,'306.43364311096782703406'); -INSERT INTO num_exp_sqrt VALUES 
(7,'9111.99676251039939975230'); -INSERT INTO num_exp_sqrt VALUES (8,'273.64392922189960397542'); -INSERT INTO num_exp_sqrt VALUES (9,'4992.67503899937593364766'); --- COMMIT TRANSACTION; --- BEGIN TRANSACTION; -INSERT INTO num_exp_ln VALUES (0,'NaN'); -INSERT INTO num_exp_ln VALUES (1,'NaN'); -INSERT INTO num_exp_ln VALUES (2,'17.35177750493897715514'); -INSERT INTO num_exp_ln VALUES (3,'1.46093790411565641971'); -INSERT INTO num_exp_ln VALUES (4,'15.86956523951936572464'); -INSERT INTO num_exp_ln VALUES (5,'9.70485601768871834038'); -INSERT INTO num_exp_ln VALUES (6,'11.45000246622944403127'); -INSERT INTO num_exp_ln VALUES (7,'18.23469429965478772991'); -INSERT INTO num_exp_ln VALUES (8,'11.22365546576315513668'); -INSERT INTO num_exp_ln VALUES (9,'17.03145425013166006962'); --- COMMIT TRANSACTION; --- BEGIN TRANSACTION; -INSERT INTO num_exp_log10 VALUES (0,'NaN'); -INSERT INTO num_exp_log10 VALUES (1,'NaN'); -INSERT INTO num_exp_log10 VALUES (2,'7.53578122160797276459'); -INSERT INTO num_exp_log10 VALUES (3,'.63447727016073160075'); -INSERT INTO num_exp_log10 VALUES (4,'6.89206461372691743345'); -INSERT INTO num_exp_log10 VALUES (5,'4.21476541614777768626'); -INSERT INTO num_exp_log10 VALUES (6,'4.97267288886207207671'); -INSERT INTO num_exp_log10 VALUES (7,'7.91922711353275546914'); -INSERT INTO num_exp_log10 VALUES (8,'4.87437163556421004138'); -INSERT INTO num_exp_log10 VALUES (9,'7.39666659961986567059'); --- COMMIT TRANSACTION; --- BEGIN TRANSACTION; -INSERT INTO num_exp_power_10_ln VALUES (0,'NaN'); -INSERT INTO num_exp_power_10_ln VALUES (1,'NaN'); -INSERT INTO num_exp_power_10_ln VALUES (2,'224790267919917955.13261618583642653184'); -INSERT INTO num_exp_power_10_ln VALUES (3,'28.90266599445155957393'); -INSERT INTO num_exp_power_10_ln VALUES (4,'7405685069594999.07733999469386277636'); -INSERT INTO num_exp_power_10_ln VALUES (5,'5068226527.32127265408584640098'); -INSERT INTO num_exp_power_10_ln VALUES (6,'281839893606.99372343357047819067'); -INSERT 
INTO num_exp_power_10_ln VALUES (7,'1716699575118597095.42330819910640247627'); -INSERT INTO num_exp_power_10_ln VALUES (8,'167361463828.07491320069016125952'); -INSERT INTO num_exp_power_10_ln VALUES (9,'107511333880052007.04141124673540337457'); --- COMMIT TRANSACTION; --- BEGIN TRANSACTION; -INSERT INTO num_data VALUES (0, '0'); -INSERT INTO num_data VALUES (1, '0'); -INSERT INTO num_data VALUES (2, '-34338492.215397047'); -INSERT INTO num_data VALUES (3, '4.31'); -INSERT INTO num_data VALUES (4, '7799461.4119'); -INSERT INTO num_data VALUES (5, '16397.038491'); -INSERT INTO num_data VALUES (6, '93901.57763026'); -INSERT INTO num_data VALUES (7, '-83028485'); -INSERT INTO num_data VALUES (8, '74881'); -INSERT INTO num_data VALUES (9, '-24926804.045047420'); --- COMMIT TRANSACTION; - -SELECT * FROM num_data; - --- ****************************** --- * Create indices for faster checks --- ****************************** - --- CREATE UNIQUE INDEX num_exp_add_idx ON num_exp_add (id1, id2); --- CREATE UNIQUE INDEX num_exp_sub_idx ON num_exp_sub (id1, id2); --- CREATE UNIQUE INDEX num_exp_div_idx ON num_exp_div (id1, id2); --- CREATE UNIQUE INDEX num_exp_mul_idx ON num_exp_mul (id1, id2); --- CREATE UNIQUE INDEX num_exp_sqrt_idx ON num_exp_sqrt (id); --- CREATE UNIQUE INDEX num_exp_ln_idx ON num_exp_ln (id); --- CREATE UNIQUE INDEX num_exp_log10_idx ON num_exp_log10 (id); --- CREATE UNIQUE INDEX num_exp_power_10_ln_idx ON num_exp_power_10_ln (id); - --- VACUUM ANALYZE num_exp_add; --- VACUUM ANALYZE num_exp_sub; --- VACUUM ANALYZE num_exp_div; --- VACUUM ANALYZE num_exp_mul; --- VACUUM ANALYZE num_exp_sqrt; --- VACUUM ANALYZE num_exp_ln; --- VACUUM ANALYZE num_exp_log10; --- VACUUM ANALYZE num_exp_power_10_ln; - --- ****************************** --- * Now check the behaviour of the NUMERIC type --- ****************************** - --- ****************************** --- * Addition check --- ****************************** -TRUNCATE TABLE num_result; -INSERT INTO 
num_result SELECT t1.id, t2.id, t1.val + t2.val - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_add t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected; - -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val + t2.val, 10) - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 10) as expected - FROM num_result t1, num_exp_add t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 10); - --- ****************************** --- * Subtraction check --- ****************************** -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, t1.val - t2.val - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_sub t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected; - -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val - t2.val, 40) - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 40) - FROM num_result t1, num_exp_sub t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 40); - --- ****************************** --- * Multiply check --- ****************************** --- [SPARK-28316] Decimal precision issue -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, t1.val, t2.val, t1.val * t2.val - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_mul t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected; - -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val * t2.val, 30) - FROM num_data t1, num_data t2; -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 30) as expected - FROM num_result t1, num_exp_mul t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result 
!= round(t2.expected, 30); - --- ****************************** --- * Division check --- ****************************** --- [SPARK-28316] Decimal precision issue -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, t1.val / t2.val - FROM num_data t1, num_data t2 - WHERE t2.val != '0.0'; -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_div t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected; - -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val / t2.val, 80) - FROM num_data t1, num_data t2 - WHERE t2.val != '0.0'; -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 80) as expected - FROM num_result t1, num_exp_div t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 80); - --- ****************************** --- * Square root check --- ****************************** -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT id, 0, SQRT(ABS(val)) - FROM num_data; -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_sqrt t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected; - --- ****************************** --- * Natural logarithm check --- ****************************** -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT id, 0, LN(ABS(val)) - FROM num_data - WHERE val != '0.0'; -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_ln t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected; - --- ****************************** --- * Logarithm base 10 check --- ****************************** -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT id, 0, LOG(cast('10' as decimal(38, 18)), ABS(val)) - FROM num_data - WHERE val != '0.0'; -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_log10 t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected; - --- ****************************** --- * POWER(10, LN(value)) check --- ****************************** --- 
[SPARK-28316] Decimal precision issue -TRUNCATE TABLE num_result; -INSERT INTO num_result SELECT id, 0, POWER(cast('10' as decimal(38, 18)), LN(ABS(round(val,200)))) - FROM num_data - WHERE val != '0.0'; -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_power_10_ln t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected; - --- ****************************** --- * miscellaneous checks for things that have been broken in the past... --- ****************************** --- numeric AVG used to fail on some platforms -SELECT AVG(val) FROM num_data; --- [SPARK-28316] STDDEV and VARIANCE returns double type --- Skip it because: Expected "2.779120328758835[]E7", but got "2.779120328758835[4]E7" --- SELECT STDDEV(val) FROM num_data; --- Skip it because: Expected "7.72350980172061[8]E14", but got "7.72350980172061[6]E14" --- SELECT VARIANCE(val) FROM num_data; - --- Check for appropriate rounding and overflow -CREATE TABLE fract_only (id int, val decimal(4,4)) USING parquet; -INSERT INTO fract_only VALUES (1, '0.0'); -INSERT INTO fract_only VALUES (2, '0.1'); --- [SPARK-27923] PostgreSQL throws an exception but Spark SQL is NULL --- INSERT INTO fract_only VALUES (3, '1.0'); -- should fail -INSERT INTO fract_only VALUES (4, '-0.9999'); -INSERT INTO fract_only VALUES (5, '0.99994'); --- [SPARK-27923] PostgreSQL throws an exception but Spark SQL is NULL --- INSERT INTO fract_only VALUES (6, '0.99995'); -- should fail -INSERT INTO fract_only VALUES (7, '0.00001'); -INSERT INTO fract_only VALUES (8, '0.00017'); -SELECT * FROM fract_only; -DROP TABLE fract_only; - --- [SPARK-28315] Decimal can not accept NaN as input --- [SPARK-27923] Decimal type can not accept Infinity and -Infinity --- Check inf/nan conversion behavior -SELECT decimal(double('NaN')); -SELECT decimal(double('Infinity')); -SELECT decimal(double('-Infinity')); -SELECT decimal(float('NaN')); -SELECT decimal(float('Infinity')); -SELECT decimal(float('-Infinity')); - --- Simple check that ceil(), 
floor(), and round() work correctly -CREATE TABLE ceil_floor_round (a decimal(38, 18)) USING parquet; -INSERT INTO ceil_floor_round VALUES ('-5.5'); -INSERT INTO ceil_floor_round VALUES ('-5.499999'); -INSERT INTO ceil_floor_round VALUES ('9.5'); -INSERT INTO ceil_floor_round VALUES ('9.4999999'); -INSERT INTO ceil_floor_round VALUES ('0.0'); -INSERT INTO ceil_floor_round VALUES ('0.0000001'); -INSERT INTO ceil_floor_round VALUES ('-0.000001'); -SELECT a, ceil(a), ceiling(a), floor(a), round(a) FROM ceil_floor_round; -DROP TABLE ceil_floor_round; - --- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres --- Check rounding, it should round ties away from zero. --- SELECT i as pow, --- round((-2.5 * 10 ^ i)::numeric, -i), --- round((-1.5 * 10 ^ i)::numeric, -i), --- round((-0.5 * 10 ^ i)::numeric, -i), --- round((0.5 * 10 ^ i)::numeric, -i), --- round((1.5 * 10 ^ i)::numeric, -i), --- round((2.5 * 10 ^ i)::numeric, -i) --- FROM generate_series(-5,5) AS t(i); - --- [SPARK-21117] Built-in SQL Function Support - WIDTH_BUCKET --- Testing for width_bucket(). For convenience, we test both the --- numeric and float8 versions of the function in this file. 
- --- errors --- SELECT width_bucket(5.0, 3.0, 4.0, 0); --- SELECT width_bucket(5.0, 3.0, 4.0, -5); --- SELECT width_bucket(3.5, 3.0, 3.0, 888); --- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, 0); --- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5); --- SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); --- SELECT width_bucket('NaN', 3.0, 4.0, 888); --- SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); - --- normal operation --- CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); - --- COPY width_bucket_test (operand_num) FROM stdin; --- -5.2 --- -0.0000000001 --- 0.000000000001 --- 1 --- 1.99999999999999 --- 2 --- 2.00000000000001 --- 3 --- 4 --- 4.5 --- 5 --- 5.5 --- 6 --- 7 --- 8 --- 9 --- 9.99999999999999 --- 10 --- 10.0000000000001 --- \. - --- UPDATE width_bucket_test SET operand_f8 = operand_num::float8; - --- SELECT --- operand_num, --- width_bucket(operand_num, 0, 10, 5) AS wb_1, --- width_bucket(operand_f8, 0, 10, 5) AS wb_1f, --- width_bucket(operand_num, 10, 0, 5) AS wb_2, --- width_bucket(operand_f8, 10, 0, 5) AS wb_2f, --- width_bucket(operand_num, 2, 8, 4) AS wb_3, --- width_bucket(operand_f8, 2, 8, 4) AS wb_3f, --- width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4, --- width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f, --- width_bucket(operand_num, -25, 25, 10) AS wb_5, --- width_bucket(operand_f8, -25, 25, 10) AS wb_5f --- FROM width_bucket_test; - --- for float8 only, check positive and negative infinity: we require --- finite bucket bounds, but allow an infinite operand --- SELECT width_bucket(0.0::float8, 'Infinity'::float8, 5, 10); -- error --- SELECT width_bucket(0.0::float8, 5, '-Infinity'::float8, 20); -- error --- SELECT width_bucket('Infinity'::float8, 1, 10, 10), --- width_bucket('-Infinity'::float8, 1, 10, 10); - --- DROP TABLE width_bucket_test; - --- [SPARK-28137] Missing Data Type Formatting Functions: TO_CHAR --- TO_CHAR() --- --- SELECT '' AS to_char_1, to_char(val, 
'9G999G999G999G999G999') --- FROM num_data; - --- SELECT '' AS to_char_2, to_char(val, '9G999G999G999G999G999D999G999G999G999G999') --- FROM num_data; - --- SELECT '' AS to_char_3, to_char(val, '9999999999999999.999999999999999PR') --- FROM num_data; - --- SELECT '' AS to_char_4, to_char(val, '9999999999999999.999999999999999S') --- FROM num_data; - --- SELECT '' AS to_char_5, to_char(val, 'MI9999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_7, to_char(val, 'FM9999999999999999.999999999999999THPR') FROM num_data; --- SELECT '' AS to_char_8, to_char(val, 'SG9999999999999999.999999999999999th') FROM num_data; --- SELECT '' AS to_char_9, to_char(val, '0999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_10, to_char(val, 'S0999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_11, to_char(val, 'FM0999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_12, to_char(val, 'FM9999999999999999.099999999999999') FROM num_data; --- SELECT '' AS to_char_13, to_char(val, 'FM9999999999990999.990999999999999') FROM num_data; --- SELECT '' AS to_char_14, to_char(val, 'FM0999999999999999.999909999999999') FROM num_data; --- SELECT '' AS to_char_15, to_char(val, 'FM9999999990999999.099999999999999') FROM num_data; --- SELECT '' AS to_char_16, to_char(val, 'L9999999999999999.099999999999999') FROM num_data; --- SELECT '' AS to_char_17, to_char(val, 'FM9999999999999999.99999999999999') FROM num_data; --- SELECT '' AS to_char_18, to_char(val, 'S 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9') FROM num_data; --- SELECT '' AS to_char_19, to_char(val, 'FMS 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9') FROM num_data; --- SELECT '' AS to_char_20, to_char(val, E'99999 "text" 9999 "9999" 999 "\\"text between quote marks\\"" 9999') FROM num_data; --- SELECT '' AS to_char_21, to_char(val, '999999SG9999999999') FROM num_data; --- SELECT '' AS to_char_22, to_char(val, 'FM9999999999999999.999999999999999') FROM num_data; --- SELECT '' AS to_char_23, to_char(val, '9.999EEEE') FROM num_data; - --- SELECT '' AS to_char_24, to_char('100'::numeric, 'FM999.9'); --- SELECT '' AS to_char_25, to_char('100'::numeric, 'FM999.'); --- SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999'); - --- Check parsing of literal text in a format string --- SELECT '' AS to_char_27, to_char('100'::numeric, 'foo999'); --- SELECT '' AS to_char_28, to_char('100'::numeric, 'f\oo999'); --- SELECT '' AS to_char_29, to_char('100'::numeric, 'f\\oo999'); --- SELECT '' AS to_char_30, to_char('100'::numeric, 'f\"oo999'); --- SELECT '' AS to_char_31, to_char('100'::numeric, 'f\\"oo999'); --- SELECT '' AS to_char_32, to_char('100'::numeric, 'f"ool"999'); --- SELECT '' AS to_char_33, to_char('100'::numeric, 'f"\ool"999'); --- SELECT '' AS to_char_34, to_char('100'::numeric, 'f"\\ool"999'); --- SELECT '' AS to_char_35, to_char('100'::numeric, 'f"ool\"999'); --- SELECT '' AS to_char_36, to_char('100'::numeric, 'f"ool\\"999'); - --- [SPARK-28137] Missing Data Type Formatting Functions: TO_NUMBER --- TO_NUMBER() --- --- SET lc_numeric = 'C'; --- SELECT '' AS to_number_1, to_number('-34,338,492', '99G999G999'); --- SELECT '' AS to_number_2, to_number('-34,338,492.654,878', '99G999G999D999G999'); --- SELECT '' AS to_number_3, to_number('<564646.654564>', '999999.999999PR'); --- SELECT '' AS to_number_4, to_number('0.00001-', '9.999999S'); --- SELECT '' AS to_number_5, to_number('5.01-', 'FM9.999999S'); --- SELECT '' AS to_number_5, to_number('5.01-', 'FM9.999999MI'); --- SELECT '' AS to_number_7, to_number('5 4 4 4 4 8 . 7 8', '9 9 9 9 9 9 . 
9 9'); --- SELECT '' AS to_number_8, to_number('.01', 'FM9.99'); --- SELECT '' AS to_number_9, to_number('.0', '99999999.99999999'); --- SELECT '' AS to_number_10, to_number('0', '99.99'); --- SELECT '' AS to_number_11, to_number('.-01', 'S99.99'); --- SELECT '' AS to_number_12, to_number('.01-', '99.99S'); --- SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S'); --- SELECT '' AS to_number_14, to_number('34,50','999,99'); --- SELECT '' AS to_number_15, to_number('123,000','999G'); --- SELECT '' AS to_number_16, to_number('123456','999G999'); --- SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99'); --- SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99'); --- SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99'); --- SELECT '' AS to_number_20, to_number('1234.56','L99,999.99'); --- SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99'); --- SELECT '' AS to_number_22, to_number('42nd', '99th'); --- RESET lc_numeric; - --- --- Input syntax --- - -CREATE TABLE num_input_test (n1 decimal(38, 18)) USING parquet; - --- good inputs -INSERT INTO num_input_test VALUES (trim(' 123')); -INSERT INTO num_input_test VALUES (trim(' 3245874 ')); -INSERT INTO num_input_test VALUES (trim(' -93853')); -INSERT INTO num_input_test VALUES ('555.50'); -INSERT INTO num_input_test VALUES ('-555.50'); --- [SPARK-28315] Decimal can not accept NaN as input --- INSERT INTO num_input_test VALUES (trim('NaN ')); --- INSERT INTO num_input_test VALUES (trim(' nan')); - --- [SPARK-27923] Spark SQL accept bad inputs to NULL --- bad inputs --- INSERT INTO num_input_test VALUES (' '); --- INSERT INTO num_input_test VALUES (' 1234 %'); --- INSERT INTO num_input_test VALUES ('xyz'); --- INSERT INTO num_input_test VALUES ('- 1234'); --- INSERT INTO num_input_test VALUES ('5 . 0'); --- INSERT INTO num_input_test VALUES ('5. 
0 '); --- INSERT INTO num_input_test VALUES (''); --- INSERT INTO num_input_test VALUES (' N aN '); - -SELECT * FROM num_input_test; - --- [SPARK-28318] Decimal can only support precision up to 38 --- --- Test some corner cases for multiplication --- - --- select 4790999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; - --- select 4789999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; - --- select 4770999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; - --- select 4769999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; - --- --- Test some corner cases for division --- --- 999999999999999999999 is overflow for SYSTEM_DEFAULT(decimal(38, 18)), we use BigIntDecimal(decimal(38, 0)). 
-select cast(999999999999999999999 as decimal(38, 0))/1000000000000000000000; - -select div(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000); -select mod(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000); -select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); -select mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); -select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000)*1000000000000000000000 + mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); -select mod (70.0,70) ; -select div (70.0,70) ; -select 70.0 / 70 ; -select 12345678901234567890 % 123; --- [SPARK-2659] HiveQL: Division operator should always perform fractional division --- select 12345678901234567890 DIV 123; --- select div(12345678901234567890, 123); --- select div(12345678901234567890, 123) * 123 + 12345678901234567890 % 123; - --- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres --- --- Test code path for raising to integer powers --- - --- select 10.0 ^ -2147483648 as rounds_to_zero; --- select 10.0 ^ -2147483647 as rounds_to_zero; --- select 10.0 ^ 2147483647 as overflows; --- select 117743296169.0 ^ 1000000000 as overflows; - --- cases that used to return inaccurate results --- select 3.789 ^ 21; --- select 3.789 ^ 35; --- select 1.2 ^ 345; --- select 0.12 ^ (-20); - --- cases that used to error out --- select 0.12 ^ (-25); --- select 0.5678 ^ (-85); - --- --- Tests for raising to non-integer powers --- - --- special cases --- select 0.0 ^ 0.0; --- select (-12.34) ^ 0.0; --- select 12.34 ^ 0.0; --- select 0.0 ^ 12.34; - --- NaNs --- select 'NaN'::numeric ^ 'NaN'::numeric; --- select 'NaN'::numeric ^ 0; --- select 'NaN'::numeric ^ 1; --- select 0 ^ 'NaN'::numeric; --- select 1 ^ 'NaN'::numeric; - --- invalid inputs --- select 0.0 ^ (-12.34); --- select (-12.34) ^ 1.2; - --- cases that used to generate 
inaccurate results --- select 32.1 ^ 9.8; --- select 32.1 ^ (-9.8); --- select 12.3 ^ 45.6; --- select 12.3 ^ (-45.6); - --- big test --- select 1.234 ^ 5678; - --- --- Tests for EXP() --- - --- special cases -select exp(0.0); -select exp(1.0); --- [SPARK-28316] EXP returns double type for decimal input --- [SPARK-28318] Decimal can only support precision up to 38 --- select exp(1.0::numeric(71,70)); - --- cases that used to generate inaccurate results -select exp(32.999); -select exp(-32.999); -select exp(123.456); -select exp(-123.456); - --- big test -select exp(1234.5678); - --- --- Tests for generate_series --- -select * from range(cast(0.0 as decimal(38, 18)), cast(4.0 as decimal(38, 18))); -select * from range(cast(0.1 as decimal(38, 18)), cast(4.0 as decimal(38, 18)), cast(1.3 as decimal(38, 18))); -select * from range(cast(4.0 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), cast(-2.2 as decimal(38, 18))); --- Trigger errors --- select * from generate_series(-100::numeric, 100::numeric, 0::numeric); --- select * from generate_series(-100::numeric, 100::numeric, 'nan'::numeric); --- select * from generate_series('nan'::numeric, 100::numeric, 10::numeric); --- select * from generate_series(0::numeric, 'nan'::numeric, 10::numeric); --- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres --- Checks maximum, output is truncated --- select (i / (10::numeric ^ 131071))::numeric(1,0) --- from generate_series(6 * (10::numeric ^ 131071), --- 9 * (10::numeric ^ 131071), --- 10::numeric ^ 131071) as a(i); --- Check usage with variables --- select * from generate_series(1::numeric, 3::numeric) i, generate_series(i,3) j; --- select * from generate_series(1::numeric, 3::numeric) i, generate_series(1,i) j; --- select * from generate_series(1::numeric, 3::numeric) i, generate_series(1,5,i) j; - --- --- Tests for LN() --- - --- [SPARK-27923] Invalid inputs for LN throws exception at PostgreSQL --- Invalid inputs --- select 
ln(-12.34); --- select ln(0.0); - --- Some random tests -select ln(1.2345678e-28); -select ln(0.0456789); --- [SPARK-28318] Decimal can only support precision up to 38 --- select ln(0.349873948359354029493948309745709580730482050975); -select ln(0.99949452); -select ln(1.00049687395); -select ln(1234.567890123456789); -select ln(5.80397490724e5); -select ln(9.342536355e34); - --- --- Tests for LOG() (base 10) --- - --- [SPARK-27923] Invalid inputs for LOG throws exception at PostgreSQL --- invalid inputs --- select log(-12.34); --- select log(0.0); - --- some random tests --- [SPARK-28318] Decimal can only support precision up to 38 --- select log(1.234567e-89); --- [SPARK-28324] The LOG function using 10 as the base, but Spark using E -select log(3.4634998359873254962349856073435545); -select log(9.999999999999999999); -select log(10.00000000000000000); -select log(10.00000000000000001); -select log(590489.45235237); - --- --- Tests for LOG() (arbitrary base) --- - --- [SPARK-27923] Invalid inputs for LOG throws exception at PostgreSQL --- invalid inputs --- select log(-12.34, 56.78); --- select log(-12.34, -56.78); --- select log(12.34, -56.78); --- select log(0.0, 12.34); --- select log(12.34, 0.0); --- select log(1.0, 12.34); - --- some random tests --- [SPARK-28318] Decimal can only support precision up to 38 --- select log(1.23e-89, 6.4689e45); -select log(0.99923, 4.58934e34); -select log(1.000016, 8.452010e18); --- [SPARK-28318] Decimal can only support precision up to 38 --- select log(3.1954752e47, 9.4792021e-73); - --- [SPARK-28317] Built-in Mathematical Functions: SCALE --- --- Tests for scale() --- - --- select scale(numeric 'NaN'); --- select scale(NULL::numeric); --- select scale(1.12); --- select scale(0); --- select scale(0.00); --- select scale(1.12345); --- select scale(110123.12475871856128); --- select scale(-1123.12471856128); --- select scale(-13.000000000000000); - --- --- Tests for SUM() --- - --- cases that need carry propagation -SELECT 
SUM(decimal(9999)) FROM range(1, 100001); -SELECT SUM(decimal(-9999)) FROM range(1, 100001); - -DROP TABLE num_data; -DROP TABLE num_exp_add; -DROP TABLE num_exp_sub; -DROP TABLE num_exp_div; -DROP TABLE num_exp_mul; -DROP TABLE num_exp_sqrt; -DROP TABLE num_exp_ln; -DROP TABLE num_exp_log10; -DROP TABLE num_exp_power_10_ln; -DROP TABLE num_result; -DROP TABLE num_input_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part1.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql similarity index 95% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part1.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql index 5d54be9341148..63f80bd2efa73 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part1.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part1.sql @@ -8,6 +8,11 @@ -- avoid bit-exact output here because operations may not be bit-exact. -- SET extra_float_digits = 0; +-- Test aggregate operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + SELECT avg(four) AS avg_1 FROM onek; SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part2.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part2.sql similarity index 71% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part2.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part2.sql index 47f9d2f373069..a8af1db77563c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part2.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part2.sql @@ -5,6 +5,11 @@ -- AGGREGATES [Part 2] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L145-L350 +-- Test aggregate operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + create temporary view int4_tbl as select * from values (0), (123456), @@ -41,42 +46,37 @@ create temporary view int4_tbl as select * from values -- -- test for bitwise integer aggregates -- --- CREATE TEMPORARY TABLE bitwise_test( --- i2 INT2, --- i4 INT4, --- i8 INT8, --- i INTEGER, --- x INT2, --- y BIT(4) --- ); +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (3, 3, 3, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4); -- empty case --- SELECT --- BIT_AND(i2) AS "?", --- BIT_OR(i4) AS "?" --- FROM bitwise_test; - --- COPY bitwise_test FROM STDIN NULL 'null'; --- 1 1 1 1 1 B0101 --- 3 3 3 null 2 B0100 --- 7 7 7 3 4 B1100 --- \. 
- --- SELECT --- BIT_AND(i2) AS "1", --- BIT_AND(i4) AS "1", --- BIT_AND(i8) AS "1", --- BIT_AND(i) AS "?", --- BIT_AND(x) AS "0", --- BIT_AND(y) AS "0100", --- --- BIT_OR(i2) AS "7", --- BIT_OR(i4) AS "7", --- BIT_OR(i8) AS "7", --- BIT_OR(i) AS "?", --- BIT_OR(x) AS "7", --- BIT_OR(y) AS "1101" --- FROM bitwise_test; +SELECT BIT_AND(b1) AS n1, BIT_OR(b2) AS n2 FROM bitwise_test where 1 = 0; + +-- null case +SELECT BIT_AND(b4) AS n1, BIT_OR(b4) AS n2 FROM bitwise_test where b4 is null; + +SELECT + BIT_AND(cast(b1 as tinyint)) AS a1, + BIT_AND(cast(b2 as smallint)) AS b1, + BIT_AND(b3) AS c1, + BIT_AND(b4) AS d1, + BIT_OR(cast(b1 as tinyint)) AS e7, + BIT_OR(cast(b2 as smallint)) AS f7, + BIT_OR(b3) AS g7, + BIT_OR(b4) AS h3 +FROM bitwise_test; + +-- group by +SELECT b1 , bit_and(b2), bit_or(b4) FROM bitwise_test GROUP BY b1; + +--having +SELECT b1, bit_and(b2) FROM bitwise_test GROUP BY b1 HAVING bit_and(b2) < 7; + +-- window +SELECT b1, b2, bit_and(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test; +SELECT b1, b2, bit_or(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test; -- -- test boolean aggregates @@ -114,50 +114,40 @@ SELECT NOT (FALSE OR FALSE) AS `t`; -- [SPARK-27880] Implement boolean aggregates(BOOL_AND, BOOL_OR and EVERY) --- CREATE TEMPORARY TABLE bool_test( --- b1 BOOL, --- b2 BOOL, --- b3 BOOL, --- b4 BOOL); +CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES + (TRUE, null, FALSE, null), + (FALSE, TRUE, null, null), + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4); -- empty case --- SELECT --- BOOL_AND(b1) AS "n", --- BOOL_OR(b3) AS "n" --- FROM bool_test; - --- COPY bool_test FROM STDIN NULL 'null'; --- TRUE null FALSE null --- FALSE TRUE null null --- null TRUE FALSE null --- \. 
- --- SELECT --- BOOL_AND(b1) AS "f", --- BOOL_AND(b2) AS "t", --- BOOL_AND(b3) AS "f", --- BOOL_AND(b4) AS "n", --- BOOL_AND(NOT b2) AS "f", --- BOOL_AND(NOT b3) AS "t" --- FROM bool_test; - --- SELECT --- EVERY(b1) AS "f", --- EVERY(b2) AS "t", --- EVERY(b3) AS "f", --- EVERY(b4) AS "n", --- EVERY(NOT b2) AS "f", --- EVERY(NOT b3) AS "t" --- FROM bool_test; - --- SELECT --- BOOL_OR(b1) AS "t", --- BOOL_OR(b2) AS "t", --- BOOL_OR(b3) AS "f", --- BOOL_OR(b4) AS "n", --- BOOL_OR(NOT b2) AS "f", --- BOOL_OR(NOT b3) AS "t" --- FROM bool_test; +SELECT BOOL_AND(b1) AS n1, BOOL_OR(b3) AS n2 FROM bool_test WHERE 1 = 0; + +SELECT + BOOL_AND(b1) AS f1, + BOOL_AND(b2) AS t2, + BOOL_AND(b3) AS f3, + BOOL_AND(b4) AS n4, + BOOL_AND(NOT b2) AS f5, + BOOL_AND(NOT b3) AS t6 +FROM bool_test; + +SELECT + EVERY(b1) AS f1, + EVERY(b2) AS t2, + EVERY(b3) AS f3, + EVERY(b4) AS n4, + EVERY(NOT b2) AS f5, + EVERY(NOT b3) AS t6 +FROM bool_test; + +SELECT + BOOL_OR(b1) AS t1, + BOOL_OR(b2) AS t2, + BOOL_OR(b3) AS f3, + BOOL_OR(b4) AS n4, + BOOL_OR(NOT b2) AS f5, + BOOL_OR(NOT b3) AS t6 +FROM bool_test; -- -- Test cases that should be optimized into indexscans instead of diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql similarity index 94% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part3.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql index 78fdbf6ae6cd2..746b677234832 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part3.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part3.sql @@ -5,6 +5,11 @@ -- AGGREGATES [Part 3] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L352-L605 +-- Test aggregate operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + -- [SPARK-28865] Table inheritance -- try it on an inheritance tree -- create table minmaxtest(f1 int); @@ -227,16 +232,16 @@ select max(min(unique1)) from tenk1; -- drop table bytea_test_table; --- [SPARK-27986] Support Aggregate Expressions with filter -- FILTER tests --- select min(unique1) filter (where unique1 > 100) from tenk1; +select min(unique1) filter (where unique1 > 100) from tenk1; --- select sum(1/ten) filter (where ten > 0) from tenk1; +select sum(1/ten) filter (where ten > 0) from tenk1; -- select ten, sum(distinct four) filter (where four::text ~ '123') from onek a -- group by ten; +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT -- select ten, sum(distinct four) filter (where four > 10) from onek a -- group by ten -- having exists (select 1 from onek b where sum(distinct a.four) = b.four); @@ -249,6 +254,7 @@ select max(min(unique1)) from tenk1; select (select count(*) from (values (1)) t0(inner_c)) from (values (2),(3)) t1(outer_c); -- inner query is aggregation query +-- [SPARK-30219] Support Filter expression reference the outer query -- select (select count(*) filter (where outer_c <> 0) -- from (values (1)) t0(inner_c)) -- from (values (2),(3)) t1(outer_c); -- outer query is aggregation query @@ -260,6 +266,7 @@ from (values (2),(3)) t1(outer_c); -- inner query is aggregation query -- filter (where o.unique1 < 10)) -- from tenk1 o; -- outer query is aggregation query +-- [SPARK-30220] Support Filter expression uses IN/EXISTS predicate sub-queries -- subquery in FILTER clause (PostgreSQL extension) -- select sum(unique1) FILTER (WHERE -- unique1 IN (SELECT unique1 FROM onek where unique1 < 100)) FROM tenk1; diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql similarity index 98% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part4.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql index 6fa2306cf1475..0d255bed24e9c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/aggregates_part4.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/aggregates_part4.sql @@ -5,6 +5,11 @@ -- AGGREGATES [Part 4] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L607-L997 +-- Test aggregate operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + -- [SPARK-27980] Ordered-Set Aggregate Functions -- ordered-set aggregates diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/boolean.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/boolean.sql similarity index 99% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/boolean.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/boolean.sql index 178823bcfe9d6..3a949c834deb5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/boolean.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/boolean.sql @@ -98,7 +98,6 @@ SELECT boolean('f') <= boolean('t') AS true; -- explicit casts to/from text SELECT boolean(string('TrUe')) AS true, boolean(string('fAlse')) AS `false`; - SELECT boolean(string(' true ')) AS true, boolean(string(' FALSE')) AS `false`; SELECT string(boolean(true)) AS true, string(boolean(false)) AS `false`; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/case.sql 
b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/case.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/case.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/case.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql new file mode 100644 index 0000000000000..1a454179ef79f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/comments.sql @@ -0,0 +1,50 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- COMMENTS +-- https://github.com/postgres/postgres/blob/REL_12_BETA3/src/test/regress/sql/comments.sql +-- + +SELECT 'trailing' AS first; -- trailing single line +SELECT /* embedded single line */ 'embedded' AS `second`; +SELECT /* both embedded and trailing single line */ 'both' AS third; -- trailing single line + +SELECT 'before multi-line' AS fourth; +--QUERY-DELIMITER-START +-- [SPARK-28880] ANSI SQL: Bracketed comments +/* This is an example of SQL which should not execute: + * select 'multi-line'; + */ +SELECT 'after multi-line' AS fifth; +--QUERY-DELIMITER-END + +-- [SPARK-28880] ANSI SQL: Bracketed comments +-- +-- Nested comments +-- +--QUERY-DELIMITER-START +/* +SELECT 'trailing' as x1; -- inside block comment +*/ + +/* This block comment surrounds a query which itself has a block comment... +SELECT /* embedded single line */ 'embedded' AS x2; +*/ + +SELECT -- continued after the following block comments... +/* Deeply nested comment. + This includes a single apostrophe to make sure we aren't decoding this part as a string. +SELECT 'deep nest' AS n1; +/* Second level of nesting... +SELECT 'deeper nest' as n2; +/* Third level of nesting... +SELECT 'deepest nest' as n3; +*/ +Hoo boy. Still two deep... +*/ +Now just one deep... 
+*/ +'deeply nested example' AS sixth; +--QUERY-DELIMITER-END +/* and this is the end of the file */ diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql new file mode 100644 index 0000000000000..39e708478e298 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/create_view.sql @@ -0,0 +1,779 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- CREATE VIEW +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/create_view.sql + +-- [SPARK-27764] Support geometric types +-- CREATE VIEW street AS +-- SELECT r.name, r.thepath, c.cname AS cname +-- FROM ONLY road r, real_city c +-- WHERE c.outline ## r.thepath; + +-- [SPARK-27764] Support geometric types +-- CREATE VIEW iexit AS +-- SELECT ih.name, ih.thepath, +-- interpt_pp(ih.thepath, r.thepath) AS exit +-- FROM ihighway ih, ramp r +-- WHERE ih.thepath ## r.thepath; + +CREATE TABLE emp ( + name string, + age int, + -- [SPARK-27764] Support geometric types + -- location point + salary int, + manager string +) USING parquet; + +CREATE VIEW toyemp AS + SELECT name, age, /* location ,*/ 12*salary AS annualsal + FROM emp; + +-- [SPARK-29659] Support COMMENT ON syntax +-- Test comments +-- COMMENT ON VIEW noview IS 'no view'; +-- COMMENT ON VIEW toyemp IS 'is a view'; +-- COMMENT ON VIEW toyemp IS NULL; + +DROP VIEW toyemp; +DROP TABLE emp; + +-- These views are left around mainly to exercise special cases in pg_dump. 
+ +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)); +-- +CREATE VIEW key_dependent_view AS + SELECT * FROM view_base_table GROUP BY key; +-- +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +-- ALTER TABLE view_base_table DROP CONSTRAINT view_base_table_pkey; -- fails + +CREATE VIEW key_dependent_view_no_cols AS + SELECT FROM view_base_table GROUP BY key HAVING length(data) > 0; + +-- +-- CREATE OR REPLACE VIEW +-- + +CREATE TABLE viewtest_tbl (a int, b int) using parquet; +-- [SPARK-29386] Copy data between a file and a table +-- COPY viewtest_tbl FROM stdin; +-- 5 10 +-- 10 15 +-- 15 20 +-- 20 25 +-- \. +INSERT INTO viewtest_tbl VALUES (5, 10), (10, 15), (15, 20), (20, 25); + +CREATE OR REPLACE VIEW viewtest AS + SELECT * FROM viewtest_tbl; + +CREATE OR REPLACE VIEW viewtest AS + SELECT * FROM viewtest_tbl WHERE a > 10; + +SELECT * FROM viewtest; + +CREATE OR REPLACE VIEW viewtest AS + SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC; + +SELECT * FROM viewtest; + +-- should fail +-- [SPARK-29660] Dropping columns and changing column names/types are prohibited in VIEW definition +CREATE OR REPLACE VIEW viewtest AS + SELECT a FROM viewtest_tbl WHERE a <> 20; + +-- should fail +-- [SPARK-29660] Dropping columns and changing column names/types are prohibited in VIEW definition +CREATE OR REPLACE VIEW viewtest AS + SELECT 1, * FROM viewtest_tbl; + +-- should fail +-- [SPARK-29660] Dropping columns and changing column names/types are prohibited in VIEW definition +CREATE OR REPLACE VIEW viewtest AS + SELECT a, decimal(b) FROM viewtest_tbl; + +-- should work +CREATE OR REPLACE VIEW viewtest AS + SELECT a, b, 0 AS c FROM viewtest_tbl; + +DROP VIEW viewtest; +DROP TABLE viewtest_tbl; + +-- tests for temporary views + +-- [SPARK-29661] Support cascaded syntax in CREATE SCHEMA +-- CREATE SCHEMA temp_view_test +-- CREATE 
TABLE base_table (a int, id int) using parquet +-- CREATE TABLE base_table2 (a int, id int) using parquet; +CREATE SCHEMA temp_view_test; +CREATE TABLE temp_view_test.base_table (a int, id int) using parquet; +CREATE TABLE temp_view_test.base_table2 (a int, id int) using parquet; + +-- Replace SET with USE +-- SET search_path TO temp_view_test, public; +USE temp_view_test; + +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TEMPORARY VIEW instead +-- CREATE TEMPORARY TABLE temp_table (a int, id int); +CREATE TEMPORARY VIEW temp_table AS SELECT * FROM VALUES + (1, 1) as temp_table(a, id); + +-- should be created in temp_view_test schema +CREATE VIEW v1 AS SELECT * FROM base_table; +DESC TABLE EXTENDED v1; +-- should be created in temp object schema +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW v1_temp AS SELECT * FROM temp_table; +-- should be created in temp object schema +CREATE TEMP VIEW v2_temp AS SELECT * FROM base_table; +DESC TABLE EXTENDED v2_temp; +-- should be created in temp_views schema +CREATE VIEW temp_view_test.v2 AS SELECT * FROM base_table; +DESC TABLE EXTENDED temp_view_test.v2; +-- should fail +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW temp_view_test.v3_temp AS SELECT * FROM temp_table; +-- should fail +-- [SPARK-29661] Support cascaded syntax in CREATE SCHEMA +-- CREATE SCHEMA test_view_schema +-- CREATE TEMP VIEW testview AS SELECT 1; + +-- joins: if any of the join relations are temporary, the view +-- should also be temporary + +-- should be non-temp +CREATE VIEW v3 AS + SELECT t1.a AS t1_a, t2.a AS t2_a + FROM base_table t1, base_table2 t2 + WHERE t1.id = t2.id; +DESC TABLE EXTENDED v3; +-- should be temp (one join rel is temp) +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW v4_temp AS + SELECT t1.a AS t1_a, t2.a AS t2_a + FROM 
base_table t1, temp_table t2 + WHERE t1.id = t2.id; +-- should be temp +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW v5_temp AS + SELECT t1.a AS t1_a, t2.a AS t2_a, t3.a AS t3_a + FROM base_table t1, base_table2 t2, temp_table t3 + WHERE t1.id = t2.id and t2.id = t3.id; + +-- subqueries +CREATE VIEW v4 AS SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2); +DESC TABLE EXTENDED v4; +CREATE VIEW v5 AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2; +DESC TABLE EXTENDED v5; +CREATE VIEW v6 AS SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2); +DESC TABLE EXTENDED v6; +CREATE VIEW v7 AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2); +DESC TABLE EXTENDED v7; +CREATE VIEW v8 AS SELECT * FROM base_table WHERE EXISTS (SELECT 1); +DESC TABLE EXTENDED v8; + +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW v6_temp AS SELECT * FROM base_table WHERE id IN (SELECT id FROM temp_table); +CREATE VIEW v7_temp AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM temp_table) t2; +CREATE VIEW v8_temp AS SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM temp_table); +CREATE VIEW v9_temp AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM temp_table); + +-- a view should also be temporary if it references a temporary view +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW v10_temp AS SELECT * FROM v7_temp; +CREATE VIEW v11_temp AS SELECT t1.id, t2.a FROM base_table t1, v10_temp t2; +CREATE VIEW v12_temp AS SELECT true FROM v11_temp; + +-- [SPARK-27764] Support ANSI SQL CREATE SEQUENCE +-- a view should also be temporary if it references a temporary sequence +-- CREATE SEQUENCE seq1; +-- CREATE TEMPORARY SEQUENCE seq1_temp; +-- CREATE VIEW v9 AS SELECT seq1.is_called FROM seq1; +-- CREATE VIEW v13_temp AS SELECT 
seq1_temp.is_called FROM seq1_temp; + +-- Skip the tests below because of PostgreSQL specific cases +-- SELECT relname FROM pg_class +-- WHERE relname LIKE 'v_' +-- AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'temp_view_test') +-- ORDER BY relname; +-- SELECT relname FROM pg_class +-- WHERE relname LIKE 'v%' +-- AND relnamespace IN (SELECT oid FROM pg_namespace WHERE nspname LIKE 'pg_temp%') +-- ORDER BY relname; + +CREATE SCHEMA testviewschm2; +-- Replace SET with USE +-- SET search_path TO testviewschm2, public; +USE testviewschm2; + +CREATE TABLE t1 (num int, name string) using parquet; +CREATE TABLE t2 (num2 int, value string) using parquet; +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TEMPORARY VIEW instead +-- CREATE TEMP TABLE tt (num2 int, value string); +CREATE TEMP VIEW tt AS SELECT * FROM VALUES + (1, 'a') AS tt(num2, value); + +CREATE VIEW nontemp1 AS SELECT * FROM t1 CROSS JOIN t2; +DESC TABLE EXTENDED nontemp1; +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW temporal1 AS SELECT * FROM t1 CROSS JOIN tt; +CREATE VIEW nontemp2 AS SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2; +DESC TABLE EXTENDED nontemp2; +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW temporal2 AS SELECT * FROM t1 INNER JOIN tt ON t1.num = tt.num2; +CREATE VIEW nontemp3 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2; +DESC TABLE EXTENDED nontemp3; +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW temporal3 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num = tt.num2; +CREATE VIEW nontemp4 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx'; +DESC TABLE EXTENDED nontemp4; +-- [SPARK-29628] Forcibly create a temporary view in CREATE VIEW if referencing a temporary view +CREATE VIEW temporal4 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num 
= tt.num2 AND tt.value = 'xxx'; +CREATE VIEW temporal5 AS SELECT * FROM t1 WHERE num IN (SELECT num FROM t1 WHERE EXISTS (SELECT 1 FROM tt)); + +-- Skip the tests below because of PostgreSQL specific cases +-- SELECT relname FROM pg_class +-- WHERE relname LIKE 'nontemp%' +-- AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = 'testviewschm2') +-- ORDER BY relname; +-- SELECT relname FROM pg_class +-- WHERE relname LIKE 'temporal%' +-- AND relnamespace IN (SELECT oid FROM pg_namespace WHERE nspname LIKE 'pg_temp%') +-- ORDER BY relname; + +CREATE TABLE tbl1 ( a int, b int) using parquet; +CREATE TABLE tbl2 (c int, d int) using parquet; +CREATE TABLE tbl3 (e int, f int) using parquet; +CREATE TABLE tbl4 (g int, h int) using parquet; +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TABLE instead +-- CREATE TEMP TABLE tmptbl (i int, j int); +CREATE TABLE tmptbl (i int, j int) using parquet; +INSERT INTO tmptbl VALUES (1, 1); + +--Should be in testviewschm2 +CREATE VIEW pubview AS SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f); +DESC TABLE EXTENDED pubview; + +-- Skip the test below because of PostgreSQL specific cases +-- SELECT count(*) FROM pg_class where relname = 'pubview' +-- AND relnamespace IN (SELECT OID FROM pg_namespace WHERE nspname = 'testviewschm2'); + +--Should be in temp object schema +CREATE VIEW mytempview AS SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j); +DESC TABLE EXTENDED mytempview; + +-- Skip the test below because of PostgreSQL specific cases +-- SELECT count(*) FROM pg_class where relname LIKE 'mytempview' +-- And relnamespace IN (SELECT OID FROM pg_namespace WHERE nspname LIKE 
'pg_temp%'); + +-- +-- CREATE VIEW and WITH(...) clause +-- CREATE VIEW mysecview1 +-- AS SELECT * FROM tbl1 WHERE a = 0; +-- +-- Skip the tests below because Spark doesn't support `WITH options` +-- CREATE VIEW mysecview2 WITH (security_barrier=true) +-- AS SELECT * FROM tbl1 WHERE a > 0; +-- CREATE VIEW mysecview3 WITH (security_barrier=false) +-- AS SELECT * FROM tbl1 WHERE a < 0; +-- CREATE VIEW mysecview4 WITH (security_barrier) +-- AS SELECT * FROM tbl1 WHERE a <> 0; +-- Spark cannot support options in WITH clause +-- CREATE VIEW mysecview5 WITH (security_barrier=100) -- Error +-- AS SELECT * FROM tbl1 WHERE a > 100; +-- CREATE VIEW mysecview6 WITH (invalid_option) -- Error +-- AS SELECT * FROM tbl1 WHERE a < 100; +-- Skip the test below because of PostgreSQL specific cases +-- SELECT relname, relkind, reloptions FROM pg_class +-- WHERE oid in ('mysecview1'::regclass, 'mysecview2'::regclass, +-- 'mysecview3'::regclass, 'mysecview4'::regclass) +-- ORDER BY relname; + +-- CREATE OR REPLACE VIEW mysecview1 +-- AS SELECT * FROM tbl1 WHERE a = 256; +-- CREATE OR REPLACE VIEW mysecview2 +-- AS SELECT * FROM tbl1 WHERE a > 256; +-- CREATE OR REPLACE VIEW mysecview3 WITH (security_barrier=true) +-- AS SELECT * FROM tbl1 WHERE a < 256; +-- CREATE OR REPLACE VIEW mysecview4 WITH (security_barrier=false) +-- AS SELECT * FROM tbl1 WHERE a <> 256; +-- Skip the test below because of PostgreSQL specific cases +-- SELECT relname, relkind, reloptions FROM pg_class +-- WHERE oid in ('mysecview1'::regclass, 'mysecview2'::regclass, +-- 'mysecview3'::regclass, 'mysecview4'::regclass) +-- ORDER BY relname; + +-- Check that unknown literals are converted to "text" in CREATE VIEW, +-- so that we don't end up with unknown-type columns. 
+ +-- Skip the tests below because of PostgreSQL specific cases +-- CREATE VIEW unspecified_types AS +-- SELECT 42 as i, 42.5 as num, 'foo' as u, 'foo'::unknown as u2, null as n; +-- \d+ unspecified_types +-- SELECT * FROM unspecified_types; + +-- This test checks that proper typmods are assigned in a multi-row VALUES + +CREATE VIEW tt1 AS + SELECT * FROM ( + VALUES + ('abc', '0123456789', 42, 'abcd'), + ('0123456789', 'abc', 42.12, 'abc') + ) vv(a,b,c,d); +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ tt1 +SELECT * FROM tt1; +SELECT string(a) FROM tt1; +DROP VIEW tt1; + +-- Test view decompilation in the face of relation renaming conflicts + +CREATE TABLE tt1 (f1 int, f2 int, f3 string) using parquet; +CREATE TABLE tx1 (x1 int, x2 int, x3 string) using parquet; +CREATE TABLE temp_view_test.tt1 (y1 int, f2 int, f3 string) using parquet; + +CREATE VIEW aliased_view_1 AS + select * from tt1 + where exists (select 1 from tx1 where tt1.f1 = tx1.x1); +CREATE VIEW aliased_view_2 AS + select * from tt1 a1 + where exists (select 1 from tx1 where a1.f1 = tx1.x1); +CREATE VIEW aliased_view_3 AS + select * from tt1 + where exists (select 1 from tx1 a2 where tt1.f1 = a2.x1); +CREATE VIEW aliased_view_4 AS + select * from temp_view_test.tt1 + where exists (select 1 from tt1 where temp_view_test.tt1.y1 = tt1.f1); + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ aliased_view_1 +DESC TABLE aliased_view_1; +-- \d+ aliased_view_2 +DESC TABLE aliased_view_2; +-- \d+ aliased_view_3 +DESC TABLE aliased_view_3; +-- \d+ aliased_view_4 +DESC TABLE aliased_view_4; + +ALTER TABLE tx1 RENAME TO a1; + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ aliased_view_1 +DESC TABLE aliased_view_1; +-- \d+ aliased_view_2 +DESC TABLE aliased_view_2; +-- \d+ aliased_view_3 +DESC TABLE aliased_view_3; +-- \d+ aliased_view_4 +DESC TABLE aliased_view_4; + +ALTER TABLE tt1 RENAME TO a2; + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ 
aliased_view_1 +DESC TABLE aliased_view_1; +-- \d+ aliased_view_2 +DESC TABLE aliased_view_2; +-- \d+ aliased_view_3 +DESC TABLE aliased_view_3; +-- \d+ aliased_view_4 +DESC TABLE aliased_view_4; + +ALTER TABLE a1 RENAME TO tt1; + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ aliased_view_1 +DESC TABLE aliased_view_1; +-- \d+ aliased_view_2 +DESC TABLE aliased_view_2; +-- \d+ aliased_view_3 +DESC TABLE aliased_view_3; +-- \d+ aliased_view_4 +DESC TABLE aliased_view_4; + +ALTER TABLE a2 RENAME TO tx1; +-- [SPARK-29632] Support ALTER TABLE [relname] SET SCHEMA [dbname] +-- ALTER TABLE tx1 SET SCHEMA temp_view_test; + +-- \d+ aliased_view_1 +-- \d+ aliased_view_2 +-- \d+ aliased_view_3 +-- \d+ aliased_view_4 + +-- [SPARK-29632] Support ALTER TABLE [relname] SET SCHEMA [dbname] +-- ALTER TABLE temp_view_test.tt1 RENAME TO tmp1; +-- ALTER TABLE temp_view_test.tmp1 SET SCHEMA testviewschm2; +-- ALTER TABLE tmp1 RENAME TO tx1; + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ aliased_view_1 +-- \d+ aliased_view_2 +-- \d+ aliased_view_3 +-- \d+ aliased_view_4 + +-- Test aliasing of joins + +create view view_of_joins as +select * from + (select * from (tbl1 cross join tbl2) same) ss, + (tbl3 cross join tbl4) same; + +-- Replace the PostgreSQL meta command `\d` with `DESC` +-- \d+ view_of_joins + +-- Test view decompilation in the face of column addition/deletion/renaming + +create table tt2 (a int, b int, c int) using parquet; +create table tt3 (ax bigint, b short, c decimal) using parquet; +create table tt4 (ay int, b int, q int) using parquet; + +create view v1 as select * from tt2 natural join tt3; +create view v1a as select * from (tt2 natural join tt3) j; +create view v2 as select * from tt2 join tt3 using (b,c) join tt4 using (b); +create view v2a as select * from (tt2 join tt3 using (b,c) join tt4 using (b)) j; +create view v3 as select * from tt2 join tt3 using (b,c) full join tt4 using (b); + +-- Replace `pg_get_viewdef` with 
`DESC` +-- select pg_get_viewdef('v1', true); +DESC TABLE v1; +-- select pg_get_viewdef('v1a', true); +DESC TABLE v1a; +-- select pg_get_viewdef('v2', true); +DESC TABLE v2; +-- select pg_get_viewdef('v2a', true); +DESC TABLE v2a; +-- select pg_get_viewdef('v3', true); +DESC TABLE v3; + +alter table tt2 add column d int; +alter table tt2 add column e int; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('v1', true); +DESC TABLE v1; +-- select pg_get_viewdef('v1a', true); +DESC TABLE v1a; +-- select pg_get_viewdef('v2', true); +DESC TABLE v2; +-- select pg_get_viewdef('v2a', true); +DESC TABLE v2a; +-- select pg_get_viewdef('v3', true); +DESC TABLE v3; + +-- [SPARK-27764] Make COLUMN optional in ALTER TABLE +-- [SPARK-27589] Spark file source V2 (For supporting RENAME COLUMN in ALTER TABLE) +-- alter table tt3 rename c to d; +drop table tt3; +create table tt3 (ax bigint, b short, d decimal) using parquet; + +-- select pg_get_viewdef('v1', true); +-- select pg_get_viewdef('v1a', true); +-- select pg_get_viewdef('v2', true); +-- select pg_get_viewdef('v2a', true); +-- select pg_get_viewdef('v3', true); + +alter table tt3 add column c int; +alter table tt3 add column e int; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('v1', true); +DESC TABLE v1; +-- select pg_get_viewdef('v1a', true); +DESC TABLE v1a; +-- select pg_get_viewdef('v2', true); +DESC TABLE v2; +-- select pg_get_viewdef('v2a', true); +DESC TABLE v2a; +-- select pg_get_viewdef('v3', true); +DESC TABLE v3; + +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +-- alter table tt2 drop column d; + +-- select pg_get_viewdef('v1', true); +-- select pg_get_viewdef('v1a', true); +-- select pg_get_viewdef('v2', true); +-- select pg_get_viewdef('v2a', true); +-- select pg_get_viewdef('v3', true); + +create table tt5 (a int, b int) using parquet; +create table tt6 (c int, d int) using parquet; +create view vv1 as select * from (tt5 cross join 
tt6) j(aa,bb,cc,dd); +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv1', true); +DESC TABLE vv1; +alter table tt5 add column c int; +-- select pg_get_viewdef('vv1', true); +DESC TABLE vv1; +alter table tt5 add column cc int; +-- select pg_get_viewdef('vv1', true); +DESC TABLE vv1; +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +-- alter table tt5 drop column c; +-- select pg_get_viewdef('vv1', true); + +-- Unnamed FULL JOIN USING is lots of fun too + +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +create table tt7 (x int, /* xx int, */ y int) using parquet; +-- alter table tt7 drop column xx; +create table tt8 (x int, z int) using parquet; + +create view vv2 as +select * from (values(1,2,3,4,5)) v(a,b,c,d,e) +union all +select * from tt7 full join tt8 using (x), tt8 tt8x; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv2', true); +DESC TABLE vv2; + +create view vv3 as +select * from (values(1,2,3,4,5,6)) v(a,b,c,x,e,f) +union all +select * from + tt7 full join tt8 using (x), + tt7 tt7x full join tt8 tt8x using (x); + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv3', true); +DESC TABLE vv3; + +create view vv4 as +select * from (values(1,2,3,4,5,6,7)) v(a,b,c,x,e,f,g) +union all +select * from + tt7 full join tt8 using (x), + tt7 tt7x full join tt8 tt8x using (x) full join tt8 tt8y using (x); + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv4', true); +DESC TABLE vv4; + +alter table tt7 add column zz int; +alter table tt7 add column z int; +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +-- alter table tt7 drop column zz; +alter table tt8 add column z2 int; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv2', true); +DESC TABLE vv2; +-- select pg_get_viewdef('vv3', true); +DESC TABLE vv3; +-- select pg_get_viewdef('vv4', true); +DESC TABLE vv4; + 
+-- Implicit coercions in a JOIN USING create issues similar to FULL JOIN + +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +create table tt7a (x date, /* xx int, */ y int) using parquet; +-- alter table tt7a drop column xx; +create table tt8a (x timestamp, z int) using parquet; + +-- To pass the query, added exact column names in the select stmt +create view vv2a as +select * from (values(now(),2,3,now(),5)) v(a,b,c,d,e) +union all +select * from tt7a left join tt8a using (x), tt8a tt8ax; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv4', true); +DESC TABLE vv4; +-- select pg_get_viewdef('vv2a', true); +DESC TABLE vv2a; + +-- +-- Also check dropping a column that existed when the view was made +-- + +create table tt9 (x int, xx int, y int) using parquet; +create table tt10 (x int, z int) using parquet; + +create view vv5 as select x,y,z from tt9 join tt10 using(x); + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv5', true); +DESC TABLE vv5; + +-- [SPARK-27589] Spark file source V2 (For supporting DROP COLUMN in ALTER TABLE) +-- alter table tt9 drop column xx; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv5', true); +DESC TABLE vv5; + +-- +-- Another corner case is that we might add a column to a table below a +-- JOIN USING, and thereby make the USING column name ambiguous +-- + +create table tt11 (x int, y int) using parquet; +create table tt12 (x int, z int) using parquet; +create table tt13 (z int, q int) using parquet; + +create view vv6 as select x,y,z,q from + (tt11 join tt12 using(x)) join tt13 using(z); + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv6', true); +DESC TABLE vv6; + +alter table tt11 add column z int; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('vv6', true); +DESC TABLE vv6; + +-- +-- Check cases involving dropped/altered columns in a function's rowtype result +-- + +-- Skip the tests 
below because Spark doesn't support PostgreSQL-specific UDFs/transactions +-- create table tt14t (f1 text, f2 text, f3 text, f4 text); +-- insert into tt14t values('foo', 'bar', 'baz', '42'); +-- +-- alter table tt14t drop column f2; +-- +-- create function tt14f() returns setof tt14t as +-- $$ +-- declare +-- rec1 record; +-- begin +-- for rec1 in select * from tt14t +-- loop +-- return next rec1; +-- end loop; +-- end; +-- $$ +-- language plpgsql; +-- +-- create view tt14v as select t.* from tt14f() t; +-- +-- select pg_get_viewdef('tt14v', true); +-- select * from tt14v; +-- +-- begin; +-- +-- -- this perhaps should be rejected, but it isn't: +-- alter table tt14t drop column f3; +-- +-- -- f3 is still in the view ... +-- select pg_get_viewdef('tt14v', true); +-- -- but will fail at execution +-- select f1, f4 from tt14v; +-- select * from tt14v; +-- +-- rollback; +-- +-- begin; +-- +-- -- this perhaps should be rejected, but it isn't: +-- alter table tt14t alter column f4 type integer using f4::integer; +-- +-- -- f4 is still in the view ... 
+-- select pg_get_viewdef('tt14v', true); +-- -- but will fail at execution +-- select f1, f3 from tt14v; +-- select * from tt14v; +-- +-- rollback; + +-- check display of whole-row variables in some corner cases + +-- Skip the tests below because we do not support creating types +-- create type nestedcomposite as (x int8_tbl); +-- create view tt15v as select row(i)::nestedcomposite from int8_tbl i; +-- select * from tt15v; +-- select pg_get_viewdef('tt15v', true); +-- select row(i.*::int8_tbl)::nestedcomposite from int8_tbl i; +-- +-- create view tt16v as select * from int8_tbl i, lateral(values(i)) ss; +-- select * from tt16v; +-- select pg_get_viewdef('tt16v', true); +-- select * from int8_tbl i, lateral(values(i.*::int8_tbl)) ss; +-- +-- create view tt17v as select * from int8_tbl i where i in (values(i)); +-- select * from tt17v; +-- select pg_get_viewdef('tt17v', true); +-- select * from int8_tbl i where i.* in (values(i.*::int8_tbl)); + +-- check unique-ification of overlength names + +CREATE TABLE int8_tbl (q1 int, q2 int) USING parquet; + +create view tt18v as + select * from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy + union all + select * from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz; +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('tt18v', true); +DESC TABLE tt18v; +-- explain (costs off) select * from tt18v; + +-- check display of ScalarArrayOp with a sub-select + +-- Skip the tests below because of PostgreSQL specific cases +-- select 'foo'::text = any(array['abc','def','foo']::text[]); +-- select 'foo'::text = any((select array['abc','def','foo']::text[])); -- fail +-- select 'foo'::text = any((select array['abc','def','foo']::text[])::text[]); +-- +-- create view tt19v as +-- select 'foo'::text = any(array['abc','def','foo']::text[]) c1, +-- 'foo'::text = any((select array['abc','def','foo']::text[])::text[]) c2; +-- select pg_get_viewdef('tt19v', true); + +-- check 
display of assorted RTE_FUNCTION expressions + +-- [SPARK-28682] ANSI SQL: Collation Support +-- create view tt20v as +-- select * from +-- coalesce(1,2) as c, +-- collation for ('x'::text) col, +-- current_date as d, +-- localtimestamp(3) as t, +-- cast(1+2 as int4) as i4, +-- cast(1+2 as int8) as i8; +-- select pg_get_viewdef('tt20v', true); + +-- corner cases with empty join conditions + +create view tt21v as +select * from tt5 natural inner join tt6; +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('tt21v', true); +DESC TABLE tt21v; + +create view tt22v as +select * from tt5 natural left join tt6; +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('tt22v', true); +DESC TABLE tt22v; + +-- check handling of views with immediately-renamed columns + +create view tt23v (col_a, col_b) as +select q1 as other_name1, q2 as other_name2 from int8_tbl +union +select 42, 43; + +-- Replace `pg_get_viewdef` with `DESC` +-- select pg_get_viewdef('tt23v', true); +DESC TABLE tt23v; +-- Skip the test below because of PostgreSQL specific cases +-- select pg_get_ruledef(oid, true) from pg_rewrite +-- where ev_class = 'tt23v'::regclass and ev_type = '1'; + +-- clean up all the random objects we made above +DROP SCHEMA temp_view_test CASCADE; +DROP SCHEMA testviewschm2 CASCADE; + +DROP VIEW temp_table; +DROP VIEW tt; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/date.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql similarity index 89% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/date.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql index b9a6b998e52fe..0bab2f884d976 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/date.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/date.sql @@ -7,23 +7,25 @@ CREATE TABLE DATE_TBL (f1 date) USING parquet; -INSERT INTO DATE_TBL VALUES ('1957-04-09'); -INSERT INTO DATE_TBL VALUES ('1957-06-13'); 
-INSERT INTO DATE_TBL VALUES ('1996-02-28'); -INSERT INTO DATE_TBL VALUES ('1996-02-29'); -INSERT INTO DATE_TBL VALUES ('1996-03-01'); -INSERT INTO DATE_TBL VALUES ('1996-03-02'); -INSERT INTO DATE_TBL VALUES ('1997-02-28'); +-- PostgreSQL implicitly casts string literals to data with date types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO DATE_TBL VALUES (date('1957-04-09')); +INSERT INTO DATE_TBL VALUES (date('1957-06-13')); +INSERT INTO DATE_TBL VALUES (date('1996-02-28')); +INSERT INTO DATE_TBL VALUES (date('1996-02-29')); +INSERT INTO DATE_TBL VALUES (date('1996-03-01')); +INSERT INTO DATE_TBL VALUES (date('1996-03-02')); +INSERT INTO DATE_TBL VALUES (date('1997-02-28')); -- [SPARK-27923] Skip invalid date: 1997-02-29 --- INSERT INTO DATE_TBL VALUES ('1997-02-29'); -INSERT INTO DATE_TBL VALUES ('1997-03-01'); -INSERT INTO DATE_TBL VALUES ('1997-03-02'); -INSERT INTO DATE_TBL VALUES ('2000-04-01'); -INSERT INTO DATE_TBL VALUES ('2000-04-02'); -INSERT INTO DATE_TBL VALUES ('2000-04-03'); -INSERT INTO DATE_TBL VALUES ('2038-04-08'); -INSERT INTO DATE_TBL VALUES ('2039-04-09'); -INSERT INTO DATE_TBL VALUES ('2040-04-10'); +-- INSERT INTO DATE_TBL VALUES (date('1997-02-29')); +INSERT INTO DATE_TBL VALUES (date('1997-03-01')); +INSERT INTO DATE_TBL VALUES (date('1997-03-02')); +INSERT INTO DATE_TBL VALUES (date('2000-04-01')); +INSERT INTO DATE_TBL VALUES (date('2000-04-02')); +INSERT INTO DATE_TBL VALUES (date('2000-04-03')); +INSERT INTO DATE_TBL VALUES (date('2038-04-08')); +INSERT INTO DATE_TBL VALUES (date('2039-04-09')); +INSERT INTO DATE_TBL VALUES (date('2040-04-10')); SELECT f1 AS `Fifteen` FROM DATE_TBL; @@ -208,20 +210,19 @@ SELECT date '5874898-01-01'; -- out of range SELECT f1 - date '2000-01-01' AS `Days From 2K` FROM DATE_TBL; --- [SPARK-28141] Date type can not accept special values --- SELECT f1 - date 'epoch' AS "Days From Epoch" FROM DATE_TBL; +SELECT f1 - date 'epoch' AS `Days From Epoch` FROM DATE_TBL; --- SELECT date 
'yesterday' - date 'today' AS "One day"; +SELECT date 'yesterday' - date 'today' AS `One day`; --- SELECT date 'today' - date 'tomorrow' AS "One day"; +SELECT date 'today' - date 'tomorrow' AS `One day`; --- SELECT date 'yesterday' - date 'tomorrow' AS "Two days"; +SELECT date 'yesterday' - date 'tomorrow' AS `Two days`; --- SELECT date 'tomorrow' - date 'today' AS "One day"; +SELECT date 'tomorrow' - date 'today' AS `One day`; --- SELECT date 'today' - date 'yesterday' AS "One day"; +SELECT date 'today' - date 'yesterday' AS `One day`; --- SELECT date 'tomorrow' - date 'yesterday' AS "Two days"; +SELECT date 'tomorrow' - date 'yesterday' AS `Two days`; -- [SPARK-28017] Enhance date EXTRACT -- @@ -290,7 +291,7 @@ SELECT DATE_TRUNC('DECADE', DATE '1993-12-25'); -- 1990-01-01 SELECT DATE_TRUNC('DECADE', DATE '0004-12-25'); -- 0001-01-01 BC SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')); -- 0011-01-01 BC --- [SPARK-28141] Date type can not accept special values +-- [SPARK-29006] Support special date/timestamp values `infinity`/`-infinity` -- -- test infinity -- diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/float4.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float4.sql similarity index 96% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/float4.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float4.sql index 058467695a608..2989569e219ff 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/float4.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float4.sql @@ -7,11 +7,13 @@ CREATE TABLE FLOAT4_TBL (f1 float) USING parquet; -INSERT INTO FLOAT4_TBL VALUES (' 0.0'); -INSERT INTO FLOAT4_TBL VALUES ('1004.30 '); -INSERT INTO FLOAT4_TBL VALUES (' -34.84 '); -INSERT INTO FLOAT4_TBL VALUES ('1.2345678901234e+20'); -INSERT INTO FLOAT4_TBL VALUES ('1.2345678901234e-20'); +-- PostgreSQL implicitly casts string literals to data with floating point types, but +-- 
Spark does not support that kind of implicit casts. +INSERT INTO FLOAT4_TBL VALUES (float(' 0.0')); +INSERT INTO FLOAT4_TBL VALUES (float('1004.30 ')); +INSERT INTO FLOAT4_TBL VALUES (float(' -34.84 ')); +INSERT INTO FLOAT4_TBL VALUES (float('1.2345678901234e+20')); +INSERT INTO FLOAT4_TBL VALUES (float('1.2345678901234e-20')); -- [SPARK-28024] Incorrect numeric values when out of range -- test for over and under flow diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/float8.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float8.sql similarity index 95% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/float8.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float8.sql index 957dabdebab4e..932cdb95fcf3a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/float8.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/float8.sql @@ -7,11 +7,13 @@ CREATE TABLE FLOAT8_TBL(f1 double) USING parquet; -INSERT INTO FLOAT8_TBL VALUES (' 0.0 '); -INSERT INTO FLOAT8_TBL VALUES ('1004.30 '); -INSERT INTO FLOAT8_TBL VALUES (' -34.84'); -INSERT INTO FLOAT8_TBL VALUES ('1.2345678901234e+200'); -INSERT INTO FLOAT8_TBL VALUES ('1.2345678901234e-200'); +-- PostgreSQL implicitly casts string literals to data with floating point types, but +-- Spark does not support that kind of implicit casts. 
+INSERT INTO FLOAT8_TBL VALUES (double(' 0.0 ')); +INSERT INTO FLOAT8_TBL VALUES (double('1004.30 ')); +INSERT INTO FLOAT8_TBL VALUES (double(' -34.84')); +INSERT INTO FLOAT8_TBL VALUES (double('1.2345678901234e+200')); +INSERT INTO FLOAT8_TBL VALUES (double('1.2345678901234e-200')); -- [SPARK-28024] Incorrect numeric values when out of range -- test for underflow and overflow handling @@ -227,15 +229,17 @@ SELECT atanh(double('NaN')); TRUNCATE TABLE FLOAT8_TBL; -INSERT INTO FLOAT8_TBL VALUES ('0.0'); +-- PostgreSQL implicitly casts string literals to data with floating point types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO FLOAT8_TBL VALUES (double('0.0')); -INSERT INTO FLOAT8_TBL VALUES ('-34.84'); +INSERT INTO FLOAT8_TBL VALUES (double('-34.84')); -INSERT INTO FLOAT8_TBL VALUES ('-1004.30'); +INSERT INTO FLOAT8_TBL VALUES (double('-1004.30')); -INSERT INTO FLOAT8_TBL VALUES ('-1.2345678901234e+200'); +INSERT INTO FLOAT8_TBL VALUES (double('-1.2345678901234e+200')); -INSERT INTO FLOAT8_TBL VALUES ('-1.2345678901234e-200'); +INSERT INTO FLOAT8_TBL VALUES (double('-1.2345678901234e-200')); SELECT '' AS five, * FROM FLOAT8_TBL; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql new file mode 100644 index 0000000000000..fc54d179f742c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/groupingsets.sql @@ -0,0 +1,562 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- GROUPING SETS +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/groupingsets.sql + +-- test data sources + +create temp view gstest1(a,b,v) + as values (1,1,10),(1,1,11),(1,2,12),(1,2,13),(1,3,14), + (2,3,15), + (3,3,16),(3,4,17), + (4,1,18),(4,1,19); + +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TABLE instead +-- create temp table gstest2 (a integer, b 
integer, c integer, d integer, +-- e integer, f integer, g integer, h integer); +create table gstest2 (a integer, b integer, c integer, d integer, + e integer, f integer, g integer, h integer) using parquet; +-- [SPARK-29386] Copy data between a file and a table +-- copy gstest2 from stdin; +-- 1 1 1 1 1 1 1 1 +-- 1 1 1 1 1 1 1 2 +-- 1 1 1 1 1 1 2 2 +-- 1 1 1 1 1 2 2 2 +-- 1 1 1 1 2 2 2 2 +-- 1 1 1 2 2 2 2 2 +-- 1 1 2 2 2 2 2 2 +-- 1 2 2 2 2 2 2 2 +-- 2 2 2 2 2 2 2 2 +-- \. +insert into gstest2 values + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 2), + (1, 1, 1, 1, 1, 1, 2, 2), + (1, 1, 1, 1, 1, 2, 2, 2), + (1, 1, 1, 1, 2, 2, 2, 2), + (1, 1, 1, 2, 2, 2, 2, 2), + (1, 1, 2, 2, 2, 2, 2, 2), + (1, 2, 2, 2, 2, 2, 2, 2), + (2, 2, 2, 2, 2, 2, 2, 2); + +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TABLE instead +-- create temp table gstest3 (a integer, b integer, c integer, d integer); +create table gstest3 (a integer, b integer, c integer, d integer) using parquet; +-- [SPARK-29386] Copy data between a file and a table +-- copy gstest3 from stdin; +-- 1 1 1 1 +-- 2 2 2 2 +-- \. 
+insert into gstest3 values + (1, 1, 1, 1), + (2, 2, 2, 2); +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +-- alter table gstest3 add primary key (a); + +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TABLE instead +-- create temp table gstest4(id integer, v integer, +-- unhashable_col bit(4), unsortable_col xid); +-- [SPARK-29697] Support bit string types/literals +create table gstest4(id integer, v integer, + unhashable_col /* bit(4) */ byte, unsortable_col /* xid */ integer) using parquet; +insert into gstest4 +-- values (1,1,b'0000','1'), (2,2,b'0001','1'), +-- (3,4,b'0010','2'), (4,8,b'0011','2'), +-- (5,16,b'0000','2'), (6,32,b'0001','2'), +-- (7,64,b'0010','1'), (8,128,b'0011','1'); +values (1,1,tinyint('0'),1), (2,2,tinyint('1'),1), + (3,4,tinyint('2'),2), (4,8,tinyint('3'),2), + (5,16,tinyint('0'),2), (6,32,tinyint('1'),2), + (7,64,tinyint('2'),1), (8,128,tinyint('3'),1); + +-- Since Spark doesn't support CREATE TEMPORARY TABLE, we used CREATE TABLE instead +-- create temp table gstest_empty (a integer, b integer, v integer); +create table gstest_empty (a integer, b integer, v integer) using parquet; + +-- Spark doesn't handle UDFs in SQL +-- create function gstest_data(v integer, out a integer, out b integer) +-- returns setof record +-- as $f$ +-- begin +-- return query select v, i from generate_series(1,3) i; +-- end; +-- $f$ language plpgsql; + +-- basic functionality + +-- Ignore a PostgreSQL-specific option +-- set enable_hashagg = false; -- test hashing explicitly later + +-- simple rollup with multiple plain aggregates, with and without ordering +-- (and with ordering differing from grouping) + +-- [SPARK-29698] Support grouping function with multiple arguments +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b); +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, 
grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by a,b; +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by b desc, a; +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by coalesce(a,0)+coalesce(b,0); + +-- [SPARK-28664] ORDER BY in aggregate function +-- various types of ordered aggs +-- select a, b, grouping(a,b), +-- array_agg(v order by v), +-- string_agg(string(v:text, ':' order by v desc), +-- percentile_disc(0.5) within group (order by v), +-- rank(1,2,12) within group (order by a,b,v) +-- from gstest1 group by rollup (a,b) order by a,b; + +-- [SPARK-28664] ORDER BY in aggregate function +-- test usage of grouped columns in direct args of aggs +-- select grouping(a), a, array_agg(b), +-- rank(a) within group (order by b nulls first), +-- rank(a) within group (order by b nulls last) +-- from (values (1,1),(1,4),(1,5),(3,1),(3,2)) v(a,b) +-- group by rollup (a) order by a; + +-- nesting with window functions +-- [SPARK-29699] Different answers in nested aggregates with window functions +select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum + from gstest2 group by rollup (a,b) order by rsum, a, b; + +-- [SPARK-29700] Support nested grouping sets +-- nesting with grouping sets +-- select sum(c) from gstest2 +-- group by grouping sets((), grouping sets((), grouping sets(()))) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets((), grouping sets((), grouping sets(((a, b))))) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets(grouping sets(rollup(c), grouping sets(cube(c)))) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets(a, grouping sets(a, cube(b))) +-- order by 1 desc; +-- select sum(c) from gstest2 
+-- group by grouping sets(grouping sets((a, (b)))) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets(grouping sets((a, b))) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets(grouping sets(a, grouping sets(a), a)) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets(grouping sets(a, grouping sets(a, grouping sets(a), ((a)), a, grouping sets(a), (a)), a)) +-- order by 1 desc; +-- select sum(c) from gstest2 +-- group by grouping sets((a,(a,b)), grouping sets((a,(a,b)),a)) +-- order by 1 desc; + +-- empty input: first is 0 rows, second 1, third 3 etc. +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a); +-- [SPARK-29701] Different answers when empty input given in GROUPING SETS +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),()); +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),()); +select sum(v), count(*) from gstest_empty group by grouping sets ((),(),()); + +-- empty input with joins tests some important code paths +-- [SPARK-29701] Different answers when empty input given in GROUPING SETS +select t1.a, t2.b, sum(t1.v), count(*) from gstest_empty t1, gstest_empty t2 + group by grouping sets ((t1.a,t2.b),()); + +-- simple joins, var resolution, GROUPING on join vars +-- [SPARK-29698] Support grouping function with multiple arguments +-- select t1.a, t2.b, grouping(t1.a, t2.b), sum(t1.v), max(t2.a) +select t1.a, t2.b, grouping(t1.a), grouping(t2.b), sum(t1.v), max(t2.a) + from gstest1 t1, gstest2 t2 + group by grouping sets ((t1.a, t2.b), ()); + +-- [SPARK-29698] Support grouping function with multiple arguments +-- select t1.a, t2.b, grouping(t1.a, t2.b), sum(t1.v), max(t2.a) +select t1.a, t2.b, grouping(t1.a), grouping(t2.b), sum(t1.v), max(t2.a) + from gstest1 t1 join gstest2 t2 on (t1.a=t2.a) + group by grouping sets ((t1.a, t2.b), ()); + +-- [SPARK-29698] Support grouping function 
with multiple arguments +-- select a, b, grouping(a, b), sum(t1.v), max(t2.c) +select a, b, grouping(a), grouping(b), sum(t1.v), max(t2.c) + from gstest1 t1 join gstest2 t2 using (a,b) + group by grouping sets ((a, b), ()); + +-- check that functionally dependent cols are not nulled +-- [SPARK-29698] Support grouping function with multiple arguments +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +-- [SPARK-29702] Resolve group-by columns with functional dependencies +-- select a, d, grouping(a,b,c) +-- from gstest3 +-- group by grouping sets ((a,b), (a,c)); + +-- check that distinct grouping columns are kept separate +-- even if they are equal() +-- explain (costs off) +-- select g as alias1, g as alias2 +-- from generate_series(1,3) g +-- group by alias1, rollup(alias2); + +-- [SPARK-27767] Built-in function: generate_series +-- [SPARK-29704] Support the combinations of grouping operations +-- select g as alias1, g as alias2 +-- from generate_series(1,3) g +-- group by alias1, rollup(alias2); + +-- check that pulled-up subquery outputs still go to null when appropriate +select four, x + from (select four, ten, 'foo' as x from tenk1) as t + group by grouping sets (four, x) + having x = 'foo'; + +select four, x || 'x' + from (select four, ten, 'foo' as x from tenk1) as t + group by grouping sets (four, x) + order by four; + +select (x+y)*1, sum(z) + from (select 1 as x, 2 as y, 3 as z) s + group by grouping sets (x+y, x); + +CREATE TEMP VIEW int8_tbl AS SELECT * FROM VALUES + (123L, 456L), + (123L, 4567890123456789L), + (4567890123456789L, 123L), + (4567890123456789L, 4567890123456789L), + (4567890123456789L, -4567890123456789L) as int8_tbl(q1, q2); + +select x, not x as not_x, q2 from + (select *, q1 = 1 as x from int8_tbl i1) as t + group by grouping sets(x, q2) + order by x, q2; + +DROP VIEW int8_tbl; + +-- simple rescan tests + +-- Spark doesn't handle UDFs in SQL +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), 
gstest_data(v.x) +-- group by rollup (a,b); + +-- Spark doesn't handle UDFs in SQL +-- select * +-- from (values (1),(2)) v(x), +-- lateral (select a, b, sum(v.x) from gstest_data(v.x) group by rollup (a,b)) s; + +-- min max optimization should still work with GROUP BY () +-- explain (costs off) +-- select min(unique1) from tenk1 GROUP BY (); + +-- Views with GROUPING SET queries +-- [SPARK-29698] Support grouping function with multiple arguments +-- [SPARK-29705] Support more expressive forms in GroupingSets/Cube/Rollup +-- CREATE VIEW gstest_view AS select a, b, grouping(a,b), sum(c), count(*), max(c) +-- from gstest2 group by rollup ((a,b,c),(c,d)); + +-- select pg_get_viewdef('gstest_view'::regclass, true); + +-- Nested queries with 3 or more levels of nesting +-- [SPARK-29698] Support grouping function with multiple arguments +-- [SPARK-29703] grouping() can only be used with GroupingSets/Cube/Rollup +-- select(select (select grouping(a,b) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY ROLLUP(e,f); +-- select(select (select grouping(e,f) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY ROLLUP(e,f); +-- select(select (select grouping(c) from (values (1)) v2(c) GROUP BY c) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY ROLLUP(e,f); + +-- Combinations of operations +-- [SPARK-29704] Support the combinations of grouping operations +-- select a, b, c, d from gstest2 group by rollup(a,b),grouping sets(c,d); +-- select a, b from (values (1,2),(2,3)) v(a,b) group by a,b, grouping sets(a); + +-- Spark doesn't handle UDFs in SQL +-- Tests for chained aggregates +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +-- from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6; +-- select(select (select grouping(a,b) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) 
v3(e,f) GROUP BY ROLLUP((e+1),(f+1)); +-- select(select (select grouping(a,b) from (values (1)) v2(c)) from (values (1,2)) v1(a,b) group by (a,b)) from (values(6,7)) v3(e,f) GROUP BY CUBE((e+1),(f+1)) ORDER BY (e+1),(f+1); +-- select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum +-- from gstest2 group by cube (a,b) order by rsum, a, b; +-- select a, b, sum(c) from (values (1,1,10),(1,1,11),(1,2,12),(1,2,13),(1,3,14),(2,3,15),(3,3,16),(3,4,17),(4,1,18),(4,1,19)) v(a,b,c) group by rollup (a,b); +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), gstest_data(v.x) +-- group by cube (a,b) order by a,b; + +-- Test reordering of grouping sets +-- explain (costs off) +-- select * from gstest1 group by grouping sets((a,b,v),(v)) order by v,b,a; + +-- [SPARK-29698] Support grouping function with multiple arguments +-- [SPARK-29703] grouping() can only be used with GroupingSets/Cube/Rollup +-- Agg level check. This query should error out. +-- select (select grouping(a), grouping(b) from gstest2) from gstest2 group by a,b; + +--Nested queries +-- [SPARK-29700] Support nested grouping sets +-- select a, b, sum(c), count(*) from gstest2 group by grouping sets (rollup(a,b),a); + +-- HAVING queries +select ten, sum(distinct four) from onek a +group by grouping sets((ten,four),(ten)) +having exists (select 1 from onek b where sum(distinct a.four) = b.four); + +-- Tests around pushdown of HAVING clauses, partially testing against previous bugs +select a,count(*) from gstest2 group by rollup(a) order by a; +select a,count(*) from gstest2 group by rollup(a) having a is distinct from 1 order by a; +-- explain (costs off) +-- select a,count(*) from gstest2 group by rollup(a) having a is distinct from 1 order by a; + +-- [SPARK-29706] Support an empty grouping expression +-- select v.c, (select count(*) from gstest2 group by () having v.c) +-- from (values (false),(true)) v(c) order by v.c; +-- explain (costs off) +-- select v.c, (select count(*) from gstest2 group by () 
having v.c) +-- from (values (false),(true)) v(c) order by v.c; + +-- HAVING with GROUPING queries +select ten, grouping(ten) from onek +group by grouping sets(ten) having grouping(ten) >= 0 +order by 2,1; +select ten, grouping(ten) from onek +group by grouping sets(ten, four) having grouping(ten) > 0 +order by 2,1; +select ten, grouping(ten) from onek +group by rollup(ten) having grouping(ten) > 0 +order by 2,1; +select ten, grouping(ten) from onek +group by cube(ten) having grouping(ten) > 0 +order by 2,1; +-- [SPARK-29703] grouping() can only be used with GroupingSets/Cube/Rollup +-- select ten, grouping(ten) from onek +-- group by (ten) having grouping(ten) >= 0 +-- order by 2,1; + +-- FILTER queries +-- [SPARK-30276] Support Filter expression allows simultaneous use of DISTINCT +-- select ten, sum(distinct four) filter (where string(four) like '123') from onek a +-- group by rollup(ten); + +-- More rescan tests +-- [SPARK-27877] ANSI SQL: LATERAL derived table(T491) +-- select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by cube(four,ten)) s on true order by v.a,four,ten; +-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions +-- select array(select row(v.a,s1.*) from (select two,four, count(*) from onek group by cube(two,four) order by two,four) s1) from (values (1),(2)) v(a); + +-- [SPARK-29704] Support the combinations of grouping operations +-- Grouping on text columns +-- select sum(ten) from onek group by two, rollup(string(four)) order by 1; +-- select sum(ten) from onek group by rollup(string(four)), two order by 1; + +-- hashing support + +-- Ignore a PostgreSQL-specific option +-- set enable_hashagg = true; + +-- failure cases + +-- Since this test is implementation specific for plans, it passes in Spark +select count(*) from gstest4 group by rollup(unhashable_col,unsortable_col); +-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions +-- select array_agg(v order by v) from gstest4 group by 
grouping sets ((id,unsortable_col),(id)); + +-- simple cases + +-- [SPARK-29698] Support grouping function with multiple arguments +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by grouping sets ((a),(b)) order by 3,4,1,2 /* 3,1,2 */; +-- explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) +-- from gstest1 group by grouping sets ((a),(b)) order by 3,1,2; + +-- [SPARK-29698] Support grouping function with multiple arguments +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by cube(a,b) order by 3,4,1,2 /* 3,1,2 */; +-- explain (costs off) select a, b, grouping(a,b), sum(v), count(*), max(v) +-- from gstest1 group by cube(a,b) order by 3,1,2; + +-- shouldn't try and hash +-- explain (costs off) +-- select a, b, grouping(a,b), array_agg(v order by v) +-- from gstest1 group by cube(a,b); + +-- unsortable cases +select unsortable_col, count(*) + from gstest4 group by grouping sets ((unsortable_col),(unsortable_col)) + order by string(unsortable_col); + +-- mixed hashable/sortable cases +-- [SPARK-29698] Support grouping function with multiple arguments +select unhashable_col, unsortable_col, + -- grouping(unhashable_col, unsortable_col), + grouping(unhashable_col), grouping(unsortable_col), + count(*), sum(v) + from gstest4 group by grouping sets ((unhashable_col),(unsortable_col)) + order by 3, 4, 6 /* 3, 5 */; +-- explain (costs off) +-- select unhashable_col, unsortable_col, +-- grouping(unhashable_col, unsortable_col), +-- count(*), sum(v) +-- from gstest4 group by grouping sets ((unhashable_col),(unsortable_col)) +-- order by 3,5; + +-- [SPARK-29698] Support grouping function with multiple arguments +select unhashable_col, unsortable_col, + -- grouping(unhashable_col, unsortable_col), + grouping(unhashable_col), grouping(unsortable_col), + count(*), sum(v) + 
from gstest4 group by grouping sets ((v,unhashable_col),(v,unsortable_col)) + order by 3, 4, 6 /* 3,5 */; +-- explain (costs off) +-- select unhashable_col, unsortable_col, +-- grouping(unhashable_col, unsortable_col), +-- count(*), sum(v) +-- from gstest4 group by grouping sets ((v,unhashable_col),(v,unsortable_col)) +-- order by 3,5; + +-- empty input: first is 0 rows, second 1, third 3 etc. +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a); +-- explain (costs off) +-- select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a); +-- [SPARK-29701] Different answers when empty input given in GROUPING SETS +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),()); +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),()); +-- explain (costs off) +-- select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),()); +-- [SPARK-29701] Different answers when empty input given in GROUPING SETS +select sum(v), count(*) from gstest_empty group by grouping sets ((),(),()); +-- explain (costs off) +-- select sum(v), count(*) from gstest_empty group by grouping sets ((),(),()); + +-- [SPARK-29698] Support grouping function with multiple arguments +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +-- [SPARK-29702] Resolve group-by columns with functional dependencies +-- check that functionally dependent cols are not nulled +-- select a, d, grouping(a,b,c) +-- from gstest3 +-- group by grouping sets ((a,b), (a,c)); +-- explain (costs off) +-- select a, d, grouping(a,b,c) +-- from gstest3 +-- group by grouping sets ((a,b), (a,c)); + +-- simple rescan tests + +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), gstest_data(v.x) +-- group by grouping sets (a,b) +-- order by 1, 2, 3; +-- explain (costs off) +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), gstest_data(v.x) +-- group by grouping sets (a,b) 
+-- order by 3, 1, 2; +-- select * +-- from (values (1),(2)) v(x), +-- lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s; +-- explain (costs off) +-- select * +-- from (values (1),(2)) v(x), +-- lateral (select a, b, sum(v.x) from gstest_data(v.x) group by grouping sets (a,b)) s; + +-- Tests for chained aggregates +-- [SPARK-29698] Support grouping function with multiple arguments +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,4,7 /* 3,6 */; +-- explain (costs off) +-- select a, b, grouping(a,b), sum(v), count(*), max(v) +-- from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,6; +-- [SPARK-29699] Different answers in nested aggregates with window functions +select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum + from gstest2 group by cube (a,b) order by rsum, a, b; +-- explain (costs off) +-- select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum +-- from gstest2 group by cube (a,b) order by rsum, a, b; +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), gstest_data(v.x) +-- group by cube (a,b) order by a,b; +-- explain (costs off) +-- select a, b, sum(v.x) +-- from (values (1),(2)) v(x), gstest_data(v.x) +-- group by cube (a,b) order by a,b; + +-- Verify that we correctly handle the child node returning a +-- non-minimal slot, which happens if the input is pre-sorted, +-- e.g. due to an index scan. 
+-- BEGIN; +-- Ignore a PostgreSQL-specific option +-- SET LOCAL enable_hashagg = false; +-- EXPLAIN (COSTS OFF) SELECT a, b, count(*), max(a), max(b) FROM gstest3 GROUP BY GROUPING SETS(a, b,()) ORDER BY a, b; +SELECT a, b, count(*), max(a), max(b) FROM gstest3 GROUP BY GROUPING SETS(a, b,()) ORDER BY a, b; +-- Ignore a PostgreSQL-specific option +-- SET LOCAL enable_seqscan = false; +-- EXPLAIN (COSTS OFF) SELECT a, b, count(*), max(a), max(b) FROM gstest3 GROUP BY GROUPING SETS(a, b,()) ORDER BY a, b; +-- SELECT a, b, count(*), max(a), max(b) FROM gstest3 GROUP BY GROUPING SETS(a, b,()) ORDER BY a, b; +-- COMMIT; + +-- More rescan tests +-- [SPARK-27877] ANSI SQL: LATERAL derived table(T491) +-- select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by cube(four,ten)) s on true order by v.a,four,ten; +-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions +-- select array(select row(v.a,s1.*) from (select two,four, count(*) from onek group by cube(two,four) order by two,four) s1) from (values (1),(2)) v(a); + +-- Rescan logic changes when there are no empty grouping sets, so test +-- that too: +-- [SPARK-27877] ANSI SQL: LATERAL derived table(T491) +-- select * from (values (1),(2)) v(a) left join lateral (select v.a, four, ten, count(*) from onek group by grouping sets(four,ten)) s on true order by v.a,four,ten; +-- [SPARK-27878] Support ARRAY(sub-SELECT) expressions +-- select array(select row(v.a,s1.*) from (select two,four, count(*) from onek group by grouping sets(two,four) order by two,four) s1) from (values (1),(2)) v(a); + +-- test the knapsack + +-- Ignore a PostgreSQL-specific option +-- set enable_indexscan = false; +-- set work_mem = '64kB'; +-- explain (costs off) +-- select unique1, +-- count(two), count(four), count(ten), +-- count(hundred), count(thousand), count(twothousand), +-- count(*) +-- from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); +-- explain (costs 
off) +-- select unique1, +-- count(two), count(four), count(ten), +-- count(hundred), count(thousand), count(twothousand), +-- count(*) +-- from tenk1 group by grouping sets (unique1,hundred,ten,four,two); + +-- Ignore a PostgreSQL-specific option +-- set work_mem = '384kB'; +-- explain (costs off) +-- select unique1, +-- count(two), count(four), count(ten), +-- count(hundred), count(thousand), count(twothousand), +-- count(*) +-- from tenk1 group by grouping sets (unique1,twothousand,thousand,hundred,ten,four,two); + +-- check collation-sensitive matching between grouping expressions +-- (similar to a check for aggregates, but there are additional code +-- paths for GROUPING, so check again here) + +-- [SPARK-28382] Array Functions: unnest +select v||'a', case grouping(v||'a') when 1 then 1 else 0 end, count(*) + -- from unnest(array[1,1], array['a','b']) u(i,v) + from values (1, 'a'), (1, 'b') u(i,v) + group by rollup(i, v||'a') order by 1,3; +select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*) + -- from unnest(array[1,1], array['a','b']) u(i,v) + from values (1, 'a'), (1, 'b') u(i,v) + group by rollup(i, v||'a') order by 1,3; + +-- end + +DROP VIEW gstest1; +DROP TABLE gstest2; +DROP TABLE gstest3; +DROP TABLE gstest4; +DROP TABLE gstest_empty; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/insert.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/insert.sql new file mode 100644 index 0000000000000..6783dda9ff015 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/insert.sql @@ -0,0 +1,653 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- INSERT +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/insert.sql + +-- +-- insert with DEFAULT in the target_list +-- +-- [SPARK-19842] Informational Referential Integrity Constraints Support in Spark +-- [SPARK-29119] DEFAULT option is not supported in Spark +create table 
inserttest (col1 int, col2 int /* NOT NULL */, col3 string /* default 'testing' */) using parquet; +-- [SPARK-29119] DEFAULT option is not supported in Spark +-- [SPARK-20845] Support specification of column names in INSERT INTO +-- Skip a test below because the PK constraint is violated and the query fails in PostgreSQL +-- insert into inserttest (col1, col2, col3) values (DEFAULT, DEFAULT, DEFAULT); +-- insert into inserttest (col2, col3) values (3, DEFAULT); +insert into inserttest values (NULL, 3, 'testing'); +-- insert into inserttest (col1, col2, col3) values (DEFAULT, 5, DEFAULT); +insert into inserttest values (NULL, 5, 'testing'); +-- insert into inserttest values (DEFAULT, 5, 'test'); +insert into inserttest values (NULL, 5, 'test'); +-- insert into inserttest values (DEFAULT, 7); +insert into inserttest values (NULL, 7, 'testing'); + +select * from inserttest; + +-- +-- insert with similar expression / target_list values (all fail) +-- +-- [SPARK-20845] Support specification of column names in INSERT INTO +-- [SPARK-29119] DEFAULT option is not supported in Spark +-- insert into inserttest (col1, col2, col3) values (DEFAULT, DEFAULT); +-- insert into inserttest (col1, col2, col3) values (1, 2); +-- insert into inserttest (col1) values (1, 2); +-- insert into inserttest (col1) values (DEFAULT, DEFAULT); + +-- select * from inserttest; + +-- +-- VALUES test +-- +-- [SPARK-29119] DEFAULT option is not supported in Spark +-- [SPARK-29715] Support SELECT statements in VALUES of INSERT INTO +-- insert into inserttest values(10, 20, '40'), (-1, 2, DEFAULT), +-- ((select 2), (select i from (values(3)) as foo (i)), 'values are fun!'); + +-- select * from inserttest; + +-- +-- TOASTed value test +-- +insert into inserttest values(30, 50, repeat('x', 10000)); + +select col1, col2, char_length(col3) from inserttest; + +drop table inserttest; + +-- +-- check indirection (field/array assignment), cf bug #14265 +-- +-- these tests are aware that transformInsertStmt has 
3 separate code paths +-- + +-- [SPARK-29716] Support [CREATE|DROP] TYPE +-- create type insert_test_type as (if1 int, if2 array); + +-- create table inserttest (f1 int, f2 int[], +-- f3 insert_test_type, f4 insert_test_type[]); +-- +-- insert into inserttest (f2[1], f2[2]) values (1,2); +-- insert into inserttest (f2[1], f2[2]) values (3,4), (5,6); +-- insert into inserttest (f2[1], f2[2]) select 7,8; +-- insert into inserttest (f2[1], f2[2]) values (1,default); -- not supported +-- +-- insert into inserttest (f3.if1, f3.if2) values (1,array['foo']); +-- insert into inserttest (f3.if1, f3.if2) values (1,'{foo}'), (2,'{bar}'); +-- insert into inserttest (f3.if1, f3.if2) select 3, '{baz,quux}'; +-- insert into inserttest (f3.if1, f3.if2) values (1,default); -- not supported +-- +-- insert into inserttest (f3.if2[1], f3.if2[2]) values ('foo', 'bar'); +-- insert into inserttest (f3.if2[1], f3.if2[2]) values ('foo', 'bar'), ('baz', 'quux'); +-- insert into inserttest (f3.if2[1], f3.if2[2]) select 'bear', 'beer'; +-- +-- insert into inserttest (f4[1].if2[1], f4[1].if2[2]) values ('foo', 'bar'); +-- insert into inserttest (f4[1].if2[1], f4[1].if2[2]) values ('foo', 'bar'), ('baz', 'quux'); +-- insert into inserttest (f4[1].if2[1], f4[1].if2[2]) select 'bear', 'beer'; +-- +-- select * from inserttest; + +-- also check reverse-listing +-- create table inserttest2 (f1 bigint, f2 string); +-- [SPARK-29717] Support [CREATE|DROP] RULE - define a new plan rewrite rule +-- create rule irule1 as on insert to inserttest2 do also +-- insert into inserttest (f3.if2[1], f3.if2[2]) +-- values (new.f1,new.f2); +-- create rule irule2 as on insert to inserttest2 do also +-- insert into inserttest (f4[1].if1, f4[1].if2[2]) +-- values (1,'fool'),(new.f1,new.f2); +-- create rule irule3 as on insert to inserttest2 do also +-- insert into inserttest (f4[1].if1, f4[1].if2[2]) +-- select new.f1, new.f2; +-- \d+ inserttest2 + +-- drop table inserttest2; +-- drop table inserttest; +-- 
[SPARK-29716] Support [CREATE|DROP] TYPE +-- drop type insert_test_type; + +-- direct partition inserts should check partition bound constraint +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table range_parted ( +-- a string, +-- b int +-- ) partition by range (a, (b+0)); + +-- no partitions, so fail +-- insert into range_parted values ('a', 11); + +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part1 partition of range_parted for values from ('a', 1) to ('a', 10); +-- create table part2 partition of range_parted for values from ('a', 10) to ('a', 20); +-- create table part3 partition of range_parted for values from ('b', 1) to ('b', 10); +-- create table part4 partition of range_parted for values from ('b', 10) to ('b', 20); + +-- fail +-- insert into part1 values ('a', 11); +-- insert into part1 values ('b', 1); +-- ok +-- insert into part1 values ('a', 1); +-- fail +-- insert into part4 values ('b', 21); +-- insert into part4 values ('a', 10); +-- ok +-- insert into part4 values ('b', 10); + +-- fail (partition key a has a NOT NULL constraint) +-- insert into part1 values (null); +-- fail (expression key (b+0) cannot be null either) +-- insert into part1 values (1); + +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table list_parted ( +-- a text, +-- b int +-- ) partition by list (lower(a)); +-- create table part_aa_bb partition of list_parted FOR VALUES IN ('aa', 'bb'); +-- create table part_cc_dd partition of list_parted FOR VALUES IN ('cc', 'dd'); +-- create table part_null partition of list_parted FOR VALUES IN (null); + +-- fail +-- insert into part_aa_bb values ('cc', 1); +-- insert into part_aa_bb values ('AAa', 1); +-- insert into part_aa_bb values (null); +-- ok +-- insert into part_cc_dd values ('cC', 1); +-- insert into part_null values (null, 0); + +-- check in case of multi-level 
partitioned table +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_ee_ff partition of list_parted for values in ('ee', 'ff') partition by range (b); +-- create table part_ee_ff1 partition of part_ee_ff for values from (1) to (10); +-- create table part_ee_ff2 partition of part_ee_ff for values from (10) to (20); + +-- test default partition +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_default partition of list_parted default; +-- Negative test: a row, which would fit in other partition, does not fit +-- default partition, even when inserted directly +-- insert into part_default values ('aa', 2); +-- insert into part_default values (null, 2); +-- ok +-- insert into part_default values ('Zz', 2); +-- test if default partition works as expected for multi-level partitioned +-- table as well as when default partition itself is further partitioned +-- drop table part_default; +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_xx_yy partition of list_parted for values in ('xx', 'yy') partition by list (a); +-- create table part_xx_yy_p1 partition of part_xx_yy for values in ('xx'); +-- create table part_xx_yy_defpart partition of part_xx_yy default; +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_default partition of list_parted default partition by range(b); +-- create table part_default_p1 partition of part_default for values from (20) to (30); +-- create table part_default_p2 partition of part_default for values from (30) to (40); + +-- fail +-- insert into part_ee_ff1 values ('EE', 11); +-- insert into part_default_p2 values ('gg', 43); +-- fail (even the parent's, ie, part_ee_ff's partition constraint applies) +-- insert into part_ee_ff1 values ('cc', 1); +-- insert into part_default values ('gg', 43); +-- ok +-- 
insert into part_ee_ff1 values ('ff', 1); +-- insert into part_ee_ff2 values ('ff', 11); +-- insert into part_default_p1 values ('cd', 25); +-- insert into part_default_p2 values ('de', 35); +-- insert into list_parted values ('ab', 21); +-- insert into list_parted values ('xx', 1); +-- insert into list_parted values ('yy', 2); +-- select tableoid::regclass, * from list_parted; + +-- Check tuple routing for partitioned tables + +-- fail +-- insert into range_parted values ('a', 0); +-- ok +-- insert into range_parted values ('a', 1); +-- insert into range_parted values ('a', 10); +-- fail +-- insert into range_parted values ('a', 20); +-- ok +-- insert into range_parted values ('b', 1); +-- insert into range_parted values ('b', 10); +-- fail (partition key (b+0) is null) +-- insert into range_parted values ('a'); + +-- Check default partition +-- create table part_def partition of range_parted default; +-- fail +-- insert into part_def values ('b', 10); +-- ok +-- insert into part_def values ('c', 10); +-- insert into range_parted values (null, null); +-- insert into range_parted values ('a', null); +-- insert into range_parted values (null, 19); +-- insert into range_parted values ('b', 20); + +-- select tableoid::regclass, * from range_parted; +-- ok +-- insert into list_parted values (null, 1); +-- insert into list_parted (a) values ('aA'); +-- fail (partition of part_ee_ff not found in both cases) +-- insert into list_parted values ('EE', 0); +-- insert into part_ee_ff values ('EE', 0); +-- ok +-- insert into list_parted values ('EE', 1); +-- insert into part_ee_ff values ('EE', 10); +-- select tableoid::regclass, * from list_parted; + +-- some more tests to exercise tuple-routing with multi-level partitioning +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_gg partition of list_parted for values in ('gg') partition by range (b); +-- create table part_gg1 partition of part_gg for values from 
(minvalue) to (1); +-- create table part_gg2 partition of part_gg for values from (1) to (10) partition by range (b); +-- create table part_gg2_1 partition of part_gg2 for values from (1) to (5); +-- create table part_gg2_2 partition of part_gg2 for values from (5) to (10); + +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table part_ee_ff3 partition of part_ee_ff for values from (20) to (30) partition by range (b); +-- create table part_ee_ff3_1 partition of part_ee_ff3 for values from (20) to (25); +-- create table part_ee_ff3_2 partition of part_ee_ff3 for values from (25) to (30); + +-- truncate list_parted; +-- insert into list_parted values ('aa'), ('cc'); +-- [SPARK-27767] Built-in function: generate_series +-- insert into list_parted select 'Ff', s.a from generate_series(1, 29) s(a); +-- insert into list_parted select 'gg', s.a from generate_series(1, 9) s(a); +-- insert into list_parted (b) values (1); +-- select tableoid::regclass::text, a, min(b) as min_b, max(b) as max_b from list_parted group by 1, 2 order by 1; + +-- direct partition inserts should check hash partition bound constraint + +-- Use hand-rolled hash functions and operator classes to get predictable +-- result on different machines. The hash function for int4 simply returns +-- the sum of the values passed to it and the one for text returns the length +-- of the non-empty string value passed to it or 0.
+ +-- create or replace function part_hashint4_noop(value int4, seed int8) +-- returns int8 as $$ +-- select value + seed; +-- $$ language sql immutable; + +-- create operator class part_test_int4_ops +-- for type int4 +-- using hash as +-- operator 1 =, +-- function 2 part_hashint4_noop(int4, int8); + +-- create or replace function part_hashtext_length(value text, seed int8) +-- RETURNS int8 AS $$ +-- select length(coalesce(value, ''))::int8 +-- $$ language sql immutable; + +-- create operator class part_test_text_ops +-- for type text +-- using hash as +-- operator 1 =, +-- function 2 part_hashtext_length(text, int8); + +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table hash_parted ( +-- a int +-- ) partition by hash (a part_test_int4_ops); +-- create table hpart0 partition of hash_parted for values with (modulus 4, remainder 0); +-- create table hpart1 partition of hash_parted for values with (modulus 4, remainder 1); +-- create table hpart2 partition of hash_parted for values with (modulus 4, remainder 2); +-- create table hpart3 partition of hash_parted for values with (modulus 4, remainder 3); + +-- [SPARK-27767] Built-in function: generate_series +-- insert into hash_parted values(generate_series(1,10)); + +-- direct insert of values divisible by 4 - ok; +-- insert into hpart0 values(12),(16); +-- fail; +-- insert into hpart0 values(11); +-- 11 % 4 -> 3 remainder i.e. 
valid data for hpart3 partition +-- insert into hpart3 values(11); + +-- view data +-- select tableoid::regclass as part, a, a%4 as "remainder = a % 4" +-- from hash_parted order by part; + +-- test \d+ output on a table which has both partitioned and unpartitioned +-- partitions +-- \d+ list_parted + +-- cleanup +-- drop table range_parted, list_parted; +-- drop table hash_parted; + +-- test that a default partition added as the first partition accepts any value +-- including null +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table list_parted (a int) partition by list (a); +-- create table part_default partition of list_parted default; +-- \d+ part_default +-- insert into part_default values (null); +-- insert into part_default values (1); +-- insert into part_default values (-1); +-- select tableoid::regclass, a from list_parted; +-- cleanup +-- drop table list_parted; + +-- more tests for certain multi-level partitioning scenarios +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mlparted (a int, b int) partition by range (a, b); +-- create table mlparted1 (b int not null, a int not null) partition by range ((b+0)); +-- create table mlparted11 (like mlparted1); +-- alter table mlparted11 drop a; +-- alter table mlparted11 add a int; +-- alter table mlparted11 drop a; +-- alter table mlparted11 add a int not null; +-- attnum for key attribute 'a' is different in mlparted, mlparted1, and mlparted11 +-- select attrelid::regclass, attname, attnum +-- from pg_attribute +-- where attname = 'a' +-- and (attrelid = 'mlparted'::regclass +-- or attrelid = 'mlparted1'::regclass +-- or attrelid = 'mlparted11'::regclass) +-- order by attrelid::regclass::text; + +-- alter table mlparted1 attach partition mlparted11 for values from (2) to (5); +-- alter table mlparted attach partition mlparted1 for values from (1, 2) to (1, 10); + +-- check that "(1, 2)" is 
correctly routed to mlparted11. +-- insert into mlparted values (1, 2); +-- select tableoid::regclass, * from mlparted; + +-- check that proper message is shown after failure to route through mlparted1 +-- insert into mlparted (a, b) values (1, 5); + +-- truncate mlparted; +-- alter table mlparted add constraint check_b check (b = 3); + +-- have a BR trigger modify the row such that the check_b is violated +-- create function mlparted11_trig_fn() +-- returns trigger AS +-- $$ +-- begin +-- NEW.b := 4; +-- return NEW; +-- end; +-- $$ +-- language plpgsql; +-- create trigger mlparted11_trig before insert ON mlparted11 +-- for each row execute procedure mlparted11_trig_fn(); + +-- check that the correct row is shown when constraint check_b fails after +-- "(1, 2)" is routed to mlparted11 (actually "(1, 4)" would be shown due +-- to the BR trigger mlparted11_trig_fn) +-- insert into mlparted values (1, 2); +-- drop trigger mlparted11_trig on mlparted11; +-- drop function mlparted11_trig_fn(); + +-- check that inserting into an internal partition successfully results in +-- checking its partition constraint before inserting into the leaf partition +-- selected by tuple-routing +-- insert into mlparted1 (a, b) values (2, 3); + +-- check routing error through a list partitioned table when the key is null +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table lparted_nonullpart (a int, b char) partition by list (b); +-- create table lparted_nonullpart_a partition of lparted_nonullpart for values in ('a'); +-- insert into lparted_nonullpart values (1); +-- drop table lparted_nonullpart; + +-- check that RETURNING works correctly with tuple-routing +-- alter table mlparted drop constraint check_b; +-- create table mlparted12 partition of mlparted1 for values from (5) to (10); +-- create table mlparted2 (b int not null, a int not null); +-- alter table mlparted attach partition mlparted2 for values from (1, 10) to (1, 20); 
+-- create table mlparted3 partition of mlparted for values from (1, 20) to (1, 30); +-- create table mlparted4 (like mlparted); +-- alter table mlparted4 drop a; +-- alter table mlparted4 add a int not null; +-- alter table mlparted attach partition mlparted4 for values from (1, 30) to (1, 40); +-- [SPARK-27767] Built-in function: generate_series +-- with ins (a, b, c) as +-- (insert into mlparted (b, a) select s.a, 1 from generate_series(2, 39) s(a) returning tableoid::regclass, *) +-- select a, b, min(c), max(c) from ins group by a, b order by 1; + +-- alter table mlparted add c text; +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mlparted5 (c text, a int not null, b int not null) partition by list (c); +-- create table mlparted5a (a int not null, c text, b int not null); +-- alter table mlparted5 attach partition mlparted5a for values in ('a'); +-- alter table mlparted attach partition mlparted5 for values from (1, 40) to (1, 50); +-- alter table mlparted add constraint check_b check (a = 1 and b < 45); +-- insert into mlparted values (1, 45, 'a'); +-- create function mlparted5abrtrig_func() returns trigger as $$ begin new.c = 'b'; return new; end; $$ language plpgsql; +-- create trigger mlparted5abrtrig before insert on mlparted5a for each row execute procedure mlparted5abrtrig_func(); +-- insert into mlparted5 (a, b, c) values (1, 40, 'a'); +-- drop table mlparted5; +-- alter table mlparted drop constraint check_b; + +-- Check multi-level default partition +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mlparted_def partition of mlparted default partition by range(a); +-- create table mlparted_def1 partition of mlparted_def for values from (40) to (50); +-- create table mlparted_def2 partition of mlparted_def for values from (50) to (60); +-- insert into mlparted values (40, 100); +-- insert into mlparted_def1 values (42, 100); +-- insert 
into mlparted_def2 values (54, 50); +-- fail +-- insert into mlparted values (70, 100); +-- insert into mlparted_def1 values (52, 50); +-- insert into mlparted_def2 values (34, 50); +-- ok +-- create table mlparted_defd partition of mlparted_def default; +-- insert into mlparted values (70, 100); + +-- select tableoid::regclass, * from mlparted_def; + +-- Check multi-level tuple routing with attributes dropped from the +-- top-most parent. First remove the last attribute. +-- alter table mlparted add d int, add e int; +-- alter table mlparted drop e; +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mlparted5 partition of mlparted +-- for values from (1, 40) to (1, 50) partition by range (c); +-- create table mlparted5_ab partition of mlparted5 +-- for values from ('a') to ('c') partition by list (c); +-- This partitioned table should remain with no partitions. +-- create table mlparted5_cd partition of mlparted5 +-- for values from ('c') to ('e') partition by list (c); +-- create table mlparted5_a partition of mlparted5_ab for values in ('a'); +-- create table mlparted5_b (d int, b int, c text, a int); +-- alter table mlparted5_ab attach partition mlparted5_b for values in ('b'); +-- truncate mlparted; +-- insert into mlparted values (1, 2, 'a', 1); +-- insert into mlparted values (1, 40, 'a', 1); -- goes to mlparted5_a +-- insert into mlparted values (1, 45, 'b', 1); -- goes to mlparted5_b +-- insert into mlparted values (1, 45, 'c', 1); -- goes to mlparted5_cd, fails +-- insert into mlparted values (1, 45, 'f', 1); -- goes to mlparted5, fails +-- select tableoid::regclass, * from mlparted order by a, b, c, d; +-- alter table mlparted drop d; +-- truncate mlparted; +-- Remove the before last attribute. 
+-- alter table mlparted add e int, add d int; +-- alter table mlparted drop e; +-- insert into mlparted values (1, 2, 'a', 1); +-- insert into mlparted values (1, 40, 'a', 1); -- goes to mlparted5_a +-- insert into mlparted values (1, 45, 'b', 1); -- goes to mlparted5_b +-- insert into mlparted values (1, 45, 'c', 1); -- goes to mlparted5_cd, fails +-- insert into mlparted values (1, 45, 'f', 1); -- goes to mlparted5, fails +-- select tableoid::regclass, * from mlparted order by a, b, c, d; +-- alter table mlparted drop d; +-- drop table mlparted5; + +-- check that message shown after failure to find a partition shows the +-- appropriate key description (or none) in various situations +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table key_desc (a int, b int) partition by list ((a+0)); +-- create table key_desc_1 partition of key_desc for values in (1) partition by range (b); + +-- create user regress_insert_other_user; +-- grant select (a) on key_desc_1 to regress_insert_other_user; +-- grant insert on key_desc to regress_insert_other_user; + +-- set role regress_insert_other_user; +-- no key description is shown +-- insert into key_desc values (1, 1); + +-- reset role; +-- grant select (b) on key_desc_1 to regress_insert_other_user; +-- set role regress_insert_other_user; +-- key description (b)=(1) is now shown +-- insert into key_desc values (1, 1); + +-- key description is not shown if key contains expression +-- insert into key_desc values (2, 1); +-- reset role; +-- revoke all on key_desc from regress_insert_other_user; +-- revoke all on key_desc_1 from regress_insert_other_user; +-- drop role regress_insert_other_user; +-- drop table key_desc, key_desc_1; + +-- test minvalue/maxvalue restrictions +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mcrparted (a int, b int, c int) partition by range (a, abs(b), c); +-- create table mcrparted0 
partition of mcrparted for values from (minvalue, 0, 0) to (1, maxvalue, maxvalue); +-- create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, minvalue); +-- create table mcrparted4 partition of mcrparted for values from (21, minvalue, 0) to (30, 20, minvalue); + +-- check multi-column range partitioning expression enforces the same +-- constraint as what tuple-routing would determine it to be +-- create table mcrparted0 partition of mcrparted for values from (minvalue, minvalue, minvalue) to (1, maxvalue, maxvalue); +-- create table mcrparted1 partition of mcrparted for values from (2, 1, minvalue) to (10, 5, 10); +-- create table mcrparted2 partition of mcrparted for values from (10, 6, minvalue) to (10, maxvalue, maxvalue); +-- create table mcrparted3 partition of mcrparted for values from (11, 1, 1) to (20, 10, 10); +-- create table mcrparted4 partition of mcrparted for values from (21, minvalue, minvalue) to (30, 20, maxvalue); +-- create table mcrparted5 partition of mcrparted for values from (30, 21, 20) to (maxvalue, maxvalue, maxvalue); + +-- null not allowed in range partition +-- insert into mcrparted values (null, null, null); + +-- routed to mcrparted0 +-- insert into mcrparted values (0, 1, 1); +-- insert into mcrparted0 values (0, 1, 1); + +-- routed to mcrparted1 +-- insert into mcrparted values (9, 1000, 1); +-- insert into mcrparted1 values (9, 1000, 1); +-- insert into mcrparted values (10, 5, -1); +-- insert into mcrparted1 values (10, 5, -1); +-- insert into mcrparted values (2, 1, 0); +-- insert into mcrparted1 values (2, 1, 0); + +-- routed to mcrparted2 +-- insert into mcrparted values (10, 6, 1000); +-- insert into mcrparted2 values (10, 6, 1000); +-- insert into mcrparted values (10, 1000, 1000); +-- insert into mcrparted2 values (10, 1000, 1000); + +-- no partition exists, nor does mcrparted3 accept it +-- insert into mcrparted values (11, 1, -1); +-- insert into mcrparted3 values (11, 1, -1); +
+-- routed to mcrparted5 +-- insert into mcrparted values (30, 21, 20); +-- insert into mcrparted5 values (30, 21, 20); +-- insert into mcrparted4 values (30, 21, 20); -- error + +-- check rows +-- select tableoid::regclass::text, * from mcrparted order by 1; + +-- cleanup +-- drop table mcrparted; + +-- check that a BR constraint can't make partition contain violating rows +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table brtrigpartcon (a int, b text) partition by list (a); +-- create table brtrigpartcon1 partition of brtrigpartcon for values in (1); +-- create or replace function brtrigpartcon1trigf() returns trigger as $$begin new.a := 2; return new; end$$ language plpgsql; +-- create trigger brtrigpartcon1trig before insert on brtrigpartcon1 for each row execute procedure brtrigpartcon1trigf(); +-- insert into brtrigpartcon values (1, 'hi there'); +-- insert into brtrigpartcon1 values (1, 'hi there'); + +-- check that the message shows the appropriate column description in a +-- situation where the partitioned table is not the primary ModifyTable node +-- create table inserttest3 (f1 text default 'foo', f2 text default 'bar', f3 int); +-- create role regress_coldesc_role; +-- grant insert on inserttest3 to regress_coldesc_role; +-- grant insert on brtrigpartcon to regress_coldesc_role; +-- revoke select on brtrigpartcon from regress_coldesc_role; +-- set role regress_coldesc_role; +-- with result as (insert into brtrigpartcon values (1, 'hi there') returning 1) +-- insert into inserttest3 (f3) select * from result; +-- reset role; + +-- cleanup +-- revoke all on inserttest3 from regress_coldesc_role; +-- revoke all on brtrigpartcon from regress_coldesc_role; +-- drop role regress_coldesc_role; +-- drop table inserttest3; +-- drop table brtrigpartcon; +-- drop function brtrigpartcon1trigf(); + +-- check that "do nothing" BR triggers work with tuple-routing (this checks +-- that 
estate->es_result_relation_info is appropriately set/reset for each +-- routed tuple) +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table donothingbrtrig_test (a int, b text) partition by list (a); +-- create table donothingbrtrig_test1 (b text, a int); +-- create table donothingbrtrig_test2 (c text, b text, a int); +-- alter table donothingbrtrig_test2 drop column c; +-- create or replace function donothingbrtrig_func() returns trigger as $$begin raise notice 'b: %', new.b; return NULL; end$$ language plpgsql; +-- create trigger donothingbrtrig1 before insert on donothingbrtrig_test1 for each row execute procedure donothingbrtrig_func(); +-- create trigger donothingbrtrig2 before insert on donothingbrtrig_test2 for each row execute procedure donothingbrtrig_func(); +-- alter table donothingbrtrig_test attach partition donothingbrtrig_test1 for values in (1); +-- alter table donothingbrtrig_test attach partition donothingbrtrig_test2 for values in (2); +-- insert into donothingbrtrig_test values (1, 'foo'), (2, 'bar'); +-- [SPARK-29386] Copy data between a file and a table +-- copy donothingbrtrig_test from stdout; +-- 1 baz +-- 2 qux +-- \. 
+-- select tableoid::regclass, * from donothingbrtrig_test; + +-- cleanup +-- drop table donothingbrtrig_test; +-- drop function donothingbrtrig_func(); + +-- check multi-column range partitioning with minvalue/maxvalue constraints +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and PARTITION OF in CREATE TABLE +-- create table mcrparted (a text, b int) partition by range(a, b); +-- create table mcrparted1_lt_b partition of mcrparted for values from (minvalue, minvalue) to ('b', minvalue); +-- create table mcrparted2_b partition of mcrparted for values from ('b', minvalue) to ('c', minvalue); +-- create table mcrparted3_c_to_common partition of mcrparted for values from ('c', minvalue) to ('common', minvalue); +-- create table mcrparted4_common_lt_0 partition of mcrparted for values from ('common', minvalue) to ('common', 0); +-- create table mcrparted5_common_0_to_10 partition of mcrparted for values from ('common', 0) to ('common', 10); +-- create table mcrparted6_common_ge_10 partition of mcrparted for values from ('common', 10) to ('common', maxvalue); +-- create table mcrparted7_gt_common_lt_d partition of mcrparted for values from ('common', maxvalue) to ('d', minvalue); +-- create table mcrparted8_ge_d partition of mcrparted for values from ('d', minvalue) to (maxvalue, maxvalue); + +-- \d+ mcrparted +-- \d+ mcrparted1_lt_b +-- \d+ mcrparted2_b +-- \d+ mcrparted3_c_to_common +-- \d+ mcrparted4_common_lt_0 +-- \d+ mcrparted5_common_0_to_10 +-- \d+ mcrparted6_common_ge_10 +-- \d+ mcrparted7_gt_common_lt_d +-- \d+ mcrparted8_ge_d + +-- insert into mcrparted values ('aaa', 0), ('b', 0), ('bz', 10), ('c', -10), +-- ('comm', -10), ('common', -10), ('common', 0), ('common', 10), +-- ('commons', 0), ('d', -10), ('e', 0); +-- select tableoid::regclass, * from mcrparted order by a, b; +-- drop table mcrparted; + +-- check that wholerow vars in the RETURNING list work with partitioned tables +-- [SPARK-29718] Support PARTITION BY [RANGE|LIST|HASH] and 
PARTITION OF in CREATE TABLE +-- create table returningwrtest (a int) partition by list (a); +-- create table returningwrtest1 partition of returningwrtest for values in (1); +-- insert into returningwrtest values (1) returning returningwrtest; + +-- check also that the wholerow vars in RETURNING list are converted as needed +-- alter table returningwrtest add b text; +-- create table returningwrtest2 (b text, c int, a int); +-- alter table returningwrtest2 drop c; +-- alter table returningwrtest attach partition returningwrtest2 for values in (2); +-- insert into returningwrtest values (2, 'foo') returning returningwrtest; +-- drop table returningwrtest; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int2.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int2.sql similarity index 87% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/int2.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int2.sql index f64ec5d75afcf..07f5976ca6d2f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int2.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int2.sql @@ -8,19 +8,23 @@ CREATE TABLE INT2_TBL(f1 smallint) USING parquet; -- [SPARK-28023] Trim the string when cast string type to other types -INSERT INTO INT2_TBL VALUES (trim('0 ')); +-- PostgreSQL implicitly casts string literals to data with integral types, but +-- Spark does not support that kind of implicit casts. 
+INSERT INTO INT2_TBL VALUES (smallint(trim('0 '))); -INSERT INTO INT2_TBL VALUES (trim(' 1234 ')); +INSERT INTO INT2_TBL VALUES (smallint(trim(' 1234 '))); -INSERT INTO INT2_TBL VALUES (trim(' -1234')); +INSERT INTO INT2_TBL VALUES (smallint(trim(' -1234'))); -- [SPARK-27923] Invalid input syntax for type short throws exception at PostgreSQL -- INSERT INTO INT2_TBL VALUES ('34.5'); -- largest and smallest values -INSERT INTO INT2_TBL VALUES ('32767'); +-- PostgreSQL implicitly casts string literals to data with integral types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO INT2_TBL VALUES (smallint('32767')); -INSERT INTO INT2_TBL VALUES ('-32767'); +INSERT INTO INT2_TBL VALUES (smallint('-32767')); -- bad input values -- should give errors -- INSERT INTO INT2_TBL VALUES ('100000'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int4.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int4.sql similarity index 90% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/int4.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int4.sql index 1012db72e1873..3a409eea34837 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int4.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int4.sql @@ -9,19 +9,23 @@ CREATE TABLE INT4_TBL(f1 int) USING parquet; -- [SPARK-28023] Trim the string when cast string type to other types -INSERT INTO INT4_TBL VALUES (trim(' 0 ')); +-- PostgreSQL implicitly casts string literals to data with integral types, but +-- Spark does not support that kind of implicit casts. 
+INSERT INTO INT4_TBL VALUES (int(trim(' 0 '))); -INSERT INTO INT4_TBL VALUES (trim('123456 ')); +INSERT INTO INT4_TBL VALUES (int(trim('123456 '))); -INSERT INTO INT4_TBL VALUES (trim(' -123456')); +INSERT INTO INT4_TBL VALUES (int(trim(' -123456'))); -- [SPARK-27923] Invalid input syntax for integer: "34.5" at PostgreSQL -- INSERT INTO INT4_TBL(f1) VALUES ('34.5'); -- largest and smallest values -INSERT INTO INT4_TBL VALUES ('2147483647'); +-- PostgreSQL implicitly casts string literals to data with integral types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO INT4_TBL VALUES (int('2147483647')); -INSERT INTO INT4_TBL VALUES ('-2147483647'); +INSERT INTO INT4_TBL VALUES (int('-2147483647')); -- [SPARK-27923] Spark SQL insert these bad inputs to NULL -- bad input values @@ -33,11 +37,6 @@ INSERT INTO INT4_TBL VALUES ('-2147483647'); -- INSERT INTO INT4_TBL(f1) VALUES ('123 5'); -- INSERT INTO INT4_TBL(f1) VALUES (''); --- We cannot test this when failOnOverFlow=true here --- because exception happens in the executors and the --- output stacktrace cannot have an exact match -set spark.sql.arithmeticOperations.failOnOverFlow=false; - SELECT '' AS five, * FROM INT4_TBL; SELECT '' AS four, i.* FROM INT4_TBL i WHERE i.f1 <> smallint('0'); diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int8.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int8.sql similarity index 94% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/int8.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int8.sql index d29bf3bfad4ca..5fea758e73084 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/int8.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/int8.sql @@ -8,11 +8,13 @@ -- CREATE TABLE INT8_TBL(q1 bigint, q2 bigint) USING parquet; -INSERT INTO INT8_TBL VALUES(trim(' 123 '),trim(' 456')); -INSERT INTO INT8_TBL VALUES(trim('123 '),'4567890123456789'); -INSERT INTO INT8_TBL 
VALUES('4567890123456789','123'); -INSERT INTO INT8_TBL VALUES(+4567890123456789,'4567890123456789'); -INSERT INTO INT8_TBL VALUES('+4567890123456789','-4567890123456789'); +-- PostgreSQL implicitly casts string literals to data with integral types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO INT8_TBL VALUES(bigint(trim(' 123 ')),bigint(trim(' 456'))); +INSERT INTO INT8_TBL VALUES(bigint(trim('123 ')),bigint('4567890123456789')); +INSERT INTO INT8_TBL VALUES(bigint('4567890123456789'),bigint('123')); +INSERT INTO INT8_TBL VALUES(+4567890123456789,bigint('4567890123456789')); +INSERT INTO INT8_TBL VALUES(bigint('+4567890123456789'),bigint('-4567890123456789')); -- [SPARK-27923] Spark SQL insert there bad inputs to NULL -- bad inputs diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/interval.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/interval.sql new file mode 100644 index 0000000000000..eb8cc34419519 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/interval.sql @@ -0,0 +1,344 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- INTERVAL +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/interval.sql + +-- [SPARK-28259] Date/Time Output Styles and Date Order Conventions +-- SET DATESTYLE = 'ISO'; +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to postgres; + +-- check acceptance of "time zone style" +-- [SPARK-29369] Accept strings without `interval` prefix in casting to intervals +-- [SPARK-29370] Interval strings without explicit unit markings +-- SELECT INTERVAL '01:00' AS `One hour`; +-- SELECT INTERVAL '+02:00' AS `Two hours`; +-- SELECT INTERVAL '-08:00' AS `Eight hours`; +-- SELECT INTERVAL '-1 +02:03' AS `22 hours ago...`; +-- SELECT INTERVAL '-1 days +02:03' AS `22 hours ago...`; +-- [SPARK-29371] Support interval field values with fractional parts +-- SELECT INTERVAL '1.5 weeks' AS 
`Ten days twelve hours`; +-- SELECT INTERVAL '1.5 months' AS `One month 15 days`; +-- SELECT INTERVAL '10 years -11 month -12 days +13:14' AS `9 years...`; + +-- [SPARK-29382] Support writing `INTERVAL` type to datasource table +-- CREATE TABLE INTERVAL_TBL (f1 interval); + +-- [SPARK-29383] Support the optional prefix `@` in interval strings +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 1 minute'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 5 hour'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 10 day'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 34 year'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 3 months'); +-- [SPARK-29384] Support `ago` in interval strings +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 14 seconds ago'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('1 day 2 hours 3 minutes 4 seconds'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('6 years'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('5 months'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('5 months 12 hours'); + +-- badly formatted interval +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('badly formatted interval'); +-- INSERT INTO INTERVAL_TBL (f1) VALUES ('@ 30 eons ago'); + +-- test interval operators + +-- SELECT '' AS ten, * FROM INTERVAL_TBL; +-- [SPARK-29385] Make `INTERVAL` values comparable +-- SELECT '' AS nine, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 <> interval '@ 10 days'; + +-- SELECT '' AS three, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 <= interval '@ 5 hours'; + +-- SELECT '' AS three, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 < interval '@ 1 day'; + +-- SELECT '' AS one, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 = interval '@ 34 years'; + +-- SELECT '' AS five, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 >= interval '@ 1 month'; + +-- SELECT '' AS nine, * FROM INTERVAL_TBL +-- WHERE INTERVAL_TBL.f1 > interval '@ 3 seconds ago'; + +-- SELECT '' AS fortyfive, r1.*, r2.* +-- FROM INTERVAL_TBL r1, INTERVAL_TBL r2 +-- WHERE r1.f1 > r2.f1 +-- ORDER BY r1.f1, r2.f1; + +-- 
Test intervals that are large enough to overflow 64 bits in comparisons +-- [SPARK-29369] Accept strings without `interval` prefix in casting to intervals +-- CREATE TEMP TABLE INTERVAL_TBL_OF (f1 interval); +-- INSERT INTO INTERVAL_TBL_OF (f1) VALUES +-- ('2147483647 days 2147483647 months'), +-- ('2147483647 days -2147483648 months'), +-- ('1 year'), +-- ('-2147483648 days 2147483647 months'), +-- ('-2147483648 days -2147483648 months'); +-- these should fail as out-of-range +-- INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('2147483648 days'); +-- INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('-2147483649 days'); +-- INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('2147483647 years'); +-- INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('-2147483648 years'); + +-- SELECT r1.*, r2.* +-- FROM INTERVAL_TBL_OF r1, INTERVAL_TBL_OF r2 +-- WHERE r1.f1 > r2.f1 +-- ORDER BY r1.f1, r2.f1; + +-- CREATE INDEX ON INTERVAL_TBL_OF USING btree (f1); +-- SET enable_seqscan TO false; +-- EXPLAIN (COSTS OFF) +-- SELECT f1 FROM INTERVAL_TBL_OF r1 ORDER BY f1; +-- SELECT f1 FROM INTERVAL_TBL_OF r1 ORDER BY f1; +-- RESET enable_seqscan; + +-- DROP TABLE INTERVAL_TBL_OF; + +-- Test multiplication and division with intervals. +-- Floating point arithmetic rounding errors can lead to unexpected results, +-- though the code attempts to do the right thing and round up to days and +-- minutes to avoid results such as '3 days 24:00 hours' or '14:20:60'. +-- Note that it is expected for some day components to be greater than 29 and +-- some time components be greater than 23:59:59 due to how intervals are +-- stored internally. +-- [SPARK-29386] Copy data between a file and a table +-- CREATE TABLE INTERVAL_MULDIV_TBL (span interval); +-- COPY INTERVAL_MULDIV_TBL FROM STDIN; +-- 41 mon 12 days 360:00 +-- -41 mon -12 days +360:00 +-- -12 days +-- 9 mon -27 days 12:34:56 +-- -3 years 482 days 76:54:32.189 +-- 4 mon +-- 14 mon +-- 999 mon 999 days +-- \. 
+-- [SPARK-29387] Support `*` and `\` operators for intervals +-- SELECT span * 0.3 AS product +-- FROM INTERVAL_MULDIV_TBL; + +-- SELECT span * 8.2 AS product +-- FROM INTERVAL_MULDIV_TBL; + +-- SELECT span / 10 AS quotient +-- FROM INTERVAL_MULDIV_TBL; + +-- SELECT span / 100 AS quotient +-- FROM INTERVAL_MULDIV_TBL; + +-- DROP TABLE INTERVAL_MULDIV_TBL; +-- [SPARK-28259] Date/Time Output Styles and Date Order Conventions +-- SET DATESTYLE = 'postgres'; +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to postgres_verbose; + +-- SELECT '' AS ten, * FROM INTERVAL_TBL; + +-- test avg(interval), which is somewhat fragile since people have been +-- known to change the allowed input syntax for type interval without +-- updating pg_aggregate.agginitval + +-- select avg(f1) from interval_tbl; + +-- test long interval input +-- [SPARK-29388] Construct intervals from the `millenniums`, `centuries` or `decades` units +-- select '4 millenniums 5 centuries 4 decades 1 year 4 months 4 days 17 minutes 31 seconds'::interval; + +-- test long interval output +-- Note: the actual maximum length of the interval output is longer, +-- but we need the test to work for both integer and floating-point +-- timestamps. 
+-- [SPARK-29389] Support synonyms for interval units +-- select '100000000y 10mon -1000000000d -100000h -10min -10.000001s ago'::interval; + +-- test justify_hours() and justify_days() +-- [SPARK-29390] Add the justify_days(), justify_hours() and justify_interval() functions +-- SELECT justify_hours(interval '6 months 3 days 52 hours 3 minutes 2 seconds') as `6 mons 5 days 4 hours 3 mins 2 seconds`; +-- SELECT justify_days(interval '6 months 36 days 5 hours 4 minutes 3 seconds') as `7 mons 6 days 5 hours 4 mins 3 seconds`; + +-- test justify_interval() + +-- SELECT justify_interval(interval '1 month -1 hour') as `1 month -1 hour`; + +-- test fractional second input, and detection of duplicate units +-- [SPARK-28259] Date/Time Output Styles and Date Order Conventions +-- SET DATESTYLE = 'ISO'; +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle TO postgres; +-- [SPARK-29369] Accept strings without `interval` prefix in casting to intervals +-- SELECT '1 millisecond'::interval, '1 microsecond'::interval, +-- '500 seconds 99 milliseconds 51 microseconds'::interval; +-- SELECT '3 days 5 milliseconds'::interval; + +-- SELECT '1 second 2 seconds'::interval; -- error +-- SELECT '10 milliseconds 20 milliseconds'::interval; -- error +-- SELECT '5.5 seconds 3 milliseconds'::interval; -- error +-- SELECT '1:20:05 5 microseconds'::interval; -- error +-- SELECT '1 day 1 day'::interval; -- error +-- [SPARK-29391] Default year-month units +-- SELECT interval '1-2'; -- SQL year-month literal +SELECT interval '999' second; -- oversize leading field is ok +SELECT interval '999' minute; +SELECT interval '999' hour; +SELECT interval '999' day; +SELECT interval '999' month; + +-- test SQL-spec syntaxes for restricted field sets +SELECT interval '1' year; +SELECT interval '2' month; +SELECT interval '3' day; +SELECT interval '4' hour; +SELECT interval '5' minute; +SELECT interval '6' second; +-- [SPARK-29391] Default year-month units +-- SELECT interval '1' year to month; 
+SELECT interval '1-2' year to month; +-- [SPARK-29391] Default year-month units +-- SELECT interval '1 2' day to hour; +SELECT interval '1 2:03' day to hour; +SELECT interval '1 2:03:04' day to hour; +-- SELECT interval '1 2' day to minute; +SELECT interval '1 2:03' day to minute; +SELECT interval '1 2:03:04' day to minute; +-- SELECT interval '1 2' day to second; +SELECT interval '1 2:03' day to second; +SELECT interval '1 2:03:04' day to second; +-- SELECT interval '1 2' hour to minute; +SELECT interval '1 2:03' hour to minute; +SELECT interval '1 2:03:04' hour to minute; +-- SELECT interval '1 2' hour to second; +SELECT interval '1 2:03' hour to second; +SELECT interval '1 2:03:04' hour to second; +-- SELECT interval '1 2' minute to second; +SELECT interval '1 2:03' minute to second; +SELECT interval '1 2:03:04' minute to second; +-- [SPARK-29370] Interval strings without explicit unit markings +-- SELECT interval '1 +2:03' minute to second; +-- SELECT interval '1 +2:03:04' minute to second; +-- SELECT interval '1 -2:03' minute to second; +-- SELECT interval '1 -2:03:04' minute to second; +-- SELECT interval '123 11' day to hour; -- ok +-- SELECT interval '123 11' day; -- not ok +-- SELECT interval '123 11'; -- not ok, too ambiguous +-- SELECT interval '123 2:03 -2:04'; -- not ok, redundant hh:mm fields + +-- test syntaxes for restricted precision +-- [SPARK-29395] Precision of the interval type +-- SELECT interval(0) '1 day 01:23:45.6789'; +-- SELECT interval(2) '1 day 01:23:45.6789'; +-- SELECT interval '12:34.5678' minute to second(2); -- per SQL spec +-- SELECT interval '1.234' second; +-- SELECT interval '1.234' second(2); +-- SELECT interval '1 2.345' day to second(2); +-- SELECT interval '1 2:03' day to second(2); +-- SELECT interval '1 2:03.4567' day to second(2); +-- SELECT interval '1 2:03:04.5678' day to second(2); +-- SELECT interval '1 2.345' hour to second(2); +-- SELECT interval '1 2:03.45678' hour to second(2); +-- SELECT interval '1 
2:03:04.5678' hour to second(2); +-- SELECT interval '1 2.3456' minute to second(2); +-- SELECT interval '1 2:03.5678' minute to second(2); +-- SELECT interval '1 2:03:04.5678' minute to second(2); + +-- test casting to restricted precision (bug #14479) +-- SELECT f1, f1::INTERVAL DAY TO MINUTE AS `minutes`, +-- (f1 + INTERVAL '1 month')::INTERVAL MONTH::INTERVAL YEAR AS `years` +-- FROM interval_tbl; + +-- test inputting and outputting SQL standard interval literals +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle TO sql_standard; +-- [SPARK-29407] Support syntax for zero interval +-- SELECT interval '0' AS zero, +-- interval '1-2' year to month AS `year-month`, +-- interval '1 2:03:04' day to second AS `day-time`, +-- [SPARK-29408] Support interval literal with negative sign `-` +-- - interval '1-2' AS `negative year-month`, +-- - interval '1 2:03:04' AS `negative day-time`; + +-- test input of some not-quite-standard interval values in the sql style +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle TO postgres; +-- SELECT interval '+1 -1:00:00', +-- interval '-1 +1:00:00', +-- interval '+1-2 -3 +4:05:06.789', +-- interval '-1-2 +3 -4:05:06.789'; + +-- test output of couple non-standard interval values in the sql style +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle TO sql_standard; +-- SELECT interval '1 day -1 hours', +-- interval '-1 days +1 hours', +-- interval '1 years 2 months -3 days 4 hours 5 minutes 6.789 seconds', +-- - interval '1 years 2 months -3 days 4 hours 5 minutes 6.789 seconds'; + +-- test outputting iso8601 intervals +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to iso_8601; +-- select interval '0' AS zero, +-- interval '1-2' AS `a year 2 months`, +-- interval '1 2:03:04' AS `a bit over a day`, +-- interval '2:03:04.45679' AS `a bit over 2 hours`, +-- (interval '1-2' + interval '3 4:05:06.7') AS `all fields`, +-- (interval '1-2' - interval '3 4:05:06.7') AS `mixed sign`, +-- (- 
interval '1-2' + interval '3 4:05:06.7') AS negative; + +-- test inputting ISO 8601 4.4.2.1 "Format With Time Unit Designators" +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to sql_standard; +-- [SPARK-29394] Support ISO 8601 format for intervals +-- select interval 'P0Y' AS zero, +-- interval 'P1Y2M' AS `a year 2 months`, +-- interval 'P1W' AS `a week`, +-- interval 'P1DT2H3M4S' AS `a bit over a day`, +-- interval 'P1Y2M3DT4H5M6.7S' AS `all fields`, +-- interval 'P-1Y-2M-3DT-4H-5M-6.7S' AS negative, +-- interval 'PT-0.1S' AS `fractional second`; + +-- test inputting ISO 8601 4.4.2.2 "Alternative Format" +-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to postgres; +-- select interval 'P00021015T103020' AS `ISO8601 Basic Format`, +-- interval 'P0002-10-15T10:30:20' AS `ISO8601 Extended Format`; + +-- Make sure optional ISO8601 alternative format fields are optional. +-- select interval 'P0002' AS `year only`, +-- interval 'P0002-10' AS `year month`, +-- interval 'P0002-10-15' AS `year month day`, +-- interval 'P0002T1S' AS `year only plus time`, +-- interval 'P0002-10T1S' AS `year month plus time`, +-- interval 'P0002-10-15T1S' AS `year month day plus time`, +-- interval 'PT10' AS `hour only`, +-- interval 'PT10:30' AS `hour minute`; + +-- test a couple rounding cases that changed since 8.3 w/ HAVE_INT64_TIMESTAMP. 
+-- [SPARK-29406] Interval output styles +-- SET IntervalStyle to postgres_verbose; +-- select interval '-10 mons -3 days +03:55:06.70'; +-- select interval '1 year 2 mons 3 days 04:05:06.699999'; +-- select interval '0:0:0.7', interval '@ 0.70 secs', interval '0.7 seconds'; + +-- check that '30 days' equals '1 month' according to the hash function +-- [SPARK-29385] Make `INTERVAL` values comparable +-- select '30 days'::interval = '1 month'::interval as t; +-- select interval_hash('30 days'::interval) = interval_hash('1 month'::interval) as t; + +-- numeric constructor +-- [SPARK-29393] Add the make_interval() function +-- select make_interval(years := 2); +-- select make_interval(years := 1, months := 6); +-- select make_interval(years := 1, months := -1, weeks := 5, days := -7, hours := 25, mins := -180); + +-- select make_interval() = make_interval(years := 0, months := 0, weeks := 0, days := 0, mins := 0, secs := 0.0); +-- select make_interval(hours := -2, mins := -10, secs := -25.3); + +-- select make_interval(years := 'inf'::float::int); +-- select make_interval(months := 'NaN'::float::int); +-- select make_interval(secs := 'inf'); +-- select make_interval(secs := 'NaN'); +-- select make_interval(secs := 7e12); diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/join.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql similarity index 98% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/join.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql index 08f54fe0a40e5..cc07b00cc3670 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql @@ -6,6 +6,19 @@ -- Test JOIN clauses -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/join.sql -- + +-- There are 2 dimensions we want to test +-- 1. run with broadcast hash join, sort merge join or shuffle hash join. +-- 2. 
run with whole-stage-codegen, operator codegen or no codegen. + +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760 +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +--CONFIG_DIM2 spark.sql.codegen.wholeStage=true +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) AS v(f1); @@ -577,15 +590,15 @@ select count(*) from tenk1 a, tenk1 b -- regression test for 8.2 bug with improper re-ordering of left joins -- -DROP TABLE IF EXISTS tt3; -CREATE TABLE tt3(f1 int, f2 string) USING parquet; -INSERT INTO tt3 SELECT x.id, repeat('xyzzy', 100) FROM range(1,10001) x; +create or replace temporary view tt3 as select * from + (SELECT cast(x.id as int), repeat('xyzzy', 100) FROM range(1,10001) x) + as v(f1, f2); -- create index tt3i on tt3(f1); -- analyze tt3; -DROP TABLE IF EXISTS tt4; -CREATE TABLE tt4(f1 int) USING parquet; -INSERT INTO tt4 VALUES (0),(1),(9999); +create or replace temporary view tt4 as select * from + (values (0), (1), (9999)) + as v(f1); -- analyze tt4; SELECT a.f1 diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/limit.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/limit.sql new file mode 100644 index 0000000000000..bc0b5d6dddc52 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/limit.sql @@ -0,0 +1,164 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- LIMIT +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/limit.sql + +SELECT '' AS two, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 50 + ORDER BY unique1 LIMIT 2; +SELECT '' 
AS five, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 60 + ORDER BY unique1 LIMIT 5; +SELECT '' AS two, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 60 AND unique1 < 63 + ORDER BY unique1 LIMIT 5; +-- [SPARK-28330] ANSI SQL: Top-level in +-- SELECT '' AS three, unique1, unique2, stringu1 +-- FROM onek WHERE unique1 > 100 +-- ORDER BY unique1 LIMIT 3 OFFSET 20; +-- SELECT '' AS zero, unique1, unique2, stringu1 +-- FROM onek WHERE unique1 < 50 +-- ORDER BY unique1 DESC LIMIT 8 OFFSET 99; +-- SELECT '' AS eleven, unique1, unique2, stringu1 +-- FROM onek WHERE unique1 < 50 +-- ORDER BY unique1 DESC LIMIT 20 OFFSET 39; +-- SELECT '' AS ten, unique1, unique2, stringu1 +-- FROM onek +-- ORDER BY unique1 OFFSET 990; +-- SELECT '' AS five, unique1, unique2, stringu1 +-- FROM onek +-- ORDER BY unique1 OFFSET 990 LIMIT 5; +-- SELECT '' AS five, unique1, unique2, stringu1 +-- FROM onek +-- ORDER BY unique1 LIMIT 5 OFFSET 900; + +CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM + (VALUES + (123, 456), + (123, 4567890123456789), + (4567890123456789, 123), + (4567890123456789, 4567890123456789), + (4567890123456789, -4567890123456789)) + AS v(q1, q2); + +-- Test null limit and offset. 
The planner would discard a simple null +-- constant, so to ensure executor is exercised, do this: +-- [SPARK-29650] Discard a NULL constant in LIMIT +select * from int8_tbl limit (case when random() < 0.5 then bigint(null) end); +-- [SPARK-28330] ANSI SQL: Top-level in +-- select * from int8_tbl offset (case when random() < 0.5 then bigint(null) end); + +-- Test assorted cases involving backwards fetch from a LIMIT plan node +-- [SPARK-20965] Support PREPARE/EXECUTE/DECLARE/FETCH statements +-- begin; +-- +-- declare c1 cursor for select * from int8_tbl limit 10; +-- fetch all in c1; +-- fetch 1 in c1; +-- fetch backward 1 in c1; +-- fetch backward all in c1; +-- fetch backward 1 in c1; +-- fetch all in c1; +-- +-- declare c2 cursor for select * from int8_tbl limit 3; +-- fetch all in c2; +-- fetch 1 in c2; +-- fetch backward 1 in c2; +-- fetch backward all in c2; +-- fetch backward 1 in c2; +-- fetch all in c2; +-- +-- declare c3 cursor for select * from int8_tbl offset 3; +-- fetch all in c3; +-- fetch 1 in c3; +-- fetch backward 1 in c3; +-- fetch backward all in c3; +-- fetch backward 1 in c3; +-- fetch all in c3; +-- +-- declare c4 cursor for select * from int8_tbl offset 10; +-- fetch all in c4; +-- fetch 1 in c4; +-- fetch backward 1 in c4; +-- fetch backward all in c4; +-- fetch backward 1 in c4; +-- fetch all in c4; +-- +-- rollback; + +DROP VIEW INT8_TBL; + +-- Stress test for variable LIMIT in conjunction with bounded-heap sorting + +-- [SPARK-28330] ANSI SQL: Top-level in +-- SELECT +-- (SELECT n +-- FROM (VALUES (1)) AS x, +-- (SELECT n FROM generate_series(1,10) AS n +-- ORDER BY n LIMIT 1 OFFSET s-1) AS y) AS z +-- FROM generate_series(1,10) AS s; + +-- +-- Test behavior of volatile and set-returning functions in conjunction +-- with ORDER BY and LIMIT. 
+-- + +-- [SPARK-29631] Support ANSI SQL CREATE SEQUENCE +-- create temp sequence testseq; + +-- explain (verbose, costs off) +-- select unique1, unique2, nextval('testseq') +-- from tenk1 order by unique2 limit 10; + +-- select unique1, unique2, nextval('testseq') +-- from tenk1 order by unique2 limit 10; + +-- select currval('testseq'); + +-- explain (verbose, costs off) +-- select unique1, unique2, nextval('testseq') +-- from tenk1 order by tenthous limit 10; + +-- select unique1, unique2, nextval('testseq') +-- from tenk1 order by tenthous limit 10; + +-- select currval('testseq'); + +-- explain (verbose, costs off) +-- select unique1, unique2, generate_series(1,10) +-- from tenk1 order by unique2 limit 7; + +-- [SPARK-27767] Built-in function: generate_series +-- select unique1, unique2, generate_series(1,10) +-- from tenk1 order by unique2 limit 7; + +-- explain (verbose, costs off) +-- select unique1, unique2, generate_series(1,10) +-- from tenk1 order by tenthous limit 7; + +-- [SPARK-27767] Built-in function: generate_series +-- select unique1, unique2, generate_series(1,10) +-- from tenk1 order by tenthous limit 7; + +-- use of random() is to keep planner from folding the expressions together +-- explain (verbose, costs off) +-- select generate_series(0,2) as s1, generate_series((random()*.1)::int,2) as s2; + +-- [SPARK-27767] Built-in function: generate_series +-- select generate_series(0,2) as s1, generate_series((random()*.1)::int,2) as s2; + +-- explain (verbose, costs off) +-- select generate_series(0,2) as s1, generate_series((random()*.1)::int,2) as s2 +-- order by s2 desc; + +-- [SPARK-27767] Built-in function: generate_series +-- select generate_series(0,2) as s1, generate_series((random()*.1)::int,2) as s2 +-- order by s2 desc; + +-- test for failure to set all aggregates' aggtranstype +-- explain (verbose, costs off) +-- select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 +-- from tenk1 group by thousand order by thousand limit 3; + 
+select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 + from tenk1 group by thousand order by thousand limit 3; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql new file mode 100644 index 0000000000000..dbdb2cace0e0c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/numeric.sql @@ -0,0 +1,1150 @@ +-- +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- +-- NUMERIC +-- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/numeric.sql +-- + +-- [SPARK-28318] Decimal can only support precision up to 38. We rewrite numeric(210,10) to decimal(38,10). +CREATE TABLE num_data (id int, val decimal(38,10)) USING parquet; +CREATE TABLE num_exp_add (id1 int, id2 int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_sub (id1 int, id2 int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_div (id1 int, id2 int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_mul (id1 int, id2 int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_sqrt (id int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_ln (id int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_log10 (id int, expected decimal(38,10)) USING parquet; +CREATE TABLE num_exp_power_10_ln (id int, expected decimal(38,10)) USING parquet; + +CREATE TABLE num_result (id1 int, id2 int, result decimal(38,10)) USING parquet; + + +-- ****************************** +-- * The following EXPECTED results are computed by bc(1) +-- * with a scale of 200 +-- ****************************** + +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. 
+INSERT INTO num_exp_add VALUES (0,0,0); +INSERT INTO num_exp_sub VALUES (0,0,0); +INSERT INTO num_exp_mul VALUES (0,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (0,0,double('NaN')); +INSERT INTO num_exp_add VALUES (0,1,0); +INSERT INTO num_exp_sub VALUES (0,1,0); +INSERT INTO num_exp_mul VALUES (0,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (0,1,double('NaN')); +INSERT INTO num_exp_add VALUES (0,2,-34338492.215397047); +INSERT INTO num_exp_sub VALUES (0,2,34338492.215397047); +INSERT INTO num_exp_mul VALUES (0,2,0); +INSERT INTO num_exp_div VALUES (0,2,0); +INSERT INTO num_exp_add VALUES (0,3,4.31); +INSERT INTO num_exp_sub VALUES (0,3,-4.31); +INSERT INTO num_exp_mul VALUES (0,3,0); +INSERT INTO num_exp_div VALUES (0,3,0); +INSERT INTO num_exp_add VALUES (0,4,7799461.4119); +INSERT INTO num_exp_sub VALUES (0,4,-7799461.4119); +INSERT INTO num_exp_mul VALUES (0,4,0); +INSERT INTO num_exp_div VALUES (0,4,0); +INSERT INTO num_exp_add VALUES (0,5,16397.038491); +INSERT INTO num_exp_sub VALUES (0,5,-16397.038491); +INSERT INTO num_exp_mul VALUES (0,5,0); +INSERT INTO num_exp_div VALUES (0,5,0); +INSERT INTO num_exp_add VALUES (0,6,93901.57763026); +INSERT INTO num_exp_sub VALUES (0,6,-93901.57763026); +INSERT INTO num_exp_mul VALUES (0,6,0); +INSERT INTO num_exp_div VALUES (0,6,0); +INSERT INTO num_exp_add VALUES (0,7,-83028485); +INSERT INTO num_exp_sub VALUES (0,7,83028485); +INSERT INTO num_exp_mul VALUES (0,7,0); +INSERT INTO num_exp_div VALUES (0,7,0); +INSERT INTO num_exp_add VALUES (0,8,74881); +INSERT INTO num_exp_sub VALUES (0,8,-74881); +INSERT INTO num_exp_mul VALUES (0,8,0); +INSERT INTO num_exp_div VALUES (0,8,0); +INSERT INTO num_exp_add VALUES (0,9,-24926804.045047420); +INSERT INTO num_exp_sub VALUES (0,9,24926804.045047420); +INSERT INTO num_exp_mul VALUES (0,9,0); +INSERT INTO num_exp_div VALUES (0,9,0); +INSERT INTO num_exp_add VALUES (1,0,0); +INSERT INTO 
num_exp_sub VALUES (1,0,0); +INSERT INTO num_exp_mul VALUES (1,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (1,0,double('NaN')); +INSERT INTO num_exp_add VALUES (1,1,0); +INSERT INTO num_exp_sub VALUES (1,1,0); +INSERT INTO num_exp_mul VALUES (1,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (1,1,double('NaN')); +INSERT INTO num_exp_add VALUES (1,2,-34338492.215397047); +INSERT INTO num_exp_sub VALUES (1,2,34338492.215397047); +INSERT INTO num_exp_mul VALUES (1,2,0); +INSERT INTO num_exp_div VALUES (1,2,0); +INSERT INTO num_exp_add VALUES (1,3,4.31); +INSERT INTO num_exp_sub VALUES (1,3,-4.31); +INSERT INTO num_exp_mul VALUES (1,3,0); +INSERT INTO num_exp_div VALUES (1,3,0); +INSERT INTO num_exp_add VALUES (1,4,7799461.4119); +INSERT INTO num_exp_sub VALUES (1,4,-7799461.4119); +INSERT INTO num_exp_mul VALUES (1,4,0); +INSERT INTO num_exp_div VALUES (1,4,0); +INSERT INTO num_exp_add VALUES (1,5,16397.038491); +INSERT INTO num_exp_sub VALUES (1,5,-16397.038491); +INSERT INTO num_exp_mul VALUES (1,5,0); +INSERT INTO num_exp_div VALUES (1,5,0); +INSERT INTO num_exp_add VALUES (1,6,93901.57763026); +INSERT INTO num_exp_sub VALUES (1,6,-93901.57763026); +INSERT INTO num_exp_mul VALUES (1,6,0); +INSERT INTO num_exp_div VALUES (1,6,0); +INSERT INTO num_exp_add VALUES (1,7,-83028485); +INSERT INTO num_exp_sub VALUES (1,7,83028485); +INSERT INTO num_exp_mul VALUES (1,7,0); +INSERT INTO num_exp_div VALUES (1,7,0); +INSERT INTO num_exp_add VALUES (1,8,74881); +INSERT INTO num_exp_sub VALUES (1,8,-74881); +INSERT INTO num_exp_mul VALUES (1,8,0); +INSERT INTO num_exp_div VALUES (1,8,0); +INSERT INTO num_exp_add VALUES (1,9,-24926804.045047420); +INSERT INTO num_exp_sub VALUES (1,9,24926804.045047420); +INSERT INTO num_exp_mul VALUES (1,9,0); +INSERT INTO num_exp_div VALUES (1,9,0); +INSERT INTO num_exp_add VALUES (2,0,-34338492.215397047); +INSERT INTO num_exp_sub VALUES 
(2,0,-34338492.215397047); +INSERT INTO num_exp_mul VALUES (2,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (2,0,double('NaN')); +INSERT INTO num_exp_add VALUES (2,1,-34338492.215397047); +INSERT INTO num_exp_sub VALUES (2,1,-34338492.215397047); +INSERT INTO num_exp_mul VALUES (2,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (2,1,double('NaN')); +INSERT INTO num_exp_add VALUES (2,2,-68676984.430794094); +INSERT INTO num_exp_sub VALUES (2,2,0); +INSERT INTO num_exp_mul VALUES (2,2,1179132047626883.596862135856320209); +INSERT INTO num_exp_div VALUES (2,2,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (2,3,-34338487.905397047); +INSERT INTO num_exp_sub VALUES (2,3,-34338496.525397047); +INSERT INTO num_exp_mul VALUES (2,3,-147998901.44836127257); +INSERT INTO num_exp_div VALUES (2,3,-7967167.56737750510440835266); +INSERT INTO num_exp_add VALUES (2,4,-26539030.803497047); +INSERT INTO num_exp_sub VALUES (2,4,-42137953.627297047); +INSERT INTO num_exp_mul VALUES (2,4,-267821744976817.8111137106593); +INSERT INTO num_exp_div VALUES (2,4,-4.40267480046830116685); +INSERT INTO num_exp_add VALUES (2,5,-34322095.176906047); +INSERT INTO num_exp_sub VALUES (2,5,-34354889.253888047); +INSERT INTO num_exp_mul VALUES (2,5,-563049578578.769242506736077); +INSERT INTO num_exp_div VALUES (2,5,-2094.18866914563535496429); +INSERT INTO num_exp_add VALUES (2,6,-34244590.637766787); +INSERT INTO num_exp_sub VALUES (2,6,-34432393.793027307); +INSERT INTO num_exp_mul VALUES (2,6,-3224438592470.18449811926184222); +INSERT INTO num_exp_div VALUES (2,6,-365.68599891479766440940); +INSERT INTO num_exp_add VALUES (2,7,-117366977.215397047); +INSERT INTO num_exp_sub VALUES (2,7,48689992.784602953); +INSERT INTO num_exp_mul VALUES (2,7,2851072985828710.485883795); +INSERT INTO num_exp_div VALUES (2,7,.41357483778485235518); +INSERT INTO num_exp_add VALUES (2,8,-34263611.215397047); +INSERT 
INTO num_exp_sub VALUES (2,8,-34413373.215397047); +INSERT INTO num_exp_mul VALUES (2,8,-2571300635581.146276407); +INSERT INTO num_exp_div VALUES (2,8,-458.57416721727870888476); +INSERT INTO num_exp_add VALUES (2,9,-59265296.260444467); +INSERT INTO num_exp_sub VALUES (2,9,-9411688.170349627); +INSERT INTO num_exp_mul VALUES (2,9,855948866655588.453741509242968740); +INSERT INTO num_exp_div VALUES (2,9,1.37757299946438931811); +INSERT INTO num_exp_add VALUES (3,0,4.31); +INSERT INTO num_exp_sub VALUES (3,0,4.31); +INSERT INTO num_exp_mul VALUES (3,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (3,0,double('NaN')); +INSERT INTO num_exp_add VALUES (3,1,4.31); +INSERT INTO num_exp_sub VALUES (3,1,4.31); +INSERT INTO num_exp_mul VALUES (3,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (3,1,double('NaN')); +INSERT INTO num_exp_add VALUES (3,2,-34338487.905397047); +INSERT INTO num_exp_sub VALUES (3,2,34338496.525397047); +INSERT INTO num_exp_mul VALUES (3,2,-147998901.44836127257); +INSERT INTO num_exp_div VALUES (3,2,-.00000012551512084352); +INSERT INTO num_exp_add VALUES (3,3,8.62); +INSERT INTO num_exp_sub VALUES (3,3,0); +INSERT INTO num_exp_mul VALUES (3,3,18.5761); +INSERT INTO num_exp_div VALUES (3,3,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (3,4,7799465.7219); +INSERT INTO num_exp_sub VALUES (3,4,-7799457.1019); +INSERT INTO num_exp_mul VALUES (3,4,33615678.685289); +INSERT INTO num_exp_div VALUES (3,4,.00000055260225961552); +INSERT INTO num_exp_add VALUES (3,5,16401.348491); +INSERT INTO num_exp_sub VALUES (3,5,-16392.728491); +INSERT INTO num_exp_mul VALUES (3,5,70671.23589621); +INSERT INTO num_exp_div VALUES (3,5,.00026285234387695504); +INSERT INTO num_exp_add VALUES (3,6,93905.88763026); +INSERT INTO num_exp_sub VALUES (3,6,-93897.26763026); +INSERT INTO num_exp_mul VALUES (3,6,404715.7995864206); +INSERT INTO num_exp_div VALUES 
(3,6,.00004589912234457595); +INSERT INTO num_exp_add VALUES (3,7,-83028480.69); +INSERT INTO num_exp_sub VALUES (3,7,83028489.31); +INSERT INTO num_exp_mul VALUES (3,7,-357852770.35); +INSERT INTO num_exp_div VALUES (3,7,-.00000005190989574240); +INSERT INTO num_exp_add VALUES (3,8,74885.31); +INSERT INTO num_exp_sub VALUES (3,8,-74876.69); +INSERT INTO num_exp_mul VALUES (3,8,322737.11); +INSERT INTO num_exp_div VALUES (3,8,.00005755799201399553); +INSERT INTO num_exp_add VALUES (3,9,-24926799.735047420); +INSERT INTO num_exp_sub VALUES (3,9,24926808.355047420); +INSERT INTO num_exp_mul VALUES (3,9,-107434525.43415438020); +INSERT INTO num_exp_div VALUES (3,9,-.00000017290624149854); +INSERT INTO num_exp_add VALUES (4,0,7799461.4119); +INSERT INTO num_exp_sub VALUES (4,0,7799461.4119); +INSERT INTO num_exp_mul VALUES (4,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (4,0,double('NaN')); +INSERT INTO num_exp_add VALUES (4,1,7799461.4119); +INSERT INTO num_exp_sub VALUES (4,1,7799461.4119); +INSERT INTO num_exp_mul VALUES (4,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (4,1,double('NaN')); +INSERT INTO num_exp_add VALUES (4,2,-26539030.803497047); +INSERT INTO num_exp_sub VALUES (4,2,42137953.627297047); +INSERT INTO num_exp_mul VALUES (4,2,-267821744976817.8111137106593); +INSERT INTO num_exp_div VALUES (4,2,-.22713465002993920385); +INSERT INTO num_exp_add VALUES (4,3,7799465.7219); +INSERT INTO num_exp_sub VALUES (4,3,7799457.1019); +INSERT INTO num_exp_mul VALUES (4,3,33615678.685289); +INSERT INTO num_exp_div VALUES (4,3,1809619.81714617169373549883); +INSERT INTO num_exp_add VALUES (4,4,15598922.8238); +INSERT INTO num_exp_sub VALUES (4,4,0); +INSERT INTO num_exp_mul VALUES (4,4,60831598315717.14146161); +INSERT INTO num_exp_div VALUES (4,4,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (4,5,7815858.450391); +INSERT INTO num_exp_sub VALUES 
(4,5,7783064.373409); +INSERT INTO num_exp_mul VALUES (4,5,127888068979.9935054429); +INSERT INTO num_exp_div VALUES (4,5,475.66281046305802686061); +INSERT INTO num_exp_add VALUES (4,6,7893362.98953026); +INSERT INTO num_exp_sub VALUES (4,6,7705559.83426974); +INSERT INTO num_exp_mul VALUES (4,6,732381731243.745115764094); +INSERT INTO num_exp_div VALUES (4,6,83.05996138436129499606); +INSERT INTO num_exp_add VALUES (4,7,-75229023.5881); +INSERT INTO num_exp_sub VALUES (4,7,90827946.4119); +INSERT INTO num_exp_mul VALUES (4,7,-647577464846017.9715); +INSERT INTO num_exp_div VALUES (4,7,-.09393717604145131637); +INSERT INTO num_exp_add VALUES (4,8,7874342.4119); +INSERT INTO num_exp_sub VALUES (4,8,7724580.4119); +INSERT INTO num_exp_mul VALUES (4,8,584031469984.4839); +INSERT INTO num_exp_div VALUES (4,8,104.15808298366741897143); +INSERT INTO num_exp_add VALUES (4,9,-17127342.633147420); +INSERT INTO num_exp_sub VALUES (4,9,32726265.456947420); +INSERT INTO num_exp_mul VALUES (4,9,-194415646271340.1815956522980); +INSERT INTO num_exp_div VALUES (4,9,-.31289456112403769409); +INSERT INTO num_exp_add VALUES (5,0,16397.038491); +INSERT INTO num_exp_sub VALUES (5,0,16397.038491); +INSERT INTO num_exp_mul VALUES (5,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (5,0,double('NaN')); +INSERT INTO num_exp_add VALUES (5,1,16397.038491); +INSERT INTO num_exp_sub VALUES (5,1,16397.038491); +INSERT INTO num_exp_mul VALUES (5,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (5,1,double('NaN')); +INSERT INTO num_exp_add VALUES (5,2,-34322095.176906047); +INSERT INTO num_exp_sub VALUES (5,2,34354889.253888047); +INSERT INTO num_exp_mul VALUES (5,2,-563049578578.769242506736077); +INSERT INTO num_exp_div VALUES (5,2,-.00047751189505192446); +INSERT INTO num_exp_add VALUES (5,3,16401.348491); +INSERT INTO num_exp_sub VALUES (5,3,16392.728491); +INSERT INTO num_exp_mul VALUES 
(5,3,70671.23589621); +INSERT INTO num_exp_div VALUES (5,3,3804.41728329466357308584); +INSERT INTO num_exp_add VALUES (5,4,7815858.450391); +INSERT INTO num_exp_sub VALUES (5,4,-7783064.373409); +INSERT INTO num_exp_mul VALUES (5,4,127888068979.9935054429); +INSERT INTO num_exp_div VALUES (5,4,.00210232958726897192); +INSERT INTO num_exp_add VALUES (5,5,32794.076982); +INSERT INTO num_exp_sub VALUES (5,5,0); +INSERT INTO num_exp_mul VALUES (5,5,268862871.275335557081); +INSERT INTO num_exp_div VALUES (5,5,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (5,6,110298.61612126); +INSERT INTO num_exp_sub VALUES (5,6,-77504.53913926); +INSERT INTO num_exp_mul VALUES (5,6,1539707782.76899778633766); +INSERT INTO num_exp_div VALUES (5,6,.17461941433576102689); +INSERT INTO num_exp_add VALUES (5,7,-83012087.961509); +INSERT INTO num_exp_sub VALUES (5,7,83044882.038491); +INSERT INTO num_exp_mul VALUES (5,7,-1361421264394.416135); +INSERT INTO num_exp_div VALUES (5,7,-.00019748690453643710); +INSERT INTO num_exp_add VALUES (5,8,91278.038491); +INSERT INTO num_exp_sub VALUES (5,8,-58483.961509); +INSERT INTO num_exp_mul VALUES (5,8,1227826639.244571); +INSERT INTO num_exp_div VALUES (5,8,.21897461960978085228); +INSERT INTO num_exp_add VALUES (5,9,-24910407.006556420); +INSERT INTO num_exp_sub VALUES (5,9,24943201.083538420); +INSERT INTO num_exp_mul VALUES (5,9,-408725765384.257043660243220); +INSERT INTO num_exp_div VALUES (5,9,-.00065780749354660427); +INSERT INTO num_exp_add VALUES (6,0,93901.57763026); +INSERT INTO num_exp_sub VALUES (6,0,93901.57763026); +INSERT INTO num_exp_mul VALUES (6,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (6,0,double('NaN')); +INSERT INTO num_exp_add VALUES (6,1,93901.57763026); +INSERT INTO num_exp_sub VALUES (6,1,93901.57763026); +INSERT INTO num_exp_mul VALUES (6,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (6,1,double('NaN')); +INSERT 
INTO num_exp_add VALUES (6,2,-34244590.637766787); +INSERT INTO num_exp_sub VALUES (6,2,34432393.793027307); +INSERT INTO num_exp_mul VALUES (6,2,-3224438592470.18449811926184222); +INSERT INTO num_exp_div VALUES (6,2,-.00273458651128995823); +INSERT INTO num_exp_add VALUES (6,3,93905.88763026); +INSERT INTO num_exp_sub VALUES (6,3,93897.26763026); +INSERT INTO num_exp_mul VALUES (6,3,404715.7995864206); +INSERT INTO num_exp_div VALUES (6,3,21786.90896293735498839907); +INSERT INTO num_exp_add VALUES (6,4,7893362.98953026); +INSERT INTO num_exp_sub VALUES (6,4,-7705559.83426974); +INSERT INTO num_exp_mul VALUES (6,4,732381731243.745115764094); +INSERT INTO num_exp_div VALUES (6,4,.01203949512295682469); +INSERT INTO num_exp_add VALUES (6,5,110298.61612126); +INSERT INTO num_exp_sub VALUES (6,5,77504.53913926); +INSERT INTO num_exp_mul VALUES (6,5,1539707782.76899778633766); +INSERT INTO num_exp_div VALUES (6,5,5.72674008674192359679); +INSERT INTO num_exp_add VALUES (6,6,187803.15526052); +INSERT INTO num_exp_sub VALUES (6,6,0); +INSERT INTO num_exp_mul VALUES (6,6,8817506281.4517452372676676); +INSERT INTO num_exp_div VALUES (6,6,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (6,7,-82934583.42236974); +INSERT INTO num_exp_sub VALUES (6,7,83122386.57763026); +INSERT INTO num_exp_mul VALUES (6,7,-7796505729750.37795610); +INSERT INTO num_exp_div VALUES (6,7,-.00113095617281538980); +INSERT INTO num_exp_add VALUES (6,8,168782.57763026); +INSERT INTO num_exp_sub VALUES (6,8,19020.57763026); +INSERT INTO num_exp_mul VALUES (6,8,7031444034.53149906); +INSERT INTO num_exp_div VALUES (6,8,1.25401073209839612184); +INSERT INTO num_exp_add VALUES (6,9,-24832902.467417160); +INSERT INTO num_exp_sub VALUES (6,9,25020705.622677680); +INSERT INTO num_exp_mul VALUES (6,9,-2340666225110.29929521292692920); +INSERT INTO num_exp_div VALUES (6,9,-.00376709254265256789); +INSERT INTO num_exp_add VALUES (7,0,-83028485); +INSERT INTO num_exp_sub VALUES (7,0,-83028485); 
+INSERT INTO num_exp_mul VALUES (7,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (7,0,double('NaN')); +INSERT INTO num_exp_add VALUES (7,1,-83028485); +INSERT INTO num_exp_sub VALUES (7,1,-83028485); +INSERT INTO num_exp_mul VALUES (7,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (7,1,double('NaN')); +INSERT INTO num_exp_add VALUES (7,2,-117366977.215397047); +INSERT INTO num_exp_sub VALUES (7,2,-48689992.784602953); +INSERT INTO num_exp_mul VALUES (7,2,2851072985828710.485883795); +INSERT INTO num_exp_div VALUES (7,2,2.41794207151503385700); +INSERT INTO num_exp_add VALUES (7,3,-83028480.69); +INSERT INTO num_exp_sub VALUES (7,3,-83028489.31); +INSERT INTO num_exp_mul VALUES (7,3,-357852770.35); +INSERT INTO num_exp_div VALUES (7,3,-19264149.65197215777262180974); +INSERT INTO num_exp_add VALUES (7,4,-75229023.5881); +INSERT INTO num_exp_sub VALUES (7,4,-90827946.4119); +INSERT INTO num_exp_mul VALUES (7,4,-647577464846017.9715); +INSERT INTO num_exp_div VALUES (7,4,-10.64541262725136247686); +INSERT INTO num_exp_add VALUES (7,5,-83012087.961509); +INSERT INTO num_exp_sub VALUES (7,5,-83044882.038491); +INSERT INTO num_exp_mul VALUES (7,5,-1361421264394.416135); +INSERT INTO num_exp_div VALUES (7,5,-5063.62688881730941836574); +INSERT INTO num_exp_add VALUES (7,6,-82934583.42236974); +INSERT INTO num_exp_sub VALUES (7,6,-83122386.57763026); +INSERT INTO num_exp_mul VALUES (7,6,-7796505729750.37795610); +INSERT INTO num_exp_div VALUES (7,6,-884.20756174009028770294); +INSERT INTO num_exp_add VALUES (7,7,-166056970); +INSERT INTO num_exp_sub VALUES (7,7,0); +INSERT INTO num_exp_mul VALUES (7,7,6893729321395225); +INSERT INTO num_exp_div VALUES (7,7,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (7,8,-82953604); +INSERT INTO num_exp_sub VALUES (7,8,-83103366); +INSERT INTO num_exp_mul VALUES (7,8,-6217255985285); +INSERT INTO num_exp_div VALUES 
(7,8,-1108.80577182462841041118); +INSERT INTO num_exp_add VALUES (7,9,-107955289.045047420); +INSERT INTO num_exp_sub VALUES (7,9,-58101680.954952580); +INSERT INTO num_exp_mul VALUES (7,9,2069634775752159.035758700); +INSERT INTO num_exp_div VALUES (7,9,3.33089171198810413382); +INSERT INTO num_exp_add VALUES (8,0,74881); +INSERT INTO num_exp_sub VALUES (8,0,74881); +INSERT INTO num_exp_mul VALUES (8,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (8,0,double('NaN')); +INSERT INTO num_exp_add VALUES (8,1,74881); +INSERT INTO num_exp_sub VALUES (8,1,74881); +INSERT INTO num_exp_mul VALUES (8,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (8,1,double('NaN')); +INSERT INTO num_exp_add VALUES (8,2,-34263611.215397047); +INSERT INTO num_exp_sub VALUES (8,2,34413373.215397047); +INSERT INTO num_exp_mul VALUES (8,2,-2571300635581.146276407); +INSERT INTO num_exp_div VALUES (8,2,-.00218067233500788615); +INSERT INTO num_exp_add VALUES (8,3,74885.31); +INSERT INTO num_exp_sub VALUES (8,3,74876.69); +INSERT INTO num_exp_mul VALUES (8,3,322737.11); +INSERT INTO num_exp_div VALUES (8,3,17373.78190255220417633410); +INSERT INTO num_exp_add VALUES (8,4,7874342.4119); +INSERT INTO num_exp_sub VALUES (8,4,-7724580.4119); +INSERT INTO num_exp_mul VALUES (8,4,584031469984.4839); +INSERT INTO num_exp_div VALUES (8,4,.00960079113741758956); +INSERT INTO num_exp_add VALUES (8,5,91278.038491); +INSERT INTO num_exp_sub VALUES (8,5,58483.961509); +INSERT INTO num_exp_mul VALUES (8,5,1227826639.244571); +INSERT INTO num_exp_div VALUES (8,5,4.56673929509287019456); +INSERT INTO num_exp_add VALUES (8,6,168782.57763026); +INSERT INTO num_exp_sub VALUES (8,6,-19020.57763026); +INSERT INTO num_exp_mul VALUES (8,6,7031444034.53149906); +INSERT INTO num_exp_div VALUES (8,6,.79744134113322314424); +INSERT INTO num_exp_add VALUES (8,7,-82953604); +INSERT INTO num_exp_sub VALUES (8,7,83103366); +INSERT INTO 
num_exp_mul VALUES (8,7,-6217255985285); +INSERT INTO num_exp_div VALUES (8,7,-.00090187120721280172); +INSERT INTO num_exp_add VALUES (8,8,149762); +INSERT INTO num_exp_sub VALUES (8,8,0); +INSERT INTO num_exp_mul VALUES (8,8,5607164161); +INSERT INTO num_exp_div VALUES (8,8,1.00000000000000000000); +INSERT INTO num_exp_add VALUES (8,9,-24851923.045047420); +INSERT INTO num_exp_sub VALUES (8,9,25001685.045047420); +INSERT INTO num_exp_mul VALUES (8,9,-1866544013697.195857020); +INSERT INTO num_exp_div VALUES (8,9,-.00300403532938582735); +INSERT INTO num_exp_add VALUES (9,0,-24926804.045047420); +INSERT INTO num_exp_sub VALUES (9,0,-24926804.045047420); +INSERT INTO num_exp_mul VALUES (9,0,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (9,0,double('NaN')); +INSERT INTO num_exp_add VALUES (9,1,-24926804.045047420); +INSERT INTO num_exp_sub VALUES (9,1,-24926804.045047420); +INSERT INTO num_exp_mul VALUES (9,1,0); +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_div VALUES (9,1,double('NaN')); +INSERT INTO num_exp_add VALUES (9,2,-59265296.260444467); +INSERT INTO num_exp_sub VALUES (9,2,9411688.170349627); +INSERT INTO num_exp_mul VALUES (9,2,855948866655588.453741509242968740); +INSERT INTO num_exp_div VALUES (9,2,.72591434384152961526); +INSERT INTO num_exp_add VALUES (9,3,-24926799.735047420); +INSERT INTO num_exp_sub VALUES (9,3,-24926808.355047420); +INSERT INTO num_exp_mul VALUES (9,3,-107434525.43415438020); +INSERT INTO num_exp_div VALUES (9,3,-5783481.21694835730858468677); +INSERT INTO num_exp_add VALUES (9,4,-17127342.633147420); +INSERT INTO num_exp_sub VALUES (9,4,-32726265.456947420); +INSERT INTO num_exp_mul VALUES (9,4,-194415646271340.1815956522980); +INSERT INTO num_exp_div VALUES (9,4,-3.19596478892958416484); +INSERT INTO num_exp_add VALUES (9,5,-24910407.006556420); +INSERT INTO num_exp_sub VALUES (9,5,-24943201.083538420); +INSERT INTO num_exp_mul VALUES 
(9,5,-408725765384.257043660243220); +INSERT INTO num_exp_div VALUES (9,5,-1520.20159364322004505807); +INSERT INTO num_exp_add VALUES (9,6,-24832902.467417160); +INSERT INTO num_exp_sub VALUES (9,6,-25020705.622677680); +INSERT INTO num_exp_mul VALUES (9,6,-2340666225110.29929521292692920); +INSERT INTO num_exp_div VALUES (9,6,-265.45671195426965751280); +INSERT INTO num_exp_add VALUES (9,7,-107955289.045047420); +INSERT INTO num_exp_sub VALUES (9,7,58101680.954952580); +INSERT INTO num_exp_mul VALUES (9,7,2069634775752159.035758700); +INSERT INTO num_exp_div VALUES (9,7,.30021990699995814689); +INSERT INTO num_exp_add VALUES (9,8,-24851923.045047420); +INSERT INTO num_exp_sub VALUES (9,8,-25001685.045047420); +INSERT INTO num_exp_mul VALUES (9,8,-1866544013697.195857020); +INSERT INTO num_exp_div VALUES (9,8,-332.88556569820675471748); +INSERT INTO num_exp_add VALUES (9,9,-49853608.090094840); +INSERT INTO num_exp_sub VALUES (9,9,0); +INSERT INTO num_exp_mul VALUES (9,9,621345559900192.420120630048656400); +INSERT INTO num_exp_div VALUES (9,9,1.00000000000000000000); +-- COMMIT TRANSACTION; +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. 
+INSERT INTO num_exp_sqrt VALUES (0,0); +INSERT INTO num_exp_sqrt VALUES (1,0); +INSERT INTO num_exp_sqrt VALUES (2,5859.90547836712524903505); +INSERT INTO num_exp_sqrt VALUES (3,2.07605394920266944396); +INSERT INTO num_exp_sqrt VALUES (4,2792.75158435189147418923); +INSERT INTO num_exp_sqrt VALUES (5,128.05092147657509145473); +INSERT INTO num_exp_sqrt VALUES (6,306.43364311096782703406); +INSERT INTO num_exp_sqrt VALUES (7,9111.99676251039939975230); +INSERT INTO num_exp_sqrt VALUES (8,273.64392922189960397542); +INSERT INTO num_exp_sqrt VALUES (9,4992.67503899937593364766); +-- COMMIT TRANSACTION; +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_ln VALUES (0,double('NaN')); +INSERT INTO num_exp_ln VALUES (1,double('NaN')); +INSERT INTO num_exp_ln VALUES (2,17.35177750493897715514); +INSERT INTO num_exp_ln VALUES (3,1.46093790411565641971); +INSERT INTO num_exp_ln VALUES (4,15.86956523951936572464); +INSERT INTO num_exp_ln VALUES (5,9.70485601768871834038); +INSERT INTO num_exp_ln VALUES (6,11.45000246622944403127); +INSERT INTO num_exp_ln VALUES (7,18.23469429965478772991); +INSERT INTO num_exp_ln VALUES (8,11.22365546576315513668); +INSERT INTO num_exp_ln VALUES (9,17.03145425013166006962); +-- COMMIT TRANSACTION; +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. 
+-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_log10 VALUES (0,double('NaN')); +INSERT INTO num_exp_log10 VALUES (1,double('NaN')); +INSERT INTO num_exp_log10 VALUES (2,7.53578122160797276459); +INSERT INTO num_exp_log10 VALUES (3,.63447727016073160075); +INSERT INTO num_exp_log10 VALUES (4,6.89206461372691743345); +INSERT INTO num_exp_log10 VALUES (5,4.21476541614777768626); +INSERT INTO num_exp_log10 VALUES (6,4.97267288886207207671); +INSERT INTO num_exp_log10 VALUES (7,7.91922711353275546914); +INSERT INTO num_exp_log10 VALUES (8,4.87437163556421004138); +INSERT INTO num_exp_log10 VALUES (9,7.39666659961986567059); +-- COMMIT TRANSACTION; +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. +-- [SPARK-28315] Decimal can not accept NaN as input +INSERT INTO num_exp_power_10_ln VALUES (0,double('NaN')); +INSERT INTO num_exp_power_10_ln VALUES (1,double('NaN')); +INSERT INTO num_exp_power_10_ln VALUES (2,224790267919917955.13261618583642653184); +INSERT INTO num_exp_power_10_ln VALUES (3,28.90266599445155957393); +INSERT INTO num_exp_power_10_ln VALUES (4,7405685069594999.07733999469386277636); +INSERT INTO num_exp_power_10_ln VALUES (5,5068226527.32127265408584640098); +INSERT INTO num_exp_power_10_ln VALUES (6,281839893606.99372343357047819067); +-- In Spark, decimal can only support precision up to 38 +INSERT INTO num_exp_power_10_ln VALUES (7,1716699575118597095.42330819910640247627); +INSERT INTO num_exp_power_10_ln VALUES (8,167361463828.07491320069016125952); +INSERT INTO num_exp_power_10_ln VALUES (9,107511333880052007.04141124673540337457); +-- COMMIT TRANSACTION; +-- BEGIN TRANSACTION; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. 
To test all the INSERT queries below, +-- we rewrote them into the other typed literals. +INSERT INTO num_data VALUES (0, 0); +INSERT INTO num_data VALUES (1, 0); +INSERT INTO num_data VALUES (2, -34338492.215397047); +INSERT INTO num_data VALUES (3, 4.31); +INSERT INTO num_data VALUES (4, 7799461.4119); +INSERT INTO num_data VALUES (5, 16397.038491); +INSERT INTO num_data VALUES (6, 93901.57763026); +INSERT INTO num_data VALUES (7, -83028485); +INSERT INTO num_data VALUES (8, 74881); +INSERT INTO num_data VALUES (9, -24926804.045047420); +-- COMMIT TRANSACTION; + +SELECT * FROM num_data; + +-- ****************************** +-- * Create indices for faster checks +-- ****************************** + +-- CREATE UNIQUE INDEX num_exp_add_idx ON num_exp_add (id1, id2); +-- CREATE UNIQUE INDEX num_exp_sub_idx ON num_exp_sub (id1, id2); +-- CREATE UNIQUE INDEX num_exp_div_idx ON num_exp_div (id1, id2); +-- CREATE UNIQUE INDEX num_exp_mul_idx ON num_exp_mul (id1, id2); +-- CREATE UNIQUE INDEX num_exp_sqrt_idx ON num_exp_sqrt (id); +-- CREATE UNIQUE INDEX num_exp_ln_idx ON num_exp_ln (id); +-- CREATE UNIQUE INDEX num_exp_log10_idx ON num_exp_log10 (id); +-- CREATE UNIQUE INDEX num_exp_power_10_ln_idx ON num_exp_power_10_ln (id); + +-- VACUUM ANALYZE num_exp_add; +-- VACUUM ANALYZE num_exp_sub; +-- VACUUM ANALYZE num_exp_div; +-- VACUUM ANALYZE num_exp_mul; +-- VACUUM ANALYZE num_exp_sqrt; +-- VACUUM ANALYZE num_exp_ln; +-- VACUUM ANALYZE num_exp_log10; +-- VACUUM ANALYZE num_exp_power_10_ln; + +-- ****************************** +-- * Now check the behaviour of the NUMERIC type +-- ****************************** + +-- ****************************** +-- * Addition check +-- ****************************** +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, t1.val + t2.val + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_add t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != 
t2.expected; + +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val + t2.val, 10) + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 10) as expected + FROM num_result t1, num_exp_add t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 10); + +-- ****************************** +-- * Subtraction check +-- ****************************** +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, t1.val - t2.val + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_sub t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected; + +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val - t2.val, 40) + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 40) + FROM num_result t1, num_exp_sub t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 40); + +-- ****************************** +-- * Multiply check +-- ****************************** +-- [SPARK-28316] Decimal precision issue +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, t1.val, t2.val, t1.val * t2.val + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_mul t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected; + +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val * t2.val, 30) + FROM num_data t1, num_data t2; +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 30) as expected + FROM num_result t1, num_exp_mul t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 30); + +-- ****************************** +-- * Division check +-- ****************************** +-- [SPARK-28316] Decimal precision issue +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, 
t2.id, t1.val / t2.val + FROM num_data t1, num_data t2 + WHERE t2.val != '0.0'; +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_div t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected; + +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val / t2.val, 80) + FROM num_data t1, num_data t2 + WHERE t2.val != '0.0'; +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 80) as expected + FROM num_result t1, num_exp_div t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 80); + +-- ****************************** +-- * Square root check +-- ****************************** +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT id, 0, SQRT(ABS(val)) + FROM num_data; +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_sqrt t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected; + +-- ****************************** +-- * Natural logarithm check +-- ****************************** +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT id, 0, LN(ABS(val)) + FROM num_data + WHERE val != '0.0'; +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_ln t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected; + +-- ****************************** +-- * Logarithm base 10 check +-- ****************************** +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT id, 0, LOG(cast('10' as decimal(38, 18)), ABS(val)) + FROM num_data + WHERE val != '0.0'; +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_log10 t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected; + +-- ****************************** +-- * POWER(10, LN(value)) check +-- ****************************** +-- [SPARK-28316] Decimal precision issue +TRUNCATE TABLE num_result; +INSERT INTO num_result SELECT id, 0, POWER(cast('10' as decimal(38, 18)), LN(ABS(round(val,200)))) + FROM num_data + WHERE val != '0.0'; +SELECT t1.id1, t1.result, 
t2.expected + FROM num_result t1, num_exp_power_10_ln t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected; + +-- ****************************** +-- * miscellaneous checks for things that have been broken in the past... +-- ****************************** +-- numeric AVG used to fail on some platforms +SELECT AVG(val) FROM num_data; +-- [SPARK-28316] STDDEV and VARIANCE returns double type +-- Skip it because: Expected "2.779120328758835[]E7", but got "2.779120328758835[4]E7" +-- SELECT STDDEV(val) FROM num_data; +-- Skip it because: Expected "7.72350980172061[8]E14", but got "7.72350980172061[6]E14" +-- SELECT VARIANCE(val) FROM num_data; + +-- Check for appropriate rounding and overflow +CREATE TABLE fract_only (id int, val decimal(4,4)) USING parquet; +INSERT INTO fract_only VALUES (1, 0.0); +INSERT INTO fract_only VALUES (2, 0.1); +-- [SPARK-27923] PostgreSQL throws an exception but Spark SQL is NULL +-- INSERT INTO fract_only VALUES (3, '1.0'); -- should fail +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. +INSERT INTO fract_only VALUES (4, -0.9999); +INSERT INTO fract_only VALUES (5, 0.99994); +-- [SPARK-27923] PostgreSQL throws an exception but Spark SQL is NULL +-- INSERT INTO fract_only VALUES (6, '0.99995'); -- should fail +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. 
+INSERT INTO fract_only VALUES (7, 0.00001); +INSERT INTO fract_only VALUES (8, 0.00017); +SELECT * FROM fract_only; +DROP TABLE fract_only; + +-- [SPARK-28315] Decimal can not accept NaN as input +-- [SPARK-27923] Decimal type can not accept Infinity and -Infinity +-- Check inf/nan conversion behavior +SELECT decimal(double('NaN')); +SELECT decimal(double('Infinity')); +SELECT decimal(double('-Infinity')); +SELECT decimal(float('NaN')); +SELECT decimal(float('Infinity')); +SELECT decimal(float('-Infinity')); + +-- Simple check that ceil(), floor(), and round() work correctly +CREATE TABLE ceil_floor_round (a decimal(38, 18)) USING parquet; +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. +INSERT INTO ceil_floor_round VALUES (-5.5); +INSERT INTO ceil_floor_round VALUES (-5.499999); +INSERT INTO ceil_floor_round VALUES (9.5); +INSERT INTO ceil_floor_round VALUES (9.4999999); +INSERT INTO ceil_floor_round VALUES (0.0); +INSERT INTO ceil_floor_round VALUES (0.0000001); +INSERT INTO ceil_floor_round VALUES (-0.000001); +SELECT a, ceil(a), ceiling(a), floor(a), round(a) FROM ceil_floor_round; +DROP TABLE ceil_floor_round; + +-- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres +-- Check rounding, it should round ties away from zero. +-- SELECT i as pow, +-- round((-2.5 * 10 ^ i)::numeric, -i), +-- round((-1.5 * 10 ^ i)::numeric, -i), +-- round((-0.5 * 10 ^ i)::numeric, -i), +-- round((0.5 * 10 ^ i)::numeric, -i), +-- round((1.5 * 10 ^ i)::numeric, -i), +-- round((2.5 * 10 ^ i)::numeric, -i) +-- FROM generate_series(-5,5) AS t(i); + +-- [SPARK-21117] Built-in SQL Function Support - WIDTH_BUCKET +-- Testing for width_bucket(). For convenience, we test both the +-- numeric and float8 versions of the function in this file. 
+ +-- errors +-- SELECT width_bucket(5.0, 3.0, 4.0, 0); +-- SELECT width_bucket(5.0, 3.0, 4.0, -5); +-- SELECT width_bucket(3.5, 3.0, 3.0, 888); +-- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, 0); +-- SELECT width_bucket(5.0::float8, 3.0::float8, 4.0::float8, -5); +-- SELECT width_bucket(3.5::float8, 3.0::float8, 3.0::float8, 888); +-- SELECT width_bucket('NaN', 3.0, 4.0, 888); +-- SELECT width_bucket(0::float8, 'NaN', 4.0::float8, 888); + +-- normal operation +-- CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8); + +-- COPY width_bucket_test (operand_num) FROM stdin; +-- -5.2 +-- -0.0000000001 +-- 0.000000000001 +-- 1 +-- 1.99999999999999 +-- 2 +-- 2.00000000000001 +-- 3 +-- 4 +-- 4.5 +-- 5 +-- 5.5 +-- 6 +-- 7 +-- 8 +-- 9 +-- 9.99999999999999 +-- 10 +-- 10.0000000000001 +-- \. + +-- UPDATE width_bucket_test SET operand_f8 = operand_num::float8; + +-- SELECT +-- operand_num, +-- width_bucket(operand_num, 0, 10, 5) AS wb_1, +-- width_bucket(operand_f8, 0, 10, 5) AS wb_1f, +-- width_bucket(operand_num, 10, 0, 5) AS wb_2, +-- width_bucket(operand_f8, 10, 0, 5) AS wb_2f, +-- width_bucket(operand_num, 2, 8, 4) AS wb_3, +-- width_bucket(operand_f8, 2, 8, 4) AS wb_3f, +-- width_bucket(operand_num, 5.0, 5.5, 20) AS wb_4, +-- width_bucket(operand_f8, 5.0, 5.5, 20) AS wb_4f, +-- width_bucket(operand_num, -25, 25, 10) AS wb_5, +-- width_bucket(operand_f8, -25, 25, 10) AS wb_5f +-- FROM width_bucket_test; + +-- for float8 only, check positive and negative infinity: we require +-- finite bucket bounds, but allow an infinite operand +-- SELECT width_bucket(0.0::float8, 'Infinity'::float8, 5, 10); -- error +-- SELECT width_bucket(0.0::float8, 5, '-Infinity'::float8, 20); -- error +-- SELECT width_bucket('Infinity'::float8, 1, 10, 10), +-- width_bucket('-Infinity'::float8, 1, 10, 10); + +-- DROP TABLE width_bucket_test; + +-- [SPARK-28137] Missing Data Type Formatting Functions: TO_CHAR +-- TO_CHAR() +-- +-- SELECT '' AS to_char_1, to_char(val, 
'9G999G999G999G999G999') +-- FROM num_data; + +-- SELECT '' AS to_char_2, to_char(val, '9G999G999G999G999G999D999G999G999G999G999') +-- FROM num_data; + +-- SELECT '' AS to_char_3, to_char(val, '9999999999999999.999999999999999PR') +-- FROM num_data; + +-- SELECT '' AS to_char_4, to_char(val, '9999999999999999.999999999999999S') +-- FROM num_data; + +-- SELECT '' AS to_char_5, to_char(val, 'MI9999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_6, to_char(val, 'FMS9999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_7, to_char(val, 'FM9999999999999999.999999999999999THPR') FROM num_data; +-- SELECT '' AS to_char_8, to_char(val, 'SG9999999999999999.999999999999999th') FROM num_data; +-- SELECT '' AS to_char_9, to_char(val, '0999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_10, to_char(val, 'S0999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_11, to_char(val, 'FM0999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_12, to_char(val, 'FM9999999999999999.099999999999999') FROM num_data; +-- SELECT '' AS to_char_13, to_char(val, 'FM9999999999990999.990999999999999') FROM num_data; +-- SELECT '' AS to_char_14, to_char(val, 'FM0999999999999999.999909999999999') FROM num_data; +-- SELECT '' AS to_char_15, to_char(val, 'FM9999999990999999.099999999999999') FROM num_data; +-- SELECT '' AS to_char_16, to_char(val, 'L9999999999999999.099999999999999') FROM num_data; +-- SELECT '' AS to_char_17, to_char(val, 'FM9999999999999999.99999999999999') FROM num_data; +-- SELECT '' AS to_char_18, to_char(val, 'S 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9') FROM num_data; +-- SELECT '' AS to_char_19, to_char(val, 'FMS 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 . 
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9') FROM num_data; +-- SELECT '' AS to_char_20, to_char(val, E'99999 "text" 9999 "9999" 999 "\\"text between quote marks\\"" 9999') FROM num_data; +-- SELECT '' AS to_char_21, to_char(val, '999999SG9999999999') FROM num_data; +-- SELECT '' AS to_char_22, to_char(val, 'FM9999999999999999.999999999999999') FROM num_data; +-- SELECT '' AS to_char_23, to_char(val, '9.999EEEE') FROM num_data; + +-- SELECT '' AS to_char_24, to_char('100'::numeric, 'FM999.9'); +-- SELECT '' AS to_char_25, to_char('100'::numeric, 'FM999.'); +-- SELECT '' AS to_char_26, to_char('100'::numeric, 'FM999'); + +-- Check parsing of literal text in a format string +-- SELECT '' AS to_char_27, to_char('100'::numeric, 'foo999'); +-- SELECT '' AS to_char_28, to_char('100'::numeric, 'f\oo999'); +-- SELECT '' AS to_char_29, to_char('100'::numeric, 'f\\oo999'); +-- SELECT '' AS to_char_30, to_char('100'::numeric, 'f\"oo999'); +-- SELECT '' AS to_char_31, to_char('100'::numeric, 'f\\"oo999'); +-- SELECT '' AS to_char_32, to_char('100'::numeric, 'f"ool"999'); +-- SELECT '' AS to_char_33, to_char('100'::numeric, 'f"\ool"999'); +-- SELECT '' AS to_char_34, to_char('100'::numeric, 'f"\\ool"999'); +-- SELECT '' AS to_char_35, to_char('100'::numeric, 'f"ool\"999'); +-- SELECT '' AS to_char_36, to_char('100'::numeric, 'f"ool\\"999'); + +-- [SPARK-28137] Missing Data Type Formatting Functions: TO_NUMBER +-- TO_NUMBER() +-- +-- SET lc_numeric = 'C'; +-- SELECT '' AS to_number_1, to_number('-34,338,492', '99G999G999'); +-- SELECT '' AS to_number_2, to_number('-34,338,492.654,878', '99G999G999D999G999'); +-- SELECT '' AS to_number_3, to_number('<564646.654564>', '999999.999999PR'); +-- SELECT '' AS to_number_4, to_number('0.00001-', '9.999999S'); +-- SELECT '' AS to_number_5, to_number('5.01-', 'FM9.999999S'); +-- SELECT '' AS to_number_5, to_number('5.01-', 'FM9.999999MI'); +-- SELECT '' AS to_number_7, to_number('5 4 4 4 4 8 . 7 8', '9 9 9 9 9 9 . 
9 9'); +-- SELECT '' AS to_number_8, to_number('.01', 'FM9.99'); +-- SELECT '' AS to_number_9, to_number('.0', '99999999.99999999'); +-- SELECT '' AS to_number_10, to_number('0', '99.99'); +-- SELECT '' AS to_number_11, to_number('.-01', 'S99.99'); +-- SELECT '' AS to_number_12, to_number('.01-', '99.99S'); +-- SELECT '' AS to_number_13, to_number(' . 0 1-', ' 9 9 . 9 9 S'); +-- SELECT '' AS to_number_14, to_number('34,50','999,99'); +-- SELECT '' AS to_number_15, to_number('123,000','999G'); +-- SELECT '' AS to_number_16, to_number('123456','999G999'); +-- SELECT '' AS to_number_17, to_number('$1234.56','L9,999.99'); +-- SELECT '' AS to_number_18, to_number('$1234.56','L99,999.99'); +-- SELECT '' AS to_number_19, to_number('$1,234.56','L99,999.99'); +-- SELECT '' AS to_number_20, to_number('1234.56','L99,999.99'); +-- SELECT '' AS to_number_21, to_number('1,234.56','L99,999.99'); +-- SELECT '' AS to_number_22, to_number('42nd', '99th'); +-- RESET lc_numeric; + +-- +-- Input syntax +-- + +CREATE TABLE num_input_test (n1 decimal(38, 18)) USING parquet; + +-- good inputs +-- PostgreSQL implicitly casts string literals to data with decimal types, but +-- Spark does not support that kind of implicit casts. To test all the INSERT queries below, +-- we rewrote them into the other typed literals. 
+INSERT INTO num_input_test VALUES (double(trim(' 123'))); +INSERT INTO num_input_test VALUES (double(trim(' 3245874 '))); +INSERT INTO num_input_test VALUES (double(trim(' -93853'))); +INSERT INTO num_input_test VALUES (555.50); +INSERT INTO num_input_test VALUES (-555.50); +-- [SPARK-28315] Decimal can not accept NaN as input +-- INSERT INTO num_input_test VALUES (trim('NaN ')); +-- INSERT INTO num_input_test VALUES (trim(' nan')); + +-- [SPARK-27923] Spark SQL accept bad inputs to NULL +-- bad inputs +-- INSERT INTO num_input_test VALUES (' '); +-- INSERT INTO num_input_test VALUES (' 1234 %'); +-- INSERT INTO num_input_test VALUES ('xyz'); +-- INSERT INTO num_input_test VALUES ('- 1234'); +-- INSERT INTO num_input_test VALUES ('5 . 0'); +-- INSERT INTO num_input_test VALUES ('5. 0 '); +-- INSERT INTO num_input_test VALUES (''); +-- INSERT INTO num_input_test VALUES (' N aN '); + +SELECT * FROM num_input_test; + +-- [SPARK-28318] Decimal can only support precision up to 38 +-- +-- Test some corner cases for multiplication +-- + +-- select 4790999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + +-- select 4789999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + +-- select 4770999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + +-- select 4769999999999999999999999999999999999999999999999999999999999999999999999999999999999999 * 9999999999999999999999999999999999999999999999999999999999999999999999999999999999999999; + +-- +-- Test some corner cases for division +-- +-- 999999999999999999999 is overflow for SYSTEM_DEFAULT(decimal(38, 18)), we use BigIntDecimal(decimal(38, 0)). 
+select cast(999999999999999999999 as decimal(38, 0))/1000000000000000000000; + +select div(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000); +select mod(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000); +select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); +select mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); +select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000)*1000000000000000000000 + mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000); +select mod (70.0,70) ; +select div (70.0,70) ; +select 70.0 / 70 ; +select 12345678901234567890 % 123; +-- [SPARK-2659] HiveQL: Division operator should always perform fractional division +-- select 12345678901234567890 DIV 123; +-- select div(12345678901234567890, 123); +-- select div(12345678901234567890, 123) * 123 + 12345678901234567890 % 123; + +-- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres +-- +-- Test code path for raising to integer powers +-- + +-- select 10.0 ^ -2147483648 as rounds_to_zero; +-- select 10.0 ^ -2147483647 as rounds_to_zero; +-- select 10.0 ^ 2147483647 as overflows; +-- select 117743296169.0 ^ 1000000000 as overflows; + +-- cases that used to return inaccurate results +-- select 3.789 ^ 21; +-- select 3.789 ^ 35; +-- select 1.2 ^ 345; +-- select 0.12 ^ (-20); + +-- cases that used to error out +-- select 0.12 ^ (-25); +-- select 0.5678 ^ (-85); + +-- +-- Tests for raising to non-integer powers +-- + +-- special cases +-- select 0.0 ^ 0.0; +-- select (-12.34) ^ 0.0; +-- select 12.34 ^ 0.0; +-- select 0.0 ^ 12.34; + +-- NaNs +-- select 'NaN'::numeric ^ 'NaN'::numeric; +-- select 'NaN'::numeric ^ 0; +-- select 'NaN'::numeric ^ 1; +-- select 0 ^ 'NaN'::numeric; +-- select 1 ^ 'NaN'::numeric; + +-- invalid inputs +-- select 0.0 ^ (-12.34); +-- select (-12.34) ^ 1.2; + +-- cases that used to generate 
inaccurate results +-- select 32.1 ^ 9.8; +-- select 32.1 ^ (-9.8); +-- select 12.3 ^ 45.6; +-- select 12.3 ^ (-45.6); + +-- big test +-- select 1.234 ^ 5678; + +-- +-- Tests for EXP() +-- + +-- special cases +select exp(0.0); +select exp(1.0); +-- [SPARK-28316] EXP returns double type for decimal input +-- [SPARK-28318] Decimal can only support precision up to 38 +-- select exp(1.0::numeric(71,70)); + +-- cases that used to generate inaccurate results +select exp(32.999); +select exp(-32.999); +select exp(123.456); +select exp(-123.456); + +-- big test +select exp(1234.5678); + +-- +-- Tests for generate_series +-- +select * from range(cast(0.0 as decimal(38, 18)), cast(4.0 as decimal(38, 18))); +select * from range(cast(0.1 as decimal(38, 18)), cast(4.0 as decimal(38, 18)), cast(1.3 as decimal(38, 18))); +select * from range(cast(4.0 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), cast(-2.2 as decimal(38, 18))); +-- Trigger errors +-- select * from generate_series(-100::numeric, 100::numeric, 0::numeric); +-- select * from generate_series(-100::numeric, 100::numeric, 'nan'::numeric); +-- select * from generate_series('nan'::numeric, 100::numeric, 10::numeric); +-- select * from generate_series(0::numeric, 'nan'::numeric, 10::numeric); +-- [SPARK-28007] Caret operator (^) means bitwise XOR in Spark and exponentiation in Postgres +-- Checks maximum, output is truncated +-- select (i / (10::numeric ^ 131071))::numeric(1,0) +-- from generate_series(6 * (10::numeric ^ 131071), +-- 9 * (10::numeric ^ 131071), +-- 10::numeric ^ 131071) as a(i); +-- Check usage with variables +-- select * from generate_series(1::numeric, 3::numeric) i, generate_series(i,3) j; +-- select * from generate_series(1::numeric, 3::numeric) i, generate_series(1,i) j; +-- select * from generate_series(1::numeric, 3::numeric) i, generate_series(1,5,i) j; + +-- +-- Tests for LN() +-- + +-- [SPARK-27923] Invalid inputs for LN throws exception at PostgreSQL +-- Invalid inputs +-- select 
ln(-12.34); +-- select ln(0.0); + +-- Some random tests +select ln(1.2345678e-28); +select ln(0.0456789); +-- [SPARK-28318] Decimal can only support precision up to 38 +-- select ln(0.349873948359354029493948309745709580730482050975); +select ln(0.99949452); +select ln(1.00049687395); +select ln(1234.567890123456789); +select ln(5.80397490724e5); +select ln(9.342536355e34); + +-- +-- Tests for LOG() (base 10) +-- + +-- [SPARK-27923] Invalid inputs for LOG throws exception at PostgreSQL +-- invalid inputs +-- select log(-12.34); +-- select log(0.0); + +-- some random tests +-- [SPARK-28318] Decimal can only support precision up to 38 +-- select log(1.234567e-89); +-- [SPARK-28324] The LOG function using 10 as the base, but Spark using E +select log(3.4634998359873254962349856073435545); +select log(9.999999999999999999); +select log(10.00000000000000000); +select log(10.00000000000000001); +select log(590489.45235237); + +-- +-- Tests for LOG() (arbitrary base) +-- + +-- [SPARK-27923] Invalid inputs for LOG throws exception at PostgreSQL +-- invalid inputs +-- select log(-12.34, 56.78); +-- select log(-12.34, -56.78); +-- select log(12.34, -56.78); +-- select log(0.0, 12.34); +-- select log(12.34, 0.0); +-- select log(1.0, 12.34); + +-- some random tests +-- [SPARK-28318] Decimal can only support precision up to 38 +-- select log(1.23e-89, 6.4689e45); +select log(0.99923, 4.58934e34); +select log(1.000016, 8.452010e18); +-- [SPARK-28318] Decimal can only support precision up to 38 +-- select log(3.1954752e47, 9.4792021e-73); + +-- [SPARK-28317] Built-in Mathematical Functions: SCALE +-- +-- Tests for scale() +-- + +-- select scale(numeric 'NaN'); +-- select scale(NULL::numeric); +-- select scale(1.12); +-- select scale(0); +-- select scale(0.00); +-- select scale(1.12345); +-- select scale(110123.12475871856128); +-- select scale(-1123.12471856128); +-- select scale(-13.000000000000000); + +-- +-- Tests for SUM() +-- + +-- cases that need carry propagation +SELECT 
SUM(decimal(9999)) FROM range(1, 100001); +SELECT SUM(decimal(-9999)) FROM range(1, 100001); + +DROP TABLE num_data; +DROP TABLE num_exp_add; +DROP TABLE num_exp_sub; +DROP TABLE num_exp_div; +DROP TABLE num_exp_mul; +DROP TABLE num_exp_sqrt; +DROP TABLE num_exp_ln; +DROP TABLE num_exp_log10; +DROP TABLE num_exp_power_10_ln; +DROP TABLE num_result; +DROP TABLE num_input_test; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/select.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/select.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_distinct.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_distinct.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_distinct.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_distinct.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_having.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_having.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_having.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_having.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_implicit.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_implicit.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/select_implicit.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/select_implicit.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/strings.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql similarity index 92% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/strings.sql rename to 
sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql index 05841af27dd2e..541ff0bdad745 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/strings.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/strings.sql @@ -270,55 +270,54 @@ SELECT 'indio' NOT LIKE 'in__o' AS `false`; SELECT 'indio' LIKE 'in_o' AS `false`; SELECT 'indio' NOT LIKE 'in_o' AS `true`; --- [SPARK-28083] ANSI SQL: LIKE predicate: ESCAPE clause -- unused escape character --- SELECT 'hawkeye' LIKE 'h%' ESCAPE '#' AS "true"; --- SELECT 'hawkeye' NOT LIKE 'h%' ESCAPE '#' AS "false"; +SELECT 'hawkeye' LIKE 'h%' ESCAPE '#' AS `true`; +SELECT 'hawkeye' NOT LIKE 'h%' ESCAPE '#' AS `false`; --- SELECT 'indio' LIKE 'ind_o' ESCAPE '$' AS "true"; --- SELECT 'indio' NOT LIKE 'ind_o' ESCAPE '$' AS "false"; +SELECT 'indio' LIKE 'ind_o' ESCAPE '$' AS `true`; +SELECT 'indio' NOT LIKE 'ind_o' ESCAPE '$' AS `false`; -- escape character -- E061-05 like predicate with escape clause --- SELECT 'h%' LIKE 'h#%' ESCAPE '#' AS "true"; --- SELECT 'h%' NOT LIKE 'h#%' ESCAPE '#' AS "false"; +SELECT 'h%' LIKE 'h#%' ESCAPE '#' AS `true`; +SELECT 'h%' NOT LIKE 'h#%' ESCAPE '#' AS `false`; --- SELECT 'h%wkeye' LIKE 'h#%' ESCAPE '#' AS "false"; --- SELECT 'h%wkeye' NOT LIKE 'h#%' ESCAPE '#' AS "true"; +SELECT 'h%wkeye' LIKE 'h#%' ESCAPE '#' AS `false`; +SELECT 'h%wkeye' NOT LIKE 'h#%' ESCAPE '#' AS `true`; --- SELECT 'h%wkeye' LIKE 'h#%%' ESCAPE '#' AS "true"; --- SELECT 'h%wkeye' NOT LIKE 'h#%%' ESCAPE '#' AS "false"; +SELECT 'h%wkeye' LIKE 'h#%%' ESCAPE '#' AS `true`; +SELECT 'h%wkeye' NOT LIKE 'h#%%' ESCAPE '#' AS `false`; --- SELECT 'h%awkeye' LIKE 'h#%a%k%e' ESCAPE '#' AS "true"; --- SELECT 'h%awkeye' NOT LIKE 'h#%a%k%e' ESCAPE '#' AS "false"; +SELECT 'h%awkeye' LIKE 'h#%a%k%e' ESCAPE '#' AS `true`; +SELECT 'h%awkeye' NOT LIKE 'h#%a%k%e' ESCAPE '#' AS `false`; --- SELECT 'indio' LIKE '_ndio' ESCAPE '$' AS "true"; --- SELECT 'indio' NOT LIKE '_ndio' ESCAPE '$' AS "false"; +SELECT 
'indio' LIKE '_ndio' ESCAPE '$' AS `true`; +SELECT 'indio' NOT LIKE '_ndio' ESCAPE '$' AS `false`; --- SELECT 'i_dio' LIKE 'i$_d_o' ESCAPE '$' AS "true"; --- SELECT 'i_dio' NOT LIKE 'i$_d_o' ESCAPE '$' AS "false"; +SELECT 'i_dio' LIKE 'i$_d_o' ESCAPE '$' AS `true`; +SELECT 'i_dio' NOT LIKE 'i$_d_o' ESCAPE '$' AS `false`; --- SELECT 'i_dio' LIKE 'i$_nd_o' ESCAPE '$' AS "false"; --- SELECT 'i_dio' NOT LIKE 'i$_nd_o' ESCAPE '$' AS "true"; +SELECT 'i_dio' LIKE 'i$_nd_o' ESCAPE '$' AS `false`; +SELECT 'i_dio' NOT LIKE 'i$_nd_o' ESCAPE '$' AS `true`; --- SELECT 'i_dio' LIKE 'i$_d%o' ESCAPE '$' AS "true"; --- SELECT 'i_dio' NOT LIKE 'i$_d%o' ESCAPE '$' AS "false"; +SELECT 'i_dio' LIKE 'i$_d%o' ESCAPE '$' AS `true`; +SELECT 'i_dio' NOT LIKE 'i$_d%o' ESCAPE '$' AS `false`; -- escape character same as pattern character --- SELECT 'maca' LIKE 'm%aca' ESCAPE '%' AS "true"; --- SELECT 'maca' NOT LIKE 'm%aca' ESCAPE '%' AS "false"; +SELECT 'maca' LIKE 'm%aca' ESCAPE '%' AS `true`; +SELECT 'maca' NOT LIKE 'm%aca' ESCAPE '%' AS `false`; --- SELECT 'ma%a' LIKE 'm%a%%a' ESCAPE '%' AS "true"; --- SELECT 'ma%a' NOT LIKE 'm%a%%a' ESCAPE '%' AS "false"; +SELECT 'ma%a' LIKE 'm%a%%a' ESCAPE '%' AS `true`; +SELECT 'ma%a' NOT LIKE 'm%a%%a' ESCAPE '%' AS `false`; --- SELECT 'bear' LIKE 'b_ear' ESCAPE '_' AS "true"; --- SELECT 'bear' NOT LIKE 'b_ear' ESCAPE '_' AS "false"; +SELECT 'bear' LIKE 'b_ear' ESCAPE '_' AS `true`; +SELECT 'bear' NOT LIKE 'b_ear' ESCAPE '_' AS `false`; --- SELECT 'be_r' LIKE 'b_e__r' ESCAPE '_' AS "true"; --- SELECT 'be_r' NOT LIKE 'b_e__r' ESCAPE '_' AS "false"; +SELECT 'be_r' LIKE 'b_e__r' ESCAPE '_' AS `true`; +SELECT 'be_r' NOT LIKE 'b_e__r' ESCAPE '_' AS `false`; --- SELECT 'be_r' LIKE '__e__r' ESCAPE '_' AS "false"; --- SELECT 'be_r' NOT LIKE '__e__r' ESCAPE '_' AS "true"; +SELECT 'be_r' LIKE '__e__r' ESCAPE '_' AS `false`; +SELECT 'be_r' NOT LIKE '__e__r' ESCAPE '_' AS `true`; -- [SPARK-28448] Implement ILIKE operator -- diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/text.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/text.sql similarity index 96% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/text.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/text.sql index 04d3acc145e95..05953123da86f 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/text.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/text.sql @@ -44,11 +44,7 @@ select concat_ws(',',10,20,null,30); select concat_ws('',10,20,null,30); select concat_ws(NULL,10,20,null,30) is null; select reverse('abcde'); --- [SPARK-28036] Built-in udf left/right has inconsistent behavior --- [SPARK-28479] Parser error when enabling ANSI mode -set spark.sql.parser.ansi.enabled=false; select i, left('ahoj', i), right('ahoj', i) from range(-5, 6) t(i) order by i; -set spark.sql.parser.ansi.enabled=true; -- [SPARK-28037] Add built-in String Functions: quote_literal -- select quote_literal(''); -- select quote_literal('abc'''); diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/timestamp.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql similarity index 76% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/timestamp.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql index 2b974816766bd..bf69da295a960 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/timestamp.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/timestamp.sql @@ -7,7 +7,6 @@ CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet; --- [SPARK-28141] Timestamp type can not accept special values -- Test shorthand input values -- We can't just "select" the results since they aren't constants; test for -- equality instead. 
We can do that by running the test inside a transaction @@ -17,22 +16,28 @@ CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet; -- block is entered exactly at local midnight; then 'now' and 'today' have -- the same values and the counts will come out different. --- INSERT INTO TIMESTAMP_TBL VALUES ('now'); +-- PostgreSQL implicitly casts string literals to data with timestamp types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('now')); -- SELECT pg_sleep(0.1); -- BEGIN; --- INSERT INTO TIMESTAMP_TBL VALUES ('now'); --- INSERT INTO TIMESTAMP_TBL VALUES ('today'); --- INSERT INTO TIMESTAMP_TBL VALUES ('yesterday'); --- INSERT INTO TIMESTAMP_TBL VALUES ('tomorrow'); +-- PostgreSQL implicitly casts string literals to data with timestamp types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('now')); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('today')); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('yesterday')); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow')); -- time zone should be ignored by this data type --- INSERT INTO TIMESTAMP_TBL VALUES ('tomorrow EST'); --- INSERT INTO TIMESTAMP_TBL VALUES ('tomorrow zulu'); - --- SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'today'; --- SELECT count(*) AS Three FROM TIMESTAMP_TBL WHERE d1 = timestamp 'tomorrow'; --- SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'yesterday'; +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow EST')); +-- [SPARK-29024] Ignore case while resolving time zones +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow Zulu')); + +SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'today'; +SELECT count(*) AS Three FROM TIMESTAMP_TBL WHERE d1 = timestamp 'tomorrow'; +SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'yesterday'; +-- [SPARK-29025] Support seconds precision by the timestamp type -- SELECT 
count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp(2) 'now'; -- COMMIT; @@ -48,12 +53,14 @@ CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet; -- SELECT count(*) AS two FROM TIMESTAMP_TBL WHERE d1 = timestamp(2) 'now'; -- COMMIT; --- TRUNCATE TIMESTAMP_TBL; +TRUNCATE TABLE TIMESTAMP_TBL; -- Special values -- INSERT INTO TIMESTAMP_TBL VALUES ('-infinity'); -- INSERT INTO TIMESTAMP_TBL VALUES ('infinity'); --- INSERT INTO TIMESTAMP_TBL VALUES ('epoch'); +-- PostgreSQL implicitly casts string literals to data with timestamp types, but +-- Spark does not support that kind of implicit casts. +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('epoch')); -- [SPARK-27923] Spark SQL insert there obsolete special values to NULL -- Obsolete special values -- INSERT INTO TIMESTAMP_TBL VALUES ('invalid'); @@ -72,14 +79,16 @@ CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet; -- INSERT INTO TIMESTAMP_TBL VALUES ('Mon Feb 10 17:32:01.6 1997 PST'); -- ISO 8601 format -INSERT INTO TIMESTAMP_TBL VALUES ('1997-01-02'); -INSERT INTO TIMESTAMP_TBL VALUES ('1997-01-02 03:04:05'); -INSERT INTO TIMESTAMP_TBL VALUES ('1997-02-10 17:32:01-08'); +-- PostgreSQL implicitly casts string literals to data with timestamp types, but +-- Spark does not support that kind of implicit casts. 
+INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-01-02')); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-01-02 03:04:05')); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-02-10 17:32:01-08')); -- INSERT INTO TIMESTAMP_TBL VALUES ('1997-02-10 17:32:01-0800'); -- INSERT INTO TIMESTAMP_TBL VALUES ('1997-02-10 17:32:01 -08:00'); -- INSERT INTO TIMESTAMP_TBL VALUES ('19970210 173201 -0800'); -- INSERT INTO TIMESTAMP_TBL VALUES ('1997-06-10 17:32:01 -07:00'); -INSERT INTO TIMESTAMP_TBL VALUES ('2001-09-22T18:19:20'); +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('2001-09-22T18:19:20')); -- POSIX format (note that the timezone abbrev is just decoration here) -- INSERT INTO TIMESTAMP_TBL VALUES ('2000-03-15 08:14:01 GMT+8'); @@ -174,35 +183,32 @@ SELECT '' AS `16`, d1 FROM TIMESTAMP_TBL SELECT '' AS `49`, d1 FROM TIMESTAMP_TBL WHERE d1 >= timestamp '1997-01-02'; --- [SPARK-28425] Add more Date/Time Operators --- SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff --- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; +SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; SELECT '' AS date_trunc_week, date_trunc( 'week', timestamp '2004-02-29 15:44:17.71393' ) AS week_trunc; --- [SPARK-28425] Add more Date/Time Operators -- Test casting within a BETWEEN qualifier --- SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff --- FROM TIMESTAMP_TBL --- WHERE d1 BETWEEN timestamp '1902-01-01' --- AND timestamp '2038-01-01'; - --- [SPARK-28420] Date/Time Functions: date_part --- SELECT '' AS "54", d1 as "timestamp", --- date_part( 'year', d1) AS year, date_part( 'month', d1) AS month, --- date_part( 'day', d1) AS day, date_part( 'hour', d1) AS hour, --- date_part( 'minute', d1) AS minute, date_part( 'second', d1) AS second --- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; - --- SELECT '' AS "54", d1 as "timestamp", --- date_part( 'quarter', d1) AS quarter, 
date_part( 'msec', d1) AS msec, --- date_part( 'usec', d1) AS usec --- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; - --- SELECT '' AS "54", d1 as "timestamp", --- date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, --- date_part( 'dow', d1) AS dow --- FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; +SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff + FROM TIMESTAMP_TBL + WHERE d1 BETWEEN timestamp '1902-01-01' + AND timestamp '2038-01-01'; + +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'year', d1) AS `year`, date_part( 'month', d1) AS `month`, + date_part( 'day', d1) AS `day`, date_part( 'hour', d1) AS `hour`, + date_part( 'minute', d1) AS `minute`, date_part( 'second', d1) AS `second` + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; + +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'quarter', d1) AS quarter, date_part( 'msec', d1) AS msec, + date_part( 'usec', d1) AS usec + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; + +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, + date_part( 'dow', d1) AS dow + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01'; -- [SPARK-28137] Data Type Formatting Functions -- TO_CHAR() diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/union.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/union.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/union.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/union.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql new file mode 100644 index 0000000000000..087d7a5befd19 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part1.sql @@ -0,0 +1,357 @@ +-- Portions Copyright (c) 
1996-2019, PostgreSQL Global Development Group +-- +-- Window Functions Testing +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/window.sql#L1-L319 + +-- Test window operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1; + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- CREATE TABLE empsalary ( +-- depname string, +-- empno integer, +-- salary int, +-- enroll_date date +-- ) USING parquet; + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- INSERT INTO empsalary VALUES ('develop', 10, 5200, '2007-08-01'); +-- INSERT INTO empsalary VALUES ('sales', 1, 5000, '2006-10-01'); +-- INSERT INTO empsalary VALUES ('personnel', 5, 3500, '2007-12-10'); +-- INSERT INTO empsalary VALUES ('sales', 4, 4800, '2007-08-08'); +-- INSERT INTO empsalary VALUES ('personnel', 2, 3900, '2006-12-23'); +-- INSERT INTO empsalary VALUES ('develop', 7, 4200, '2008-01-01'); +-- INSERT INTO empsalary VALUES ('develop', 9, 4500, '2008-01-01'); +-- INSERT INTO empsalary VALUES ('sales', 3, 4800, '2007-08-01'); +-- INSERT INTO empsalary VALUES ('develop', 8, 6000, '2006-10-01'); +-- INSERT INTO empsalary VALUES ('develop', 11, 5200, '2007-08-15'); + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- SELECT depname, empno, salary, sum(salary) OVER (PARTITION BY depname) FROM empsalary ORDER BY depname, salary; + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- SELECT depname, empno, salary, rank() OVER (PARTITION BY depname ORDER BY salary) FROM empsalary; + +-- with GROUP BY +SELECT four, ten, SUM(SUM(four)) OVER (PARTITION BY four), AVG(ten) FROM tenk1 +GROUP BY four, ten ORDER BY four, ten; + +-- [SPARK-29540] Thrift in 
some cases can't parse string to date +-- SELECT depname, empno, salary, sum(salary) OVER w FROM empsalary WINDOW w AS (PARTITION BY depname); + +-- [SPARK-28064] Order by does not accept a call to rank() +-- SELECT depname, empno, salary, rank() OVER w FROM empsalary WINDOW w AS (PARTITION BY depname ORDER BY salary) ORDER BY rank() OVER w; + +-- empty window specification +SELECT COUNT(*) OVER () FROM tenk1 WHERE unique2 < 10; + +SELECT COUNT(*) OVER w FROM tenk1 WHERE unique2 < 10 WINDOW w AS (); + +-- no window operation +SELECT four FROM tenk1 WHERE FALSE WINDOW w AS (PARTITION BY ten); + +-- cumulative aggregate +SELECT sum(four) OVER (PARTITION BY ten ORDER BY unique2) AS sum_1, ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT row_number() OVER (ORDER BY unique2) FROM tenk1 WHERE unique2 < 10; + +SELECT rank() OVER (PARTITION BY four ORDER BY ten) AS rank_1, ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT dense_rank() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT percent_rank() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT cume_dist() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT ntile(3) OVER (ORDER BY ten, four), ten, four FROM tenk1 WHERE unique2 < 10; + +-- [SPARK-28065] ntile does not accept NULL as input +-- SELECT ntile(NULL) OVER (ORDER BY ten, four), ten, four FROM tenk1 LIMIT 2; + +SELECT lag(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +-- [SPARK-28068] `lag` second argument must be a literal in Spark +-- SELECT lag(ten, four) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +-- [SPARK-28068] `lag` second argument must be a literal in Spark +-- SELECT lag(ten, four, 0) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT lead(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 
< 10; + +SELECT lead(ten * 2, 1) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT lead(ten * 2, 1, -1) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT first(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +-- last returns the last row of the frame, which is CURRENT ROW in ORDER BY window. +SELECT last(four) OVER (ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10; + +SELECT last(ten) OVER (PARTITION BY four), ten, four FROM +(SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten)s +ORDER BY four, ten; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- SELECT nth_value(ten, four + 1) OVER (PARTITION BY four), ten, four +-- FROM (SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten)s; + +SELECT ten, two, sum(hundred) AS gsum, sum(sum(hundred)) OVER (PARTITION BY two ORDER BY ten) AS wsum +FROM tenk1 GROUP BY ten, two; + +SELECT count(*) OVER (PARTITION BY four), four FROM (SELECT * FROM tenk1 WHERE two = 1)s WHERE unique2 < 10; + +SELECT (count(*) OVER (PARTITION BY four ORDER BY ten) + + sum(hundred) OVER (PARTITION BY four ORDER BY ten)) AS cntsum + FROM tenk1 WHERE unique2 < 10; + +-- opexpr with different windows evaluation. 
+SELECT * FROM( + SELECT count(*) OVER (PARTITION BY four ORDER BY ten) + + sum(hundred) OVER (PARTITION BY two ORDER BY ten) AS total, + count(*) OVER (PARTITION BY four ORDER BY ten) AS fourcount, + sum(hundred) OVER (PARTITION BY two ORDER BY ten) AS twosum + FROM tenk1 +)sub WHERE total <> fourcount + twosum; + +SELECT avg(four) OVER (PARTITION BY four ORDER BY thousand / 100) FROM tenk1 WHERE unique2 < 10; + +SELECT ten, two, sum(hundred) AS gsum, sum(sum(hundred)) OVER win AS wsum +FROM tenk1 GROUP BY ten, two WINDOW win AS (PARTITION BY two ORDER BY ten); + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- more than one window with GROUP BY +-- SELECT sum(salary), +-- row_number() OVER (ORDER BY depname), +-- sum(sum(salary)) OVER (ORDER BY depname DESC) +-- FROM empsalary GROUP BY depname; + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- identical windows with different names +-- SELECT sum(salary) OVER w1, count(*) OVER w2 +-- FROM empsalary WINDOW w1 AS (ORDER BY salary), w2 AS (ORDER BY salary); + +-- subplan +-- [SPARK-28379] Correlated scalar subqueries must be aggregated +-- SELECT lead(ten, (SELECT two FROM tenk1 WHERE s.unique2 = unique2)) OVER (PARTITION BY four ORDER BY ten) +-- FROM tenk1 s WHERE unique2 < 10; + +-- empty table +SELECT count(*) OVER (PARTITION BY four) FROM (SELECT * FROM tenk1 WHERE FALSE)s; + +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- mixture of agg/wfunc in the same window +-- SELECT sum(salary) OVER w, rank() OVER w FROM empsalary WINDOW w AS (PARTITION BY depname ORDER BY salary DESC); + +-- Cannot safely cast 'enroll_date': StringType to DateType; +-- SELECT empno, depname, salary, bonus, depadj, MIN(bonus) OVER (ORDER BY empno), MAX(depadj) OVER () FROM( +-- SELECT *, +-- CASE WHEN enroll_date < '2008-01-01' THEN 2008 - extract(year FROM enroll_date) END * 500 AS bonus, +-- CASE WHEN +-- AVG(salary) OVER (PARTITION BY depname) < salary +-- THEN 200 END 
AS depadj FROM empsalary +-- )s; + +create temporary view int4_tbl as select * from values + (0), + (123456), + (-123456), + (2147483647), + (-2147483647) + as int4_tbl(f1); + +-- window function over ungrouped agg over empty row set (bug before 9.1) +SELECT SUM(COUNT(f1)) OVER () FROM int4_tbl WHERE f1=42; + +-- window function with ORDER BY an expression involving aggregates (9.1 bug) +select ten, + sum(unique1) + sum(unique2) as res, + rank() over (order by sum(unique1) + sum(unique2)) as rank +from tenk1 +group by ten order by ten; + +-- window and aggregate with GROUP BY expression (9.2 bug) +-- explain +-- select first(max(x)) over (), y +-- from (select unique1 as x, ten+four as y from tenk1) ss +-- group by y; + +-- test non-default frame specifications +SELECT four, ten, +sum(ten) over (partition by four order by ten), +last(ten) over (partition by four order by ten) +FROM (select distinct ten, four from tenk1) ss; + +SELECT four, ten, +sum(ten) over (partition by four order by ten range between unbounded preceding and current row), +last(ten) over (partition by four order by ten range between unbounded preceding and current row) +FROM (select distinct ten, four from tenk1) ss; + +SELECT four, ten, +sum(ten) over (partition by four order by ten range between unbounded preceding and unbounded following), +last(ten) over (partition by four order by ten range between unbounded preceding and unbounded following) +FROM (select distinct ten, four from tenk1) ss; + +-- [SPARK-29451] Some queries with divisions in SQL windows are failling in Thrift +-- SELECT four, ten/4 as two, +-- sum(ten/4) over (partition by four order by ten/4 range between unbounded preceding and current row), +-- last(ten/4) over (partition by four order by ten/4 range between unbounded preceding and current row) +-- FROM (select distinct ten, four from tenk1) ss; + +-- [SPARK-29451] Some queries with divisions in SQL windows are failling in Thrift +-- SELECT four, ten/4 as two, +-- 
sum(ten/4) over (partition by four order by ten/4 rows between unbounded preceding and current row), +-- last(ten/4) over (partition by four order by ten/4 rows between unbounded preceding and current row) +-- FROM (select distinct ten, four from tenk1) ss; + +SELECT sum(unique1) over (order by four range between current row and unbounded following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (rows between current row and unbounded following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (rows between 2 preceding and 2 following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (rows between 2 preceding and 2 following exclude no others), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (rows between 2 preceding and 2 following exclude current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (rows between 2 preceding and 2 following exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (rows between 2 preceding and 2 following exclude ties), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT first(unique1) over (ORDER BY four rows between current row and 2 following exclude current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT first(unique1) over (ORDER BY four rows between current row and 2 following exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT first(unique1) over (ORDER BY four rows between 
current row and 2 following exclude ties), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT last(unique1) over (ORDER BY four rows between current row and 2 following exclude current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT last(unique1) over (ORDER BY four rows between current row and 2 following exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT last(unique1) over (ORDER BY four rows between current row and 2 following exclude ties), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (rows between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (rows between 1 following and 3 following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (rows between unbounded preceding and 1 following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (w range between current row and unbounded following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10 WINDOW w AS (order by four); + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (w range between unbounded preceding and current row exclude current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10 WINDOW w AS (order by four); + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (w range between unbounded preceding and current row exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10 WINDOW w AS (order by four); + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (w range between unbounded preceding and current row exclude ties), +-- unique1, four +-- FROM tenk1 
WHERE unique1 < 10 WINDOW w AS (order by four); + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- SELECT first_value(unique1) over w, +-- nth_value(unique1, 2) over w AS nth_2, +-- last_value(unique1) over w, unique1, four +-- FROM tenk1 WHERE unique1 < 10 +-- WINDOW w AS (order by four range between current row and unbounded following); + +-- [SPARK-28501] Frame bound value must be a literal. +-- SELECT sum(unique1) over +-- (order by unique1 +-- rows (SELECT unique1 FROM tenk1 ORDER BY unique1 LIMIT 1) + 1 PRECEDING), +-- unique1 +-- FROM tenk1 WHERE unique1 < 10; + +CREATE TEMP VIEW v_window AS +SELECT i.id, sum(i.id) over (order by i.id rows between 1 preceding and 1 following) as sum_rows +FROM range(1, 11) i; + +SELECT * FROM v_window; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- CREATE OR REPLACE TEMP VIEW v_window AS +-- SELECT i, sum(i) over (order by i rows between 1 preceding and 1 following +-- exclude current row) as sum_rows FROM range(1, 10) i; + +-- SELECT * FROM v_window; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- CREATE OR REPLACE TEMP VIEW v_window AS +-- SELECT i, sum(i) over (order by i rows between 1 preceding and 1 following +-- exclude group) as sum_rows FROM range(1, 10) i; +-- SELECT * FROM v_window; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- CREATE OR REPLACE TEMP VIEW v_window AS +-- SELECT i, sum(i) over (order by i rows between 1 preceding and 1 following +-- exclude ties) as sum_rows FROM generate_series(1, 10) i; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- CREATE OR REPLACE TEMP VIEW v_window AS +-- SELECT i, sum(i) over (order by i rows between 1 preceding and 1 following +-- exclude no others) as sum_rows FROM generate_series(1, 10) i; +-- SELECT * FROM v_window; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- CREATE OR REPLACE TEMP VIEW v_window AS +-- SELECT i.id, sum(i.id) over (order by i.id groups between 1 preceding and 1 
following) as sum_rows FROM range(1, 11) i; +-- SELECT * FROM v_window; + +DROP VIEW v_window; +-- [SPARK-29540] Thrift in some cases can't parse string to date +-- DROP TABLE empsalary; +DROP VIEW tenk2; +DROP VIEW int4_tbl; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql new file mode 100644 index 0000000000000..ba1acc9f56b4a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part2.sql @@ -0,0 +1,303 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- Window Functions Testing +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/window.sql#L320-562 + +-- Test window operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +CREATE TABLE empsalary ( + depname string, + empno integer, + salary int, + enroll_date date +) USING parquet; + +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + ('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date '2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15'); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- CREATE TEMP VIEW v_window AS +-- SELECT i, min(i) over (order by i range between '1 day' preceding and '10 days' following) as min_i +-- FROM range(now(), now()+'100 days', '1 hour') i; + +-- RANGE offset PRECEDING/FOLLOWING tests + +SELECT sum(unique1) over (order by four range 
between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (order by four desc range between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding exclude no others), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding exclude current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding exclude ties), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 6 following exclude ties), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 6 following exclude group), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +SELECT sum(unique1) over (partition by four order by unique1 range between 5 preceding and 6 following), +unique1, four +FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (partition by four order by unique1 range between 5 preceding and 6 following +-- exclude current row),unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- 
select sum(salary) over (order by enroll_date range between '1 year' preceding and '1 year' following), +-- salary, enroll_date from empsalary; + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select sum(salary) over (order by enroll_date desc range between '1 year' preceding and '1 year' following), +-- salary, enroll_date from empsalary; + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select sum(salary) over (order by enroll_date desc range between '1 year' following and '1 year' following), +-- salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (order by enroll_date range between '1 year' preceding and '1 year' following +-- exclude current row), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (order by enroll_date range between '1 year' preceding and '1 year' following +-- exclude group), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (order by enroll_date range between '1 year' preceding and '1 year' following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- select first_value(salary) over(order by salary range between 1000 preceding and 1000 following), +-- lead(salary) over(order by salary range between 1000 preceding and 1000 following), +-- nth_value(salary, 1) over(order by salary range between 1000 preceding and 1000 following), +-- salary from empsalary; + +-- [SPARK-30734] AnalysisException that window RangeFrame not match RowFrame +-- select last(salary) over(order by salary range between 1000 preceding and 1000 following), +-- lag(salary) over(order by salary range between 1000 preceding and 1000 following), +-- salary from empsalary; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function 
+-- select first_value(salary) over(order by salary range between 1000 following and 3000 following +-- exclude current row), +-- lead(salary) over(order by salary range between 1000 following and 3000 following exclude ties), +-- nth_value(salary, 1) over(order by salary range between 1000 following and 3000 following +-- exclude ties), +-- salary from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select last(salary) over(order by salary range between 1000 following and 3000 following +-- exclude group), +-- lag(salary) over(order by salary range between 1000 following and 3000 following exclude group), +-- salary from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select first(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude ties), +-- last(salary) over(order by enroll_date range between unbounded preceding and '1 year' following), +-- salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select first(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude ties), +-- last(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude ties), +-- salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select first(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude group), +-- last(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude group), +-- salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select first(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude current row), +-- last(salary) over(order by enroll_date range between unbounded preceding and '1 year' following +-- exclude current 
row), +-- salary, enroll_date from empsalary; + +-- RANGE offset PRECEDING/FOLLOWING with null values +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id asc nulls first range between 2 preceding and 2 following); + +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id asc nulls last range between 2 preceding and 2 following); + +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id desc nulls first range between 2 preceding and 2 following); + +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id desc nulls last range between 2 preceding and 2 following); + +-- Check overflow behavior for various integer sizes + +select x.id, last(x.id) over (order by x.id range between current row and 2147450884 following) +from range(32764, 32767) x; + +select x.id, last(x.id) over (order by x.id desc range between current row and 2147450885 following) +from range(-32766, -32765) x; + +select x.id, last(x.id) over (order by x.id range between current row and 4 following) +from range(2147483644, 2147483647) x; + +select x.id, last(x.id) over (order by x.id desc range between current row and 5 following) +from range(-2147483646, -2147483645) x; + +select x.id, last(x.id) over (order by x.id range between current row and 4 following) +from range(9223372036854775804, 9223372036854775807) x; + +select x.id, last(x.id) over (order by x.id desc range between current row and 5 
following) +from range(-9223372036854775806, -9223372036854775805) x; + +-- Test in_range for other numeric datatypes + +create table numerics ( + id int, + f_float4 float, + f_float8 float, + f_numeric int +) using parquet; + +insert into numerics values +(1, -3, -3, -3), +(2, -1, -1, -1), +(3, 0, 0, 0), +(4, 1.1, 1.1, 1.1), +(5, 1.12, 1.12, 1.12), +(6, 2, 2, 2), +(7, 100, 100, 100); +-- (8, 'infinity', 'infinity', '1000'), +-- (9, 'NaN', 'NaN', 'NaN'), +-- (0, '-infinity', '-infinity', '-1000'); -- numeric type lacks infinities + +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1 preceding and 1 following); + +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1 preceding and 1.1 following); + +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 'inf' preceding and 'inf' following); + +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1.1 preceding and 'NaN' following); -- error, NaN disallowed + +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1 preceding and 1 following); + +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1 preceding and 1.1 following); + +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 'inf' preceding and 'inf' following); + +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1.1 preceding and 'NaN' following); -- error, NaN disallowed + +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1 following); + +select id, 
f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1.1 following); + +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1.1 following); -- currently unsupported + +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1.1 preceding and 'NaN' following); -- error, NaN disallowed + +drop table empsalary; +drop table numerics; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql new file mode 100644 index 0000000000000..cd3b74b3aa03f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part3.sql @@ -0,0 +1,456 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- Window Functions Testing +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/window.sql#L564-L911 + +-- Test window operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1; + +CREATE TABLE empsalary ( + depname string, + empno integer, + salary int, + enroll_date date +) USING parquet; + +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + ('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date '2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15'); + +-- Test in_range for other datetime datatypes + +-- Spark only supports timestamp +-- [SPARK-29636] Spark can't parse '11:00 BST' or '2000-10-19 10:23:54+01' signatures to timestamp +create table datetimes ( + id int, + f_time timestamp, + f_timetz timestamp, + f_interval timestamp, + f_timestamptz timestamp, + f_timestamp timestamp +) using parquet; + +-- Spark cannot safely cast StringType to TimestampType +-- [SPARK-29636] Spark can't parse '11:00 BST' or '2000-10-19 10:23:54+01' signatures to timestamp +insert into datetimes values +(1, timestamp '11:00', cast ('11:00 BST' as timestamp), cast ('1 year' as timestamp), cast ('2000-10-19 10:23:54+01' as timestamp), timestamp '2000-10-19 10:23:54'), +(2, timestamp '12:00', cast ('12:00 BST' as timestamp), cast ('2 years' as timestamp), cast ('2001-10-19 10:23:54+01' as timestamp), timestamp '2001-10-19 10:23:54'), +(3, timestamp '13:00', cast ('13:00 BST' as timestamp), cast ('3 years' as timestamp), cast ('2001-10-19 10:23:54+01' as timestamp), timestamp '2001-10-19 10:23:54'), +(4, timestamp '14:00', cast ('14:00 BST' as timestamp), cast ('4 years' as timestamp), 
cast ('2002-10-19 10:23:54+01' as timestamp), timestamp '2002-10-19 10:23:54'), +(5, timestamp '15:00', cast ('15:00 BST' as timestamp), cast ('5 years' as timestamp), cast ('2003-10-19 10:23:54+01' as timestamp), timestamp '2003-10-19 10:23:54'), +(6, timestamp '15:00', cast ('15:00 BST' as timestamp), cast ('5 years' as timestamp), cast ('2004-10-19 10:23:54+01' as timestamp), timestamp '2004-10-19 10:23:54'), +(7, timestamp '17:00', cast ('17:00 BST' as timestamp), cast ('7 years' as timestamp), cast ('2005-10-19 10:23:54+01' as timestamp), timestamp '2005-10-19 10:23:54'), +(8, timestamp '18:00', cast ('18:00 BST' as timestamp), cast ('8 years' as timestamp), cast ('2006-10-19 10:23:54+01' as timestamp), timestamp '2006-10-19 10:23:54'), +(9, timestamp '19:00', cast ('19:00 BST' as timestamp), cast ('9 years' as timestamp), cast ('2007-10-19 10:23:54+01' as timestamp), timestamp '2007-10-19 10:23:54'), +(10, timestamp '20:00', cast ('20:00 BST' as timestamp), cast ('10 years' as timestamp), cast ('2008-10-19 10:23:54+01' as timestamp), timestamp '2008-10-19 10:23:54'); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_time, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_time range between +-- '70 min' preceding and '2 hours' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_time, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_time desc range between +-- '70 min' preceding and '2 hours' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_timetz, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timetz range between +-- '70 min' preceding and '2 hours' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, 
f_timetz, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timetz desc range between +-- '70 min' preceding and '2 hours' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_interval, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_interval range between +-- '1 year' preceding and '1 year' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_interval, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_interval desc range between +-- '1 year' preceding and '1 year' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_timestamptz, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timestamptz range between +-- '1 year' preceding and '1 year' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_timestamptz, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timestamptz desc range between +-- '1 year' preceding and '1 year' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_timestamp, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timestamp range between +-- '1 year' preceding and '1 year' following); + +-- [SPARK-28429] SQL Datetime util function being casted to double instead of timestamp +-- select id, f_timestamp, first(id) over w, last(id) over w +-- from datetimes +-- window w as (order by f_timestamp desc range between +-- '1 year' preceding and '1 year' following); + +-- RANGE offset PRECEDING/FOLLOWING error cases +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (order by enroll_date, salary range between '1 
year' preceding and '2 years' following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (range between '1 year' preceding and '2 years' following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select sum(salary) over (order by depname range between '1 year' preceding and '2 years' following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select max(enroll_date) over (order by enroll_date range between 1 preceding and 2 following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select max(enroll_date) over (order by salary range between -1 preceding and 2 following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select max(enroll_date) over (order by salary range between 1 preceding and -2 following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select max(enroll_date) over (order by salary range between '1 year' preceding and '2 years' following +-- exclude ties), salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select max(enroll_date) over (order by enroll_date range between '1 year' preceding and '-2 years' following +-- exclude ties), salary, enroll_date from empsalary; + +-- GROUPS tests + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between unbounded preceding and current row), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between unbounded preceding and unbounded following), 
+-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between current row and unbounded following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between 1 preceding and unbounded following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between 1 following and unbounded following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between unbounded preceding and 2 following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between 2 preceding and 1 preceding), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between 2 preceding and 1 following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (order by four groups between 0 preceding and 0 following), +-- unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four groups between 2 preceding and 1 following +-- exclude current row), unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 following +-- exclude group), unique1, four +-- FROM 
tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- SELECT sum(unique1) over (order by four range between 2 preceding and 1 following +-- exclude ties), unique1, four +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (partition by ten +-- order by four groups between 0 preceding and 0 following),unique1, four, ten +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (partition by ten +-- order by four groups between 0 preceding and 0 following exclude current row), unique1, four, ten +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (partition by ten +-- order by four groups between 0 preceding and 0 following exclude group), unique1, four, ten +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- SELECT sum(unique1) over (partition by ten +-- order by four groups between 0 preceding and 0 following exclude ties), unique1, four, ten +-- FROM tenk1 WHERE unique1 < 10; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- select first_value(salary) over(order by enroll_date groups between 1 preceding and 1 following), +-- lead(salary) over(order by enroll_date groups between 1 preceding and 1 following), +-- nth_value(salary, 1) over(order by enroll_date groups between 1 preceding and 1 following), +-- salary, enroll_date from empsalary; + +-- [SPARK-28508] Support for range frame+row frame in the same query +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- select 
last(salary) over(order by enroll_date groups between 1 preceding and 1 following), +-- lag(salary) over(order by enroll_date groups between 1 preceding and 1 following), +-- salary, enroll_date from empsalary; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- select first_value(salary) over(order by enroll_date groups between 1 following and 3 following +-- exclude current row), +-- lead(salary) over(order by enroll_date groups between 1 following and 3 following exclude ties), +-- nth_value(salary, 1) over(order by enroll_date groups between 1 following and 3 following +-- exclude ties), +-- salary, enroll_date from empsalary; + +-- [SPARK-28428] Spark `exclude` always expecting `()` +-- select last(salary) over(order by enroll_date groups between 1 following and 3 following +-- exclude group), +-- lag(salary) over(order by enroll_date groups between 1 following and 3 following exclude group), +-- salary, enroll_date from empsalary; + +-- Show differences in offset interpretation between ROWS, RANGE, and GROUPS +WITH cte (x) AS ( + SELECT * FROM range(1, 36, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x rows between 1 preceding and 1 following); + +WITH cte (x) AS ( + SELECT * FROM range(1, 36, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x range between 1 preceding and 1 following); + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- WITH cte (x) AS ( +-- SELECT * FROM range(1, 36, 2) +-- ) +-- SELECT x, (sum(x) over w) +-- FROM cte +-- WINDOW w AS (ORDER BY x groups between 1 preceding and 1 following); + +WITH cte (x) AS ( + select 1 union all select 1 union all select 1 union all + SELECT * FROM range(5, 50, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x rows between 1 preceding and 1 following); + +WITH cte (x) AS ( + select 1 union all select 1 union all select 1 union all + SELECT * FROM range(5, 50, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY 
x range between 1 preceding and 1 following); + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- WITH cte (x) AS ( +-- select 1 union all select 1 union all select 1 union all +-- SELECT * FROM range(5, 50, 2) +-- ) +-- SELECT x, (sum(x) over w) +-- FROM cte +-- WINDOW w AS (ORDER BY x groups between 1 preceding and 1 following); + +-- with UNION +SELECT count(*) OVER (PARTITION BY four) FROM (SELECT * FROM tenk1 UNION ALL SELECT * FROM tenk2)s LIMIT 0; + +-- check some degenerate cases +create table t1 (f1 int, f2 int) using parquet; +insert into t1 values (1,1),(1,2),(2,2); + +select f1, sum(f1) over (partition by f1 + range between 1 preceding and 1 following) +from t1 where f1 = f2; -- error, must have order by + +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- explain +-- select f1, sum(f1) over (partition by f1 order by f2 +-- range between 1 preceding and 1 following) +-- from t1 where f1 = f2; + +select f1, sum(f1) over (partition by f1 order by f2 +range between 1 preceding and 1 following) +from t1 where f1 = f2; + +select f1, sum(f1) over (partition by f1, f1 order by f2 +range between 2 preceding and 1 preceding) +from t1 where f1 = f2; + +select f1, sum(f1) over (partition by f1, f2 order by f2 +range between 1 following and 2 following) +from t1 where f1 = f2; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- select f1, sum(f1) over (partition by f1, +-- groups between 1 preceding and 1 following) +-- from t1 where f1 = f2; + +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- explain +-- select f1, sum(f1) over (partition by f1 order by f2 +-- range between 1 preceding and 1 following) +-- from t1 where f1 = f2; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- select f1, sum(f1) over (partition by f1 order by f2 +-- groups between 1 preceding and 1 following) +-- from t1 where f1 = f2; + +-- [SPARK-28648] Adds 
support to `groups` unit type in window clauses +-- select f1, sum(f1) over (partition by f1, f1 order by f2 +-- groups between 2 preceding and 1 preceding) +-- from t1 where f1 = f2; + +-- [SPARK-28648] Adds support to `groups` unit type in window clauses +-- select f1, sum(f1) over (partition by f1, f2 order by f2 +-- groups between 1 following and 2 following) +-- from t1 where f1 = f2; + +-- ordering by a non-integer constant is allowed +SELECT rank() OVER (ORDER BY length('abc')); + +-- can't order by another window function +-- [SPARK-28566] window functions should not be allowed in window definitions +-- SELECT rank() OVER (ORDER BY rank() OVER (ORDER BY random())); + +-- some other errors +SELECT * FROM empsalary WHERE row_number() OVER (ORDER BY salary) < 10; + +SELECT * FROM empsalary INNER JOIN tenk1 ON row_number() OVER (ORDER BY salary) < 10; + +SELECT rank() OVER (ORDER BY 1), count(*) FROM empsalary GROUP BY 1; + +SELECT * FROM rank() OVER (ORDER BY random()); + +-- Original query: DELETE FROM empsalary WHERE (rank() OVER (ORDER BY random())) > 10; +SELECT * FROM empsalary WHERE (rank() OVER (ORDER BY random())) > 10; + +-- Original query: DELETE FROM empsalary RETURNING rank() OVER (ORDER BY random()); +SELECT * FROM empsalary WHERE rank() OVER (ORDER BY random()); + +-- [SPARK-28645] Throw an error on window redefinition +-- select count(*) OVER w FROM tenk1 WINDOW w AS (ORDER BY unique1), w AS (ORDER BY unique1); + +select rank() OVER (PARTITION BY four, ORDER BY ten) FROM tenk1; + +-- [SPARK-28646] Allow usage of `count` only for parameterless aggregate function +-- select count() OVER () FROM tenk1; + +-- The output is the expected one: `range` is not a window or aggregate function. 
+SELECT range(1, 100) OVER () FROM empsalary; + +SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM tenk1; + +-- [SPARK-27951] ANSI SQL: NTH_VALUE function +-- SELECT nth_value(four, 0) OVER (ORDER BY ten), ten, four FROM tenk1; + +-- filter + +-- [SPARK-30182] Support nested aggregates +-- SELECT sum(salary), row_number() OVER (ORDER BY depname), sum( +-- sum(salary) FILTER (WHERE enroll_date > '2007-01-01') +-- ) +-- FROM empsalary GROUP BY depname; + +-- Test pushdown of quals into a subquery containing window functions + +-- pushdown is safe because all PARTITION BY clauses include depname: +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- EXPLAIN +-- SELECT * FROM +-- (SELECT depname, +-- sum(salary) OVER (PARTITION BY depname) depsalary, +-- min(salary) OVER (PARTITION BY depname || 'A', depname) depminsalary +-- FROM empsalary) emp +-- WHERE depname = 'sales'; + +-- pushdown is unsafe because there's a PARTITION BY clause without depname: +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- EXPLAIN +-- SELECT * FROM +-- (SELECT depname, +-- sum(salary) OVER (PARTITION BY enroll_date) enroll_salary, +-- min(salary) OVER (PARTITION BY depname) depminsalary +-- FROM empsalary) emp +-- WHERE depname = 'sales'; + +-- Test Sort node collapsing +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- EXPLAIN +-- SELECT * FROM +-- (SELECT depname, +-- sum(salary) OVER (PARTITION BY depname order by empno) depsalary, +-- min(salary) OVER (PARTITION BY depname, empno order by enroll_date) depminsalary +-- FROM empsalary) emp +-- WHERE depname = 'sales'; + +-- Test Sort node reordering +-- Since EXPLAIN clause rely on host physical location, it is commented out +-- EXPLAIN +-- SELECT +-- lead(1) OVER (PARTITION BY depname ORDER BY salary, enroll_date), +-- lag(1) OVER (PARTITION BY depname ORDER BY salary,enroll_date,empno) +-- FROM empsalary; + +-- cleanup +DROP TABLE empsalary; +DROP 
TABLE datetimes; +DROP TABLE t1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part4.sql new file mode 100644 index 0000000000000..64ba8e3b7a5ad --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/window_part4.sql @@ -0,0 +1,404 @@ +-- Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group +-- +-- Window Functions Testing +-- https://github.com/postgres/postgres/blob/REL_12_STABLE/src/test/regress/sql/window.sql#L913-L1278 + +-- Test window operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + +-- Spark doesn't handle UDFs in SQL +-- test user-defined window function with named args and default args +-- CREATE FUNCTION nth_value_def(val anyelement, n integer = 1) RETURNS anyelement +-- LANGUAGE internal WINDOW IMMUTABLE STRICT AS 'window_nth_value'; + +-- Spark doesn't handle UDFs in SQL +-- SELECT nth_value_def(n := 2, val := ten) OVER (PARTITION BY four), ten, four +-- FROM (SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten) s; + +-- Spark doesn't handle UDFs in SQL +-- SELECT nth_value_def(ten) OVER (PARTITION BY four), ten, four +-- FROM (SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten) s; + +-- +-- Test the basic moving-aggregate machinery +-- + +-- create aggregates that record the series of transform calls (these are +-- intentionally not true inverses) + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_sfunc_nonstrict(text, anyelement) RETURNS text AS +-- $$ SELECT COALESCE($1, '') || '*' || quote_nullable($2) $$ +-- LANGUAGE SQL IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_msfunc_nonstrict(text, anyelement) RETURNS text AS +-- $$ SELECT 
COALESCE($1, '') || '+' || quote_nullable($2) $$ +-- LANGUAGE SQL IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_minvfunc_nonstrict(text, anyelement) RETURNS text AS +-- $$ SELECT $1 || '-' || quote_nullable($2) $$ +-- LANGUAGE SQL IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE AGGREGATE logging_agg_nonstrict (anyelement) +-- ( +-- stype = text, +-- sfunc = logging_sfunc_nonstrict, +-- mstype = text, +-- msfunc = logging_msfunc_nonstrict, +-- minvfunc = logging_minvfunc_nonstrict +-- ); + +-- Spark doesn't handle UDFs in SQL +-- CREATE AGGREGATE logging_agg_nonstrict_initcond (anyelement) +-- ( +-- stype = text, +-- sfunc = logging_sfunc_nonstrict, +-- mstype = text, +-- msfunc = logging_msfunc_nonstrict, +-- minvfunc = logging_minvfunc_nonstrict, +-- initcond = 'I', +-- minitcond = 'MI' +-- ); + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_sfunc_strict(text, anyelement) RETURNS text AS +-- $$ SELECT $1 || '*' || quote_nullable($2) $$ +-- LANGUAGE SQL STRICT IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_msfunc_strict(text, anyelement) RETURNS text AS +-- $$ SELECT $1 || '+' || quote_nullable($2) $$ +-- LANGUAGE SQL STRICT IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION logging_minvfunc_strict(text, anyelement) RETURNS text AS +-- $$ SELECT $1 || '-' || quote_nullable($2) $$ +-- LANGUAGE SQL STRICT IMMUTABLE; + +-- Spark doesn't handle UDFs in SQL +-- CREATE AGGREGATE logging_agg_strict (text) +-- ( +-- stype = text, +-- sfunc = logging_sfunc_strict, +-- mstype = text, +-- msfunc = logging_msfunc_strict, +-- minvfunc = logging_minvfunc_strict +-- ); + +-- Spark doesn't handle UDFs in SQL +-- CREATE AGGREGATE logging_agg_strict_initcond (anyelement) +-- ( +-- stype = text, +-- sfunc = logging_sfunc_strict, +-- mstype = text, +-- msfunc = logging_msfunc_strict, +-- minvfunc = logging_minvfunc_strict, +-- initcond = 'I', +-- minitcond = 'MI' +-- ); + 
+-- Spark doesn't handle UDFs in SQL +-- test strict and non-strict cases +-- SELECT +-- p::text || ',' || i::text || ':' || COALESCE(v::text, 'NULL') AS row, +-- logging_agg_nonstrict(v) over wnd as nstrict, +-- logging_agg_nonstrict_initcond(v) over wnd as nstrict_init, +-- logging_agg_strict(v::text) over wnd as strict, +-- logging_agg_strict_initcond(v) over wnd as strict_init +-- FROM (VALUES +-- (1, 1, NULL), +-- (1, 2, 'a'), +-- (1, 3, 'b'), +-- (1, 4, NULL), +-- (1, 5, NULL), +-- (1, 6, 'c'), +-- (2, 1, NULL), +-- (2, 2, 'x'), +-- (3, 1, 'z') +-- ) AS t(p, i, v) +-- WINDOW wnd AS (PARTITION BY P ORDER BY i ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +-- ORDER BY p, i; + +-- Spark doesn't handle UDFs in SQL +-- and again, but with filter +-- SELECT +-- p::text || ',' || i::text || ':' || +-- CASE WHEN f THEN COALESCE(v::text, 'NULL') ELSE '-' END as row, +-- logging_agg_nonstrict(v) filter(where f) over wnd as nstrict_filt, +-- logging_agg_nonstrict_initcond(v) filter(where f) over wnd as nstrict_init_filt, +-- logging_agg_strict(v::text) filter(where f) over wnd as strict_filt, +-- logging_agg_strict_initcond(v) filter(where f) over wnd as strict_init_filt +-- FROM (VALUES +-- (1, 1, true, NULL), +-- (1, 2, false, 'a'), +-- (1, 3, true, 'b'), +-- (1, 4, false, NULL), +-- (1, 5, false, NULL), +-- (1, 6, false, 'c'), +-- (2, 1, false, NULL), +-- (2, 2, true, 'x'), +-- (3, 1, true, 'z') +-- ) AS t(p, i, f, v) +-- WINDOW wnd AS (PARTITION BY p ORDER BY i ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +-- ORDER BY p, i; + +-- Spark doesn't handle UDFs in SQL +-- test that volatile arguments disable moving-aggregate mode +-- SELECT +-- i::text || ':' || COALESCE(v::text, 'NULL') as row, +-- logging_agg_strict(v::text) +-- over wnd as inverse, +-- logging_agg_strict(v::text || CASE WHEN random() < 0 then '?' 
ELSE '' END) +-- over wnd as noinverse +-- FROM (VALUES +-- (1, 'a'), +-- (2, 'b'), +-- (3, 'c') +-- ) AS t(i, v) +-- WINDOW wnd AS (ORDER BY i ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +-- ORDER BY i; + +-- Spark doesn't handle UDFs in SQL +-- SELECT +-- i::text || ':' || COALESCE(v::text, 'NULL') as row, +-- logging_agg_strict(v::text) filter(where true) +-- over wnd as inverse, +-- logging_agg_strict(v::text) filter(where random() >= 0) +-- over wnd as noinverse +-- FROM (VALUES +-- (1, 'a'), +-- (2, 'b'), +-- (3, 'c') +-- ) AS t(i, v) +-- WINDOW wnd AS (ORDER BY i ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +-- ORDER BY i; + +-- Spark doesn't handle UDFs in SQL +-- test that non-overlapping windows don't use inverse transitions +-- SELECT +-- logging_agg_strict(v::text) OVER wnd +-- FROM (VALUES +-- (1, 'a'), +-- (2, 'b'), +-- (3, 'c') +-- ) AS t(i, v) +-- WINDOW wnd AS (ORDER BY i ROWS BETWEEN CURRENT ROW AND CURRENT ROW) +-- ORDER BY i; + +-- Spark doesn't handle UDFs in SQL +-- test that returning NULL from the inverse transition functions +-- restarts the aggregation from scratch. The second aggregate is supposed +-- to test cases where only some aggregates restart, the third one checks +-- that one aggregate restarting doesn't cause others to restart. 
+ +-- Spark doesn't handle UDFs in SQL +-- CREATE FUNCTION sum_int_randrestart_minvfunc(int4, int4) RETURNS int4 AS +-- $$ SELECT CASE WHEN random() < 0.2 THEN NULL ELSE $1 - $2 END $$ +-- LANGUAGE SQL STRICT; + +-- Spark doesn't handle UDFs in SQL +-- CREATE AGGREGATE sum_int_randomrestart (int4) +-- ( +-- stype = int4, +-- sfunc = int4pl, +-- mstype = int4, +-- msfunc = int4pl, +-- minvfunc = sum_int_randrestart_minvfunc +-- ); + +-- Spark doesn't handle UDFs in SQL +-- WITH +-- vs AS ( +-- SELECT i, (random() * 100)::int4 AS v +-- FROM generate_series(1, 100) AS i +-- ), +-- sum_following AS ( +-- SELECT i, SUM(v) OVER +-- (ORDER BY i DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS s +-- FROM vs +-- ) +-- SELECT DISTINCT +-- sum_following.s = sum_int_randomrestart(v) OVER fwd AS eq1, +-- -sum_following.s = sum_int_randomrestart(-v) OVER fwd AS eq2, +-- 100*3+(vs.i-1)*3 = length(logging_agg_nonstrict(''::text) OVER fwd) AS eq3 +-- FROM vs +-- JOIN sum_following ON sum_following.i = vs.i +-- WINDOW fwd AS ( +-- ORDER BY vs.i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING +-- ); + +-- +-- Test various built-in aggregates that have moving-aggregate support +-- + +-- test inverse transition functions handle NULLs properly +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.5),(2,2.5),(3,NULL),(4,NULL)) t(i,v); + +-- [SPARK-28602] Spark does not recognize 'interval' type as 'numeric' +-- SELECT i,AVG(v::interval) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +-- FROM 
(VALUES(1,'1 sec'),(2,'2 sec'),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +-- The cast syntax is present in PgSQL for legacy reasons and Spark will not recognize a money field +-- SELECT i,SUM(v::money) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +-- FROM (VALUES(1,'1.10'),(2,'2.20'),(3,NULL),(4,NULL)) t(i,v); + +-- [SPARK-28602] Spark does not recognize 'interval' type as 'numeric' +-- SELECT i,SUM(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +-- FROM (VALUES(1,'1 sec'),(2,'2 sec'),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.1),(2,2.2),(3,NULL),(4,NULL)) t(i,v); + +SELECT SUM(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.01),(2,2),(3,3)) v(i,n); + +SELECT i,COUNT(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,COUNT(*) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT 
VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM 
(VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +-- For the following queries Spark result differs from PgSQL: +-- Spark handles division by zero as 'NaN' instead of 'NULL', which is the PgSQL behaviour +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n); + +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n); + +-- test that inverse transition functions work with various frame options +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND CURRENT ROW) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v); + +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,3),(4,4)) t(i,v); + +-- [SPARK-29638] Spark 
handles 'NaN' as 0 in sums +-- ensure aggregate over numeric properly recovers from NaN values +SELECT a, b, + SUM(b) OVER(ORDER BY A ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +FROM (VALUES(1,1),(2,2),(3,(cast('nan' as int))),(4,3),(5,4)) t(a,b); + +-- It might be tempting for someone to add an inverse trans function for +-- float and double precision. This should not be done as it can give incorrect +-- results. This test should fail if anyone ever does this without thinking too +-- hard about it. +-- [SPARK-28516] adds `to_char` +-- SELECT to_char(SUM(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING),'999999999999999999999D9') +-- FROM (VALUES(1,1e20),(2,1)) n(i,n); + +-- [SPARK-27880] Implement boolean aggregates(BOOL_AND, BOOL_OR and EVERY) +-- SELECT i, b, bool_and(b) OVER w, bool_or(b) OVER w +-- FROM (VALUES (1,true), (2,true), (3,false), (4,false), (5,true)) v(i,b) +-- WINDOW w AS (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING); + +-- Tests for problems with failure to walk or mutate expressions +-- within window frame clauses. 
+ +-- [SPARK-27974] Add built-in Aggregate Function: array_agg +-- test walker (fails with collation error if expressions are not walked) +-- SELECT array_agg(i) OVER w +-- FROM range(1,6) i +-- WINDOW w AS (ORDER BY i ROWS BETWEEN (('foo' < 'foobar')::integer) PRECEDING AND CURRENT ROW); + +-- Spark doesn't handle UDFs in SQL +-- test mutator (fails when inlined if expressions are not mutated) +-- CREATE FUNCTION pg_temp.f(group_size BIGINT) RETURNS SETOF integer[] +-- AS $$ +-- SELECT array_agg(s) OVER w +-- FROM generate_series(1,5) s +-- WINDOW w AS (ORDER BY s ROWS BETWEEN CURRENT ROW AND GROUP_SIZE FOLLOWING) +-- $$ LANGUAGE SQL STABLE; diff --git a/sql/core/src/test/resources/sql-tests/inputs/pgSQL/with.sql b/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql similarity index 100% rename from sql/core/src/test/resources/sql-tests/inputs/pgSQL/with.sql rename to sql/core/src/test/resources/sql-tests/inputs/postgreSQL/with.sql diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql new file mode 100644 index 0000000000000..c0827a3cba39b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -0,0 +1,9 @@ +-- regexp_extract +SELECT regexp_extract('1a 2b 14m', '\\d+'); +SELECT regexp_extract('1a 2b 14m', '\\d+', 0); +SELECT regexp_extract('1a 2b 14m', '\\d+', 1); +SELECT regexp_extract('1a 2b 14m', '\\d+', 2); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)'); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2); diff --git a/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql b/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql index 852bfbd63847d..dc77f87d9743a 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql +++ 
b/sql/core/src/test/resources/sql-tests/inputs/show-create-table.sql @@ -59,3 +59,46 @@ TBLPROPERTIES ('a' = '1'); SHOW CREATE TABLE tbl; DROP TABLE tbl; + +-- float alias real and decimal alias numeric +CREATE TABLE tbl (a REAL, b NUMERIC, c NUMERIC(10), d NUMERIC(10,1)) USING parquet; +SHOW CREATE TABLE tbl; +DROP TABLE tbl; + + +-- show create table for view +CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet; + +-- simple +CREATE VIEW view_SPARK_30302 (aaa, bbb) +AS SELECT a, b FROM tbl; + +SHOW CREATE TABLE view_SPARK_30302 AS SERDE; +DROP VIEW view_SPARK_30302; + + +-- comment +CREATE VIEW view_SPARK_30302 (aaa COMMENT 'comment with \'quoted text\' for aaa', bbb) +COMMENT 'This is a comment with \'quoted text\' for view' +AS SELECT a, b FROM tbl; + +SHOW CREATE TABLE view_SPARK_30302 AS SERDE; +DROP VIEW view_SPARK_30302; + + +-- tblproperties +CREATE VIEW view_SPARK_30302 (aaa, bbb) +TBLPROPERTIES ('a' = '1', 'b' = '2') +AS SELECT a, b FROM tbl; + +SHOW CREATE TABLE view_SPARK_30302 AS SERDE; +DROP VIEW view_SPARK_30302; + +-- SHOW CREATE TABLE does not support view +CREATE VIEW view_SPARK_30302 (aaa, bbb) +AS SELECT a, b FROM tbl; + +SHOW CREATE TABLE view_SPARK_30302; +DROP VIEW view_SPARK_30302; + +DROP TABLE tbl; diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql index b5f458f2cb184..ae6a9641aae66 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-aggregate.sql @@ -1,5 +1,10 @@ -- Tests aggregate expressions in outer query and EXISTS subquery. +-- Test aggregate operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-joins-and-set-ops.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-joins-and-set-ops.sql index cefc3fe6272ab..667573b30d265 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-joins-and-set-ops.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-joins-and-set-ops.sql @@ -1,9 +1,17 @@ -- Tests EXISTS subquery support. Tests Exists subquery -- used in Joins (Both when joins occurs in outer and suquery blocks) --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +-- There are 2 dimensions we want to test +-- 1. run with broadcast hash join, sort merge join or shuffle hash join. +-- 2. run with whole-stage-codegen, operator codegen or no codegen. 
+ +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760 +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +--CONFIG_DIM2 spark.sql.codegen.wholeStage=true +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-orderby-limit.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-orderby-limit.sql index 19fc18833760c..580fc1d4162eb 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-orderby-limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/exists-subquery/exists-orderby-limit.sql @@ -1,5 +1,10 @@ -- Tests EXISTS subquery support with ORDER BY and LIMIT clauses. +-- Test sort operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql index b1d96b32c2478..496285e3514ea 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-group-by.sql @@ -1,50 +1,55 @@ -- A test suite for GROUP BY in parent side, subquery, and both predicate subquery -- It includes correlated cases. +-- Test aggregate operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + create temporary view t1 as select * from values - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("t1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("t1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("t1e", 10S, null, 25L, float(17.0), 25D, 
26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("t1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("t1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - ("t2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, 
timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("t1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("t2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("t1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("t1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("t2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("t1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("t2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date 
'2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ("t3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("t3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("t3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("t3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("t1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("t3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("t3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("t3a", 6S, 12, 10L, float(15), 20D, 20E2BD, 
timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("t3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("t3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("t1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- correlated IN subquery diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-joins.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-joins.sql index 22f3eafd6a02d..200a71ebbb622 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-joins.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-joins.sql @@ -1,9 +1,17 @@ -- A test suite for IN JOINS in parent side, subquery, and both predicate subquery -- It includes correlated cases. 
--- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +-- There are 2 dimensions we want to test +-- 1. run with broadcast hash join, sort merge join or shuffle hash join. +-- 2. run with whole-stage-codegen, operator codegen or no codegen. + +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760 +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true +--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false + +--CONFIG_DIM2 spark.sql.codegen.wholeStage=true +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), @@ -51,6 +59,18 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); +create temporary view s1 as select * from values + (1), (3), (5), (7), (9) + as s1(id); + +create temporary view s2 as select * from values + (1), (3), (4), (6), (9) + as s2(id); + +create temporary view s3 as select * from values + (3), (4), (6), (9) + as s3(id); + -- correlated IN subquery -- different JOIN in parent side -- TC 01.01 @@ -83,7 +103,7 @@ GROUP BY t1a, t3a, t3b, t3c -ORDER BY t1a DESC, t3b DESC; +ORDER BY t1a DESC, t3b DESC, t3c ASC; -- TC 01.03 SELECT Count(DISTINCT(t1a)) @@ -272,3 +292,101 @@ Group By t1a, t1b, t1c, t2a, t2b, t2c HAVING t2c IS NOT NULL ORDER By t2b DESC nulls last; + +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id 
+AND s1.id IN (SELECT 9); + + +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id NOT IN (SELECT 9); + + +-- IN with Subquery ON INNER JOIN +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- IN with Subquery ON LEFT SEMI JOIN +SELECT s1.id AS id2 FROM s1 +LEFT SEMI JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- IN with Subquery ON LEFT ANTI JOIN +SELECT s1.id as id2 FROM s1 +LEFT ANTI JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- IN with Subquery ON LEFT OUTER JOIN +SELECT s1.id, s2.id as id2 FROM s1 +LEFT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- IN with Subquery ON RIGHT OUTER JOIN +SELECT s1.id, s2.id as id2 FROM s1 +RIGHT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- IN with Subquery ON FULL OUTER JOIN +SELECT s1.id, s2.id AS id2 FROM s1 +FULL OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON INNER JOIN +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON LEFT SEMI JOIN +SELECT s1.id AS id2 FROM s1 +LEFT SEMI JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON LEFT ANTI JOIN +SELECT s1.id AS id2 FROM s1 +LEFT ANTI JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON LEFT OUTER JOIN +SELECT s1.id, s2.id AS id2 FROM s1 +LEFT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON RIGHT OUTER JOIN +SELECT s1.id, s2.id AS id2 FROM s1 +RIGHT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +-- NOT IN with Subquery ON FULL OUTER JOIN +SELECT s1.id, s2.id AS id2 FROM s1 +FULL OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3); + + +DROP VIEW s1; + +DROP VIEW s2; + +DROP VIEW s3; diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql index a40ee082ba3b9..481b5e8cc7700 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-limit.sql @@ -2,49 +2,49 @@ -- It includes correlated cases. create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 
20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 
25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 
20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, 
timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- correlated IN subquery diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-order-by.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-order-by.sql index 892e39ff47c1f..001c49c460b06 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-order-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-order-by.sql @@ -1,50 +1,55 @@ -- A test suite for ORDER BY in parent side, subquery, and both predicate subquery -- It includes correlated cases. +-- Test sort operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 
19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 
19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, 
float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 
26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- correlated IN subquery diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql index 5c371d2305ac8..b81dd7dce7ff5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/in-set-operations.sql @@ -2,49 +2,49 @@ -- It includes correlated cases. create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', 
date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp 
'2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp 
'2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', 
date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- correlated IN subquery diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-group-by.sql index 58cf109e136c5..54b74534c1162 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-group-by.sql @@ -1,6 +1,11 @@ -- A test suite for NOT IN GROUP BY in parent side, subquery, and both predicate subquery -- It includes correlated cases. +-- Test aggregate operator with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-joins.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-joins.sql index 4f8ca8bfb27c1..fcdb667ad4523 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-joins.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/not-in-joins.sql @@ -1,9 +1,5 @@ -- A test suite for not-in-joins in parent side, subquery, and both predicate subquery -- It includes correlated cases. 
--- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), @@ -128,7 +124,7 @@ GROUP BY t1b, HAVING t1d NOT IN (SELECT t2d FROM t2 WHERE t1d = t2d) -ORDER BY t1b DESC; +ORDER BY t1b DESC, t1d ASC; -- TC 01.05 SELECT COUNT(DISTINCT(t1a)), diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql index f19567d2fac20..2748a959cbef8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/in-subquery/simple-in.sql @@ -2,49 +2,49 @@ -- It includes correlated cases. 
create temporary view t1 as select * from values - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("t1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("t1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("t1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("t1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', 
null), + ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - ("t2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("t1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("t2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("t1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("t1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("t2a", 6S, 12, 14L, 
float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("t1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("t2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ("t3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("t3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp 
'2014-07-04 01:02:00.000', date '2014-07-04'), - ("t3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("t3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("t1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("t3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("t3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("t3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("t3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("t3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("t1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- correlated IN subquery diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql index 95b115a8dd094..98ce1354a1355 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/negative-cases/subq-input-typecheck.sql @@ -18,7 +18,7 @@ CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES AS t1(t4a, t4b, t4c); CREATE TEMPORARY VIEW t5 AS SELECT * FROM VALUES - (CAST(1 AS DECIMAL(18, 0)), CAST(2 AS STRING), CAST(3 AS BIGINT)) + (CAST('2011-01-01 01:01:01' AS TIMESTAMP), CAST(2 AS STRING), CAST(3 AS BIGINT)) AS t1(t5a, t5b, t5c); -- TC 01.01 diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql index 1661209093fc4..17e44a96492b8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/scalar-subquery/scalar-subquery-predicate.sql @@ -22,49 +22,49 @@ AND c.cv = (SELECT max(avg) GROUP BY c1.cv)); create temporary view t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, 
float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), + ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ('val1d', null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ('val1d', null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i); create temporary view t2 as select * from values - 
('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ('val2a', 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1b', 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ('val1c', 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ('val1b', null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ('val2e', 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ('val1f', 19S, 
null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ('val1b', 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ('val1c', 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ('val1e', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ('val1f', 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i); create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 
25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ('val3a', 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ('val3a', 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ('val1b', 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ('val3c', 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ('val3c', 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ('val1b', null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ('val3b', 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val3b', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i); -- Group 1: scalar subquery in predicate context diff --git a/sql/core/src/test/resources/sql-tests/inputs/udaf.sql b/sql/core/src/test/resources/sql-tests/inputs/udaf.sql index 58613a1325dfa..0374d98feb6e6 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udaf.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udaf.sql @@ -1,3 +1,8 @@ +-- Test aggregate operator and UDAF with codegen on and off. 
+--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1), (2), (3), (4) as t1(int_col1); diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql similarity index 98% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql index d829a5c1159fd..24bc25a3fd1c5 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part1.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part1.sql @@ -8,7 +8,7 @@ -- avoid bit-exact output here because operations may not be bit-exact. -- SET extra_float_digits = 0; --- This test file was converted from pgSQL/aggregates_part1.sql. +-- This test file was converted from postgreSQL/aggregates_part1.sql. 
SELECT avg(udf(four)) AS avg_1 FROM onek; diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part2.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part2.sql similarity index 75% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part2.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part2.sql index 5636537398a86..b4054850062b7 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part2.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part2.sql @@ -5,7 +5,7 @@ -- AGGREGATES [Part 2] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L145-L350 -- --- This test file was converted from pgSQL/aggregates_part2.sql. +-- This test file was converted from postgreSQL/aggregates_part2.sql. create temporary view int4_tbl as select * from values (0), @@ -43,42 +43,28 @@ create temporary view int4_tbl as select * from values -- -- test for bitwise integer aggregates -- --- CREATE TEMPORARY TABLE bitwise_test( --- i2 INT2, --- i4 INT4, --- i8 INT8, --- i INTEGER, --- x INT2, --- y BIT(4) --- ); +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (3, 3, 3, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4); -- empty case --- SELECT --- BIT_AND(i2) AS "?", --- BIT_OR(i4) AS "?" --- FROM bitwise_test; - --- COPY bitwise_test FROM STDIN NULL 'null'; --- 1 1 1 1 1 B0101 --- 3 3 3 null 2 B0100 --- 7 7 7 3 4 B1100 --- \. 
- --- SELECT --- BIT_AND(i2) AS "1", --- BIT_AND(i4) AS "1", --- BIT_AND(i8) AS "1", --- BIT_AND(i) AS "?", --- BIT_AND(x) AS "0", --- BIT_AND(y) AS "0100", --- --- BIT_OR(i2) AS "7", --- BIT_OR(i4) AS "7", --- BIT_OR(i8) AS "7", --- BIT_OR(i) AS "?", --- BIT_OR(x) AS "7", --- BIT_OR(y) AS "1101" --- FROM bitwise_test; +SELECT BIT_AND(b1) AS n1, BIT_OR(b2) AS n2 FROM bitwise_test where 1 = 0; + +-- null case +SELECT BIT_AND(b4) AS n1, BIT_OR(b4) AS n2 FROM bitwise_test where b4 is null; + + +SELECT + BIT_AND(cast(b1 as tinyint)) AS a1, + BIT_AND(cast(b2 as smallint)) AS b1, + BIT_AND(b3) AS c1, + BIT_AND(b4) AS d1, + BIT_OR(cast(b1 as tinyint)) AS e7, + BIT_OR(cast(b2 as smallint)) AS f7, + BIT_OR(b3) AS g7, + BIT_OR(b4) AS h3 +FROM bitwise_test; -- -- test boolean aggregates @@ -116,50 +102,40 @@ SELECT NOT (FALSE OR FALSE) AS `t`; -- [SPARK-27880] Implement boolean aggregates(BOOL_AND, BOOL_OR and EVERY) --- CREATE TEMPORARY TABLE bool_test( --- b1 BOOL, --- b2 BOOL, --- b3 BOOL, --- b4 BOOL); +CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES + (TRUE, null, FALSE, null), + (FALSE, TRUE, null, null), + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4); -- empty case --- SELECT --- BOOL_AND(b1) AS "n", --- BOOL_OR(b3) AS "n" --- FROM bool_test; - --- COPY bool_test FROM STDIN NULL 'null'; --- TRUE null FALSE null --- FALSE TRUE null null --- null TRUE FALSE null --- \. 
- --- SELECT --- BOOL_AND(b1) AS "f", --- BOOL_AND(b2) AS "t", --- BOOL_AND(b3) AS "f", --- BOOL_AND(b4) AS "n", --- BOOL_AND(NOT b2) AS "f", --- BOOL_AND(NOT b3) AS "t" --- FROM bool_test; - --- SELECT --- EVERY(b1) AS "f", --- EVERY(b2) AS "t", --- EVERY(b3) AS "f", --- EVERY(b4) AS "n", --- EVERY(NOT b2) AS "f", --- EVERY(NOT b3) AS "t" --- FROM bool_test; - --- SELECT --- BOOL_OR(b1) AS "t", --- BOOL_OR(b2) AS "t", --- BOOL_OR(b3) AS "f", --- BOOL_OR(b4) AS "n", --- BOOL_OR(NOT b2) AS "f", --- BOOL_OR(NOT b3) AS "t" --- FROM bool_test; +SELECT BOOL_AND(b1) AS n1, BOOL_OR(b3) AS n2 FROM bool_test WHERE 1 = 0; + +SELECT + BOOL_AND(b1) AS f1, + BOOL_AND(b2) AS t2, + BOOL_AND(b3) AS f3, + BOOL_AND(b4) AS n4, + BOOL_AND(NOT b2) AS f5, + BOOL_AND(NOT b3) AS t6 +FROM bool_test; + +SELECT + EVERY(b1) AS f1, + EVERY(b2) AS t2, + EVERY(b3) AS f3, + EVERY(b4) AS n4, + EVERY(NOT b2) AS f5, + EVERY(NOT b3) AS t6 +FROM bool_test; + +SELECT + BOOL_OR(b1) AS t1, + BOOL_OR(b2) AS t2, + BOOL_OR(b3) AS f3, + BOOL_OR(b4) AS n4, + BOOL_OR(NOT b2) AS f5, + BOOL_OR(NOT b3) AS t6 +FROM bool_test; -- -- Test cases that should be optimized into indexscans instead of diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part3.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part3.sql similarity index 98% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part3.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part3.sql index 1c58620d1c11a..b11c8c05f3103 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part3.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part3.sql @@ -5,7 +5,7 @@ -- AGGREGATES [Part 3] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L352-L605 --- This test file was converted from pgSQL/aggregates_part3.sql. 
+-- This test file was converted from postgreSQL/aggregates_part3.sql. -- [SPARK-28865] Table inheritance -- try it on an inheritance tree @@ -229,7 +229,6 @@ select udf(max(min(unique1))) from tenk1; -- drop table bytea_test_table; --- [SPARK-27986] Support Aggregate Expressions with filter -- FILTER tests -- select min(unique1) filter (where unique1 > 100) from tenk1; diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql similarity index 99% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql index 7c7777362de8e..8aea00073eee8 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-aggregates_part4.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-aggregates_part4.sql @@ -5,7 +5,7 @@ -- AGGREGATES [Part 4] -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/aggregates.sql#L607-L997 --- This test file was converted from pgSQL/aggregates_part4.sql. +-- This test file was converted from postgreSQL/aggregates_part4.sql. 
-- [SPARK-27980] Ordered-Set Aggregate Functions -- ordered-set aggregates diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-case.sql similarity index 99% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-case.sql index 1865ee94ec1f9..8fa3c0a6dfec9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-case.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-case.sql @@ -6,7 +6,7 @@ -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/case.sql -- Test the CASE statement -- --- This test file was converted from pgSQL/case.sql. +-- This test file was converted from postgreSQL/case.sql. CREATE TABLE CASE_TBL ( i integer, diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql similarity index 99% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-join.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql index c05aa156a13bf..e6fe1078b0d24 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-join.sql @@ -6,7 +6,7 @@ -- Test JOIN clauses -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/join.sql -- --- This test file was converted from pgSQL/join.sql. +-- This test file was converted from postgreSQL/join.sql. 
CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_having.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_having.sql similarity index 96% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_having.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_having.sql index c8e4346cedb89..412d45b49a184 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_having.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_having.sql @@ -5,7 +5,7 @@ -- SELECT_HAVING -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/select_having.sql -- --- This test file was converted from inputs/pgSQL/select_having.sql +-- This test file was converted from inputs/postgreSQL/select_having.sql -- TODO: We should add UDFs in GROUP BY clause when [SPARK-28386] and [SPARK-26741] is resolved. 
-- load test data diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_implicit.sql similarity index 98% rename from sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql rename to sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_implicit.sql index 373896ccd1674..1cbd77c6cf86d 100755 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/pgSQL/udf-select_implicit.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/postgreSQL/udf-select_implicit.sql @@ -9,7 +9,7 @@ -- - thomas 1998-07-09 -- https://github.com/postgres/postgres/blob/REL_12_BETA2/src/test/regress/sql/select_implicit.sql -- --- This test file was converted from pgSQL/select_implicit.sql +-- This test file was converted from postgreSQL/select_implicit.sql -- load test data CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet; diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-join-empty-relation.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-join-empty-relation.sql index 47fb70d02394b..b46206d4530ed 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-join-empty-relation.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-join-empty-relation.sql @@ -1,8 +1,3 @@ --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false - -- This test file was converted from join-empty-relation.sql. 
CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a); diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-natural-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-natural-join.sql index e5eb812d69a1c..7cf080ea1b4eb 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-natural-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-natural-join.sql @@ -1,8 +1,3 @@ --- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false - -- This test file was converted from natural-join.sql. create temporary view nt1 as select * from values diff --git a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql index 4eb0805c9cc67..4b09bcb988d25 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/udf/udf-outer-join.sql @@ -1,8 +1,4 @@ -- This test file was converted from outer-join.sql. 
--- List of configuration the test suite is run against: ---SET spark.sql.autoBroadcastJoinThreshold=10485760 ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true ---SET spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false -- SPARK-17099: Incorrect result when HAVING clause is added to group by query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES diff --git a/sql/core/src/test/resources/sql-tests/inputs/window.sql b/sql/core/src/test/resources/sql-tests/inputs/window.sql index faab4c61c8640..3d05dfda6c3fa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/window.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/window.sql @@ -1,3 +1,8 @@ +-- Test window operator with codegen on and off. +--CONFIG_DIM1 spark.sql.codegen.wholeStage=true +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY +--CONFIG_DIM1 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=NO_CODEGEN + -- Test data. 
CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (null, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), "a"), @@ -115,3 +120,8 @@ SELECT cate, sum(val) OVER (w) FROM testData WHERE val is not null WINDOW w AS (PARTITION BY cate ORDER BY val); + +-- with filter predicate +SELECT val, cate, +count(val) FILTER (WHERE val > 1) OVER(PARTITION BY cate) +FROM testData ORDER BY cate, val; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/decimalArithmeticOperations.sql.out new file mode 100644 index 0000000000000..ce53e1c2863e0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/decimalArithmeticOperations.sql.out @@ -0,0 +1,138 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 16 + + +-- !query +create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), + (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789) +-- !query schema +struct<> +-- !query output + + + +-- !query +select id, a*10, b/10 from decimals_test order by id +-- !query schema +struct +-- !query output +1 1000.000000000000000 99.900000000000000000 +2 123451.230000000000000 1234.512300000000000000 +3 1.234567891011000 123.410000000000000000 +4 1234567891234567890.000000000000000 0.112345678912345679 + + +-- !query +select 10.3 * 3.0 +-- !query schema +struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> +-- !query output +30.90 + + +-- !query +select 10.3000 * 3.0 +-- !query schema +struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> +-- !query output +30.90000 + + +-- !query +select 10.30000 * 30.0 +-- !query schema +struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS 
DECIMAL(7,5))):decimal(11,6)> +-- !query output +309.000000 + + +-- !query +select 10.300000000000000000 * 3.000000000000000000 +-- !query schema +struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> +-- !query output +30.9000000000000000000000000000000000 + + +-- !query +select 10.300000000000000000 * 3.0000000000000000000 +-- !query schema +struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> +-- !query output +30.9000000000000000000000000000000000 + + +-- !query +select (5e36BD + 0.1) + 5e36BD +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Decimal(expanded,10000000000000000000000000000000000000.1,39,1}) cannot be represented as Decimal(38, 1). + + +-- !query +select (-4e36BD - 0.1) - 7e36BD +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Decimal(expanded,-11000000000000000000000000000000000000.1,39,1}) cannot be represented as Decimal(38, 1). + + +-- !query +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Decimal(expanded,152415787532388367501905199875019052100,39,0}) cannot be represented as Decimal(38, 2). + + +-- !query +select 1e35BD / 0.1 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Decimal(expanded,1000000000000000000000000000000000000,37,0}) cannot be represented as Decimal(38, 6). 
+ + +-- !query +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- !query output +138698367904130467.654320988515622621 + + +-- !query +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> +-- !query output +138698367904130467.654320988515622621 + + +-- !query +select 12345678912345.123456789123 / 0.000000012345678 +-- !query schema +struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,9)> +-- !query output +1000000073899961059796.725866332 + + +-- !query +drop table decimals_test +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out new file mode 100644 index 0000000000000..7bef1bad4507e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/higher-order-functions.sql.out @@ -0,0 +1,284 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 29 + + +-- !query +create or replace temporary view nested as values + (1, array(32, 97), array(array(12, 99), array(123, 42), array(1))), + (2, array(77, -76), array(array(6, 96, 65), array(-1, -2))), + (3, array(12), array(array(17))) + as t(x, ys, zs) +-- !query schema +struct<> +-- !query output + + + +-- !query +select upper(x -> x) as v +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +A lambda function should only be used in a higher order function. 
However, its class is org.apache.spark.sql.catalyst.expressions.Upper, which is not a higher order function.; line 1 pos 7 + + +-- !query +select transform(zs, z -> z) as v from nested +-- !query schema +struct>> +-- !query output +[[12,99],[123,42],[1]] +[[17]] +[[6,96,65],[-1,-2]] + + +-- !query +select transform(ys, y -> y * y) as v from nested +-- !query schema +struct> +-- !query output +[1024,9409] +[144] +[5929,5776] + + +-- !query +select transform(ys, (y, i) -> y + i) as v from nested +-- !query schema +struct> +-- !query output +[12] +[32,98] +[77,-75] + + +-- !query +select transform(zs, z -> concat(ys, z)) as v from nested +-- !query schema +struct>> +-- !query output +[[12,17]] +[[32,97,12,99],[32,97,123,42],[32,97,1]] +[[77,-76,6,96,65],[77,-76,-1,-2]] + + +-- !query +select transform(ys, 0) as v from nested +-- !query schema +struct> +-- !query output +[0,0] +[0,0] +[0] + + +-- !query +select transform(cast(null as array), x -> x + 1) as v +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select filter(ys, y -> y > 30) as v from nested +-- !query schema +struct> +-- !query output +[32,97] +[77] +[] + + +-- !query +select filter(cast(null as array), y -> true) as v +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select transform(zs, z -> filter(z, zz -> zz > 50)) as v from nested +-- !query schema +struct>> +-- !query output +[[96,65],[]] +[[99],[123],[]] +[[]] + + +-- !query +select aggregate(ys, 0, (y, a) -> y + a + x) as v from nested +-- !query schema +struct +-- !query output +131 +15 +5 + + +-- !query +select aggregate(ys, (0 as sum, 0 as n), (acc, x) -> (acc.sum + x, acc.n + 1), acc -> acc.sum / acc.n) as v from nested +-- !query schema +struct +-- !query output +0.5 +12.0 +64.5 + + +-- !query +select transform(zs, z -> aggregate(z, 1, (acc, val) -> acc * val * size(z))) as v from nested +-- !query schema +struct> +-- !query output +[1010880,8] +[17] +[4752,20664,1] + + +-- !query +select aggregate(cast(null 
as array), 0, (a, y) -> a + y + 1, a -> a + 2) as v +-- !query schema +struct +-- !query output +NULL + + +-- !query +select exists(ys, y -> y > 30) as v from nested +-- !query schema +struct +-- !query output +false +true +true + + +-- !query +select exists(cast(null as array), y -> y > 30) as v +-- !query schema +struct +-- !query output +NULL + + +-- !query +select zip_with(ys, zs, (a, b) -> a + size(b)) as v from nested +-- !query schema +struct> +-- !query output +[13] +[34,99,null] +[80,-74] + + +-- !query +select zip_with(array('a', 'b', 'c'), array('d', 'e', 'f'), (x, y) -> concat(x, y)) as v +-- !query schema +struct> +-- !query output +["ad","be","cf"] + + +-- !query +select zip_with(array('a'), array('d', null, 'f'), (x, y) -> coalesce(x, y)) as v +-- !query schema +struct> +-- !query output +["a",null,"f"] + + +-- !query +create or replace temporary view nested as values + (1, map(1, 1, 2, 2, 3, 3)), + (2, map(4, 4, 5, 5, 6, 6)) + as t(x, ys) +-- !query schema +struct<> +-- !query output + + + +-- !query +select transform_keys(ys, (k, v) -> k) as v from nested +-- !query schema +struct> +-- !query output +{1:1,2:2,3:3} +{4:4,5:5,6:6} + + +-- !query +select transform_keys(ys, (k, v) -> k + 1) as v from nested +-- !query schema +struct> +-- !query output +{2:1,3:2,4:3} +{5:4,6:5,7:6} + + +-- !query +select transform_keys(ys, (k, v) -> k + v) as v from nested +-- !query schema +struct> +-- !query output +{10:5,12:6,8:4} +{2:1,4:2,6:3} + + +-- !query +select transform_values(ys, (k, v) -> v) as v from nested +-- !query schema +struct> +-- !query output +{1:1,2:2,3:3} +{4:4,5:5,6:6} + + +-- !query +select transform_values(ys, (k, v) -> v + 1) as v from nested +-- !query schema +struct> +-- !query output +{1:2,2:3,3:4} +{4:5,5:6,6:7} + + +-- !query +select transform_values(ys, (k, v) -> k + v) as v from nested +-- !query schema +struct> +-- !query output +{1:2,2:4,3:6} +{4:8,5:10,6:12} + + +-- !query +select transform(ys, all -> all * all) as v from values 
(array(32, 97)) as t(ys) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'all'(line 1, pos 21) + +== SQL == +select transform(ys, all -> all * all) as v from values (array(32, 97)) as t(ys) +---------------------^^^ + + +-- !query +select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'all'(line 1, pos 22) + +== SQL == +select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys) +----------------------^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out index 13f72614f5778..7fdb4c53d1dcb 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/interval.sql.out @@ -1,439 +1,1011 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 35 +-- Number of queries: 99 --- !query 0 -SET spark.sql.parser.ansi.enabled=true --- !query 0 schema -struct --- !query 0 output -spark.sql.parser.ansi.enabled true +-- !query +select 3 * (timestamp'2019-10-15 10:11:12.001002' - date'2019-10-15') +-- !query schema +struct +-- !query output +30 hours 33 minutes 36.003006 seconds --- !query 1 -select - '1' second, - 2 seconds, - '1' minute, - 2 minutes, - '1' hour, - 2 hours, - '1' day, - 2 days, - '1' month, - 2 months, - '1' year, - 2 years --- !query 1 schema -struct --- !query 1 output -interval 1 seconds interval 2 seconds interval 1 minutes interval 2 minutes interval 1 hours interval 2 hours interval 1 days interval 2 days interval 1 months interval 2 months interval 1 years interval 2 years - - --- !query 2 -select - interval '10-11' year to month, - interval '10' year, - interval '11' month --- !query 2 schema 
-struct --- !query 2 output -interval 10 years 11 months interval 10 years interval 11 months +-- !query +select interval 4 month 2 weeks 3 microseconds * 1.5 +-- !query schema +struct +-- !query output +6 months 21 days 0.000005 seconds --- !query 3 -select - '10-11' year to month, - '10' year, - '11' month --- !query 3 schema -struct --- !query 3 output -interval 10 years 11 months interval 10 years interval 11 months +-- !query +select (timestamp'2019-10-15' - timestamp'2019-10-14') / 1.5 +-- !query schema +struct +-- !query output +16 hours --- !query 4 -select - interval '10 9:8:7.987654321' day to second, - interval '10' day, - interval '11' hour, - interval '12' minute, - interval '13' second, - interval '13.123456789' second --- !query 4 schema -struct --- !query 4 output -interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 987 milliseconds 654 microseconds interval 1 weeks 3 days interval 11 hours interval 12 minutes interval 13 seconds interval 13 seconds 123 milliseconds 456 microseconds - - --- !query 5 -select - '10 9:8:7.987654321' day to second, - '10' day, - '11' hour, - '12' minute, - '13' second, - '13.123456789' second --- !query 5 schema -struct --- !query 5 output -interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 987 milliseconds 654 microseconds interval 1 weeks 3 days interval 11 hours interval 12 minutes interval 13 seconds interval 13 seconds 123 milliseconds 456 microseconds - - --- !query 6 -select map(1, interval 1 day, 2, interval 3 week) --- !query 6 schema -struct> --- !query 6 output -{1:interval 1 days,2:interval 3 weeks} +-- !query +select interval '2 seconds' / 0 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +divide by zero --- !query 7 -select map(1, 1 day, 2, 3 week) --- !query 7 schema -struct> --- !query 7 output -{1:interval 1 days,2:interval 3 weeks} +-- !query +select interval '2 seconds' / null +-- !query schema +struct +-- !query output +NULL --- !query 8 -create temporary view 
interval_arithmetic as - select CAST(dateval AS date), CAST(tsval AS timestamp) from values - ('2012-01-01', '2012-01-01') - as interval_arithmetic(dateval, tsval) --- !query 8 schema +-- !query +select interval '2 seconds' * null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null * interval '2 seconds' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select -interval '-1 month 1 day -1 second' +-- !query schema +struct<(- INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +1 months -1 days 1 seconds + + +-- !query +select -interval -1 month 1 day -1 second +-- !query schema +struct<(- INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +1 months -1 days 1 seconds + + +-- !query +select +interval '-1 month 1 day -1 second' +-- !query schema +struct<(+ INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +-1 months 1 days -1 seconds + + +-- !query +select +interval -1 month 1 day -1 second +-- !query schema +struct<(+ INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +-1 months 1 days -1 seconds + + +-- !query +select make_interval(1) +-- !query schema +struct +-- !query output +1 years + + +-- !query +select make_interval(1, 2) +-- !query schema +struct +-- !query output +1 years 2 months + + +-- !query +select make_interval(1, 2, 3) +-- !query schema +struct +-- !query output +1 years 2 months 21 days + + +-- !query +select make_interval(1, 2, 3, 4) +-- !query schema +struct +-- !query output +1 years 2 months 25 days + + +-- !query +select make_interval(1, 2, 3, 4, 5) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours + + +-- !query +select make_interval(1, 2, 3, 4, 5, 6) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours 6 minutes + + +-- !query +select make_interval(1, 2, 3, 4, 5, 6, 7.008009) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds + + 
+-- !query +select cast('1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('+1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('-1 second' as interval) +-- !query schema +struct +-- !query output +-1 seconds + + +-- !query +select cast('+ 1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('- 1 second' as interval) +-- !query schema +struct +-- !query output +-1 seconds + + +-- !query +select cast('- -1 second' as interval) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('- +1 second' as interval) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select interval 13.123456789 seconds, interval -13.123456789 second +-- !query schema +struct +-- !query output +13.123456 seconds -13.123456 seconds + + +-- !query +select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond 9 microsecond +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds + + +-- !query +select interval '30' year '25' month '-100' day '40' hour '80' minute '299.889987299' second +-- !query schema +struct +-- !query output +32 years 1 months -100 days 41 hours 24 minutes 59.889987 seconds + + +-- !query +select interval '0 0:0:0.1' day to second +-- !query schema +struct +-- !query output +0.1 seconds + + +-- !query +select interval '10-9' year to month +-- !query schema +struct +-- !query output +10 years 9 months + + +-- !query +select interval '20 15' day to hour +-- !query schema +struct +-- !query output +20 days 15 hours + + +-- !query +select interval '20 15:40' day to minute +-- !query schema +struct +-- !query output +20 days 15 hours 40 minutes + + +-- !query +select interval '20 15:40:32.99899999' day to second +-- !query schema +struct +-- !query output +20 days 15 hours 40 minutes 32.998999 seconds + + +-- 
!query +select interval '15:40' hour to minute +-- !query schema +struct +-- !query output +15 hours 40 minutes + + +-- !query +select interval '15:40:32.99899999' hour to second +-- !query schema +struct +-- !query output +15 hours 40 minutes 32.998999 seconds + + +-- !query +select interval '40:32.99899999' minute to second +-- !query schema +struct +-- !query output +40 minutes 32.998999 seconds + + +-- !query +select interval '40:32' minute to second +-- !query schema +struct +-- !query output +40 minutes 32 seconds + + +-- !query +select interval 30 day day +-- !query schema struct<> --- !query 8 output +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +no viable alternative at input 'day'(line 1, pos 23) +== SQL == +select interval 30 day day +-----------------------^^^ --- !query 9 -select - dateval, - dateval - interval '2-2' year to month, - dateval - interval '-2-2' year to month, - dateval + interval '2-2' year to month, - dateval + interval '-2-2' year to month, - - interval '2-2' year to month + dateval, - interval '2-2' year to month + dateval -from interval_arithmetic --- !query 9 schema -struct --- !query 9 output -2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 +-- !query +select interval '20 15:40:32.99899999' day to hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 10 -select - dateval, - dateval - '2-2' year to month, - dateval - '-2-2' year to month, - dateval + '2-2' year to month, - dateval + '-2-2' year to month, - - '2-2' year to month + dateval, - '2-2' year to month + dateval -from interval_arithmetic --- !query 10 schema -struct --- !query 10 output -2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior 
before Spark 3.0.(line 1, pos 16) +== SQL == +select interval '20 15:40:32.99899999' day to hour +----------------^^^ --- !query 11 -select - tsval, - tsval - interval '2-2' year to month, - tsval - interval '-2-2' year to month, - tsval + interval '2-2' year to month, - tsval + interval '-2-2' year to month, - - interval '2-2' year to month + tsval, - interval '2-2' year to month + tsval -from interval_arithmetic --- !query 11 schema -struct --- !query 11 output -2012-01-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 2014-03-01 00:00:00 2009-11-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 +-- !query +select interval '20 15:40:32.99899999' day to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 12 -select - tsval, - tsval - '2-2' year to month, - tsval - '-2-2' year to month, - tsval + '2-2' year to month, - tsval + '-2-2' year to month, - - '2-2' year to month + tsval, - '2-2' year to month + tsval -from interval_arithmetic --- !query 12 schema -struct --- !query 12 output -2012-01-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 2014-03-01 00:00:00 2009-11-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) +== SQL == +select interval '20 15:40:32.99899999' day to minute +----------------^^^ --- !query 13 -select - interval '2-2' year to month + interval '3-3' year to month, - interval '2-2' year to month - interval '3-3' year to month -from interval_arithmetic --- !query 13 schema -struct<(interval 2 years 2 months + interval 3 years 3 months):interval,(interval 2 years 2 months - interval 3 years 3 months):interval> --- !query 13 output -interval 5 years 5 months interval -1 years -1 months +-- !query +select interval '15:40:32.99899999' 
hour to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 14 -select - '2-2' year to month + '3-3' year to month, - '2-2' year to month - '3-3' year to month -from interval_arithmetic --- !query 14 schema -struct<(interval 2 years 2 months + interval 3 years 3 months):interval,(interval 2 years 2 months - interval 3 years 3 months):interval> --- !query 14 output -interval 5 years 5 months interval -1 years -1 months +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) +== SQL == +select interval '15:40:32.99899999' hour to minute +----------------^^^ --- !query 15 -select - dateval, - dateval - interval '99 11:22:33.123456789' day to second, - dateval - interval '-99 11:22:33.123456789' day to second, - dateval + interval '99 11:22:33.123456789' day to second, - dateval + interval '-99 11:22:33.123456789' day to second, - -interval '99 11:22:33.123456789' day to second + dateval, - interval '99 11:22:33.123456789' day to second + dateval -from interval_arithmetic --- !query 15 schema -struct --- !query 15 output -2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 2012-04-09 +-- !query +select interval '15:40.99899999' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 16 -select - dateval, - dateval - '99 11:22:33.123456789' day to second, - dateval - '-99 11:22:33.123456789' day to second, - dateval + '99 11:22:33.123456789' day to second, - dateval + '-99 11:22:33.123456789' day to second, - - '99 11:22:33.123456789' day to second + dateval, - '99 11:22:33.123456789' day to second + dateval -from interval_arithmetic --- !query 16 schema -struct --- !query 16 output -2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 
2012-04-09 +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) +== SQL == +select interval '15:40.99899999' hour to second +----------------^^^ --- !query 17 -select - tsval, - tsval - interval '99 11:22:33.123456789' day to second, - tsval - interval '-99 11:22:33.123456789' day to second, - tsval + interval '99 11:22:33.123456789' day to second, - tsval + interval '-99 11:22:33.123456789' day to second, - -interval '99 11:22:33.123456789' day to second + tsval, - interval '99 11:22:33.123456789' day to second + tsval -from interval_arithmetic --- !query 17 schema -struct --- !query 17 output -2012-01-01 00:00:00 2011-09-23 13:37:26.876544 2012-04-09 12:22:33.123456 2012-04-09 12:22:33.123456 2011-09-23 13:37:26.876544 2011-09-23 13:37:26.876544 2012-04-09 12:22:33.123456 +-- !query +select interval '15:40' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 18 -select - tsval, - tsval - '99 11:22:33.123456789' day to second, - tsval - '-99 11:22:33.123456789' day to second, - tsval + '99 11:22:33.123456789' day to second, - tsval + '-99 11:22:33.123456789' day to second, - - '99 11:22:33.123456789' day to second + tsval, - '99 11:22:33.123456789' day to second + tsval -from interval_arithmetic --- !query 18 schema -struct --- !query 18 output -2012-01-01 00:00:00 2011-09-23 13:37:26.876544 2012-04-09 12:22:33.123456 2012-04-09 12:22:33.123456 2011-09-23 13:37:26.876544 2011-09-23 13:37:26.876544 2012-04-09 12:22:33.123456 +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) +== SQL == +select 
interval '15:40' hour to second +----------------^^^ --- !query 19 -select - interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, - interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second -from interval_arithmetic --- !query 19 schema -struct<(interval 14 weeks 1 days 11 hours 22 minutes 33 seconds 123 milliseconds 456 microseconds + interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 123 milliseconds 456 microseconds):interval,(interval 14 weeks 1 days 11 hours 22 minutes 33 seconds 123 milliseconds 456 microseconds - interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 123 milliseconds 456 microseconds):interval> --- !query 19 output -interval 15 weeks 4 days 20 hours 30 minutes 40 seconds 246 milliseconds 912 microseconds interval 12 weeks 5 days 2 hours 14 minutes 26 seconds +-- !query +select interval '20 40:32.99899999' minute to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '20 40:32.99899999' minute to second +----------------^^^ + + +-- !query +select interval 10 nanoseconds +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Error parsing ' 10 nanoseconds' to interval, invalid unit 'nanoseconds'(line 1, pos 16) + +== SQL == +select interval 10 nanoseconds +----------------^^^ + + +-- !query +select map(1, interval 1 day, 2, interval 3 week) +-- !query schema +struct> +-- !query output +{1:1 days,2:21 days} + + +-- !query +select interval 'interval 3 year 1 hour' +-- !query schema +struct +-- !query output +3 years 1 hours + + +-- !query +select interval '3 year 1 hour' +-- !query schema +struct 
+-- !query output +3 years 1 hours --- !query 20 -select - '99 11:22:33.123456789' day to second + '10 9:8:7.123456789' day to second, - '99 11:22:33.123456789' day to second - '10 9:8:7.123456789' day to second -from interval_arithmetic --- !query 20 schema -struct<(interval 14 weeks 1 days 11 hours 22 minutes 33 seconds 123 milliseconds 456 microseconds + interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 123 milliseconds 456 microseconds):interval,(interval 14 weeks 1 days 11 hours 22 minutes 33 seconds 123 milliseconds 456 microseconds - interval 1 weeks 3 days 9 hours 8 minutes 7 seconds 123 milliseconds 456 microseconds):interval> --- !query 20 output -interval 15 weeks 4 days 20 hours 30 minutes 40 seconds 246 milliseconds 912 microseconds interval 12 weeks 5 days 2 hours 14 minutes 26 seconds +-- !query +select interval +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 21 -select 30 day --- !query 21 schema -struct --- !query 21 output -interval 4 weeks 2 days +at least one time unit should be given for interval literal(line 1, pos 7) +== SQL == +select interval +-------^^^ --- !query 22 -select 30 day day --- !query 22 schema + +-- !query +select interval 1 fake_unit +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 14) +Error parsing ' 1 fake_unit' to interval, invalid unit 'fake_unit'(line 1, pos 16) == SQL == -select 30 day day ---------------^^^ +select interval 1 fake_unit +----------------^^^ --- !query 23 -select 30 day day day --- !query 23 schema +-- !query +select interval 1 year to month +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 14) +The value of from-to unit must be a string(line 1, pos 16) == SQL == -select 30 day day day ---------------^^^ 
+select interval 1 year to month +----------------^^^ + +-- !query +select interval '1' year to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 24 -select date '2012-01-01' - 30 day --- !query 24 schema -struct --- !query 24 output -2011-12-02 +Intervals FROM year TO second are not supported.(line 1, pos 16) +== SQL == +select interval '1' year to second +----------------^^^ --- !query 25 -select date '2012-01-01' - 30 day day --- !query 25 schema + +-- !query +select interval '10-9' year to month '2-1' year to month +-- !query schema struct<> --- !query 25 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 34) +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) == SQL == -select date '2012-01-01' - 30 day day -----------------------------------^^^ +select interval '10-9' year to month '2-1' year to month +-------------------------------------^^^ --- !query 26 -select date '2012-01-01' - 30 day day day --- !query 26 schema +-- !query +select interval '10-9' year to month '12:11:10' hour to second +-- !query schema struct<> --- !query 26 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 34) +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) == SQL == -select date '2012-01-01' - 30 day day day -----------------------------------^^^ +select interval '10-9' year to month '12:11:10' hour to second +-------------------------------------^^^ --- !query 27 -select date '2012-01-01' + '-30' day --- !query 27 schema -struct --- !query 27 output -2011-12-02 +-- !query +select interval '1 15:11' day to minute '12:11:10' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +Can only have a single from-to unit in the interval literal 
syntax(line 1, pos 40) --- !query 28 -select date '2012-01-01' + interval '-30' day --- !query 28 schema -struct --- !query 28 output -2011-12-02 +== SQL == +select interval '1 15:11' day to minute '12:11:10' hour to second +----------------------------------------^^^ --- !query 29 -select date '2012-01-01' + interval (-30) day --- !query 29 schema +-- !query +select interval 1 year '2-1' year to month +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 42) +Can only have a single from-to unit in the interval literal syntax(line 1, pos 23) == SQL == -select date '2012-01-01' + interval (-30) day -------------------------------------------^^^ +select interval 1 year '2-1' year to month +-----------------------^^^ --- !query 30 -select date '2012-01-01' + (-30) day --- !query 30 schema +-- !query +select interval 1 year '12:11:10' hour to second +-- !query schema struct<> --- !query 30 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 33) +Can only have a single from-to unit in the interval literal syntax(line 1, pos 23) == SQL == -select date '2012-01-01' + (-30) day ----------------------------------^^^ +select interval 1 year '12:11:10' hour to second +-----------------------^^^ --- !query 31 -create temporary view t as select * from values (1), (2) as t(a) --- !query 31 schema +-- !query +select interval '10-9' year to month '1' year +-- !query schema struct<> --- !query 31 output +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) +== SQL == +select interval '10-9' year to month '1' year +-------------------------------------^^^ --- !query 32 -select date '2012-01-01' + interval (a + 1) day from t --- !query 32 schema +-- !query +select interval '12:11:10' hour to 
second '1' year +-- !query schema struct<> --- !query 32 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 44) +Can only have a single from-to unit in the interval literal syntax(line 1, pos 42) == SQL == -select date '2012-01-01' + interval (a + 1) day from t ---------------------------------------------^^^ +select interval '12:11:10' hour to second '1' year +------------------------------------------^^^ --- !query 33 -select date '2012-01-01' + (a + 1) day from t --- !query 33 schema +-- !query +select interval (-30) day +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -no viable alternative at input 'day'(line 1, pos 35) +no viable alternative at input 'day'(line 1, pos 22) == SQL == -select date '2012-01-01' + (a + 1) day from t ------------------------------------^^^ +select interval (-30) day +----------------------^^^ + +-- !query +select interval (a + 1) day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException --- !query 34 -SET spark.sql.parser.ansi.enabled=false --- !query 34 schema -struct --- !query 34 output -spark.sql.parser.ansi.enabled false +no viable alternative at input 'day'(line 1, pos 24) + +== SQL == +select interval (a + 1) day +------------------------^^^ + + +-- !query +select interval 30 day day day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'day'(line 1, pos 23) + +== SQL == +select interval 30 day day day +-----------------------^^^ + + +-- !query +select sum(cast(null as interval)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), 
(null) t(v) +-- !query schema +struct +-- !query output +3 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-3 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-7 days 2 seconds + + +-- !query +select + i, + sum(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i +-- !query schema +struct +-- !query output +1 -2 days +2 2 seconds +3 NULL + + +-- !query +select + sum(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null +-- !query schema +struct +-- !query output +-2 days 2 seconds + + +-- !query +SELECT + i, + sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v) +-- !query schema +struct +-- !query output +1 2 seconds +1 3 seconds +2 NULL +2 NULL + + +-- !query +select avg(cast(v as interval)) from VALUES (null) t(v) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +1.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +0.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) +-- !query 
schema +struct +-- !query output +-1.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-3 days -11 hours -59 minutes -59 seconds + + +-- !query +select + i, + avg(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i +-- !query schema +struct +-- !query output +1 -1 days +2 2 seconds +3 NULL + + +-- !query +select + avg(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null +-- !query schema +struct +-- !query output +-15 hours -59 minutes -59.333333 seconds + + +-- !query +SELECT + i, + avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v) +-- !query schema +struct +-- !query output +1 1.5 seconds +1 2 seconds +2 NULL +2 NULL + + +-- !query +create temporary view interval_arithmetic as + select CAST(dateval AS date), CAST(tsval AS timestamp) from values + ('2012-01-01', '2012-01-01') + as interval_arithmetic(dateval, tsval) +-- !query schema +struct<> +-- !query output + + + +-- !query +select + dateval, + dateval - interval '2-2' year to month, + dateval - interval '-2-2' year to month, + dateval + interval '2-2' year to month, + dateval + interval '-2-2' year to month, + - interval '2-2' year to month + dateval, + interval '2-2' year to month + dateval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 + + +-- !query +select + tsval, + tsval - interval '2-2' year to month, + tsval - interval '-2-2' year to month, + tsval + interval '2-2' year to month, + tsval + interval '-2-2' year to month, + - interval '2-2' year to month + tsval, + interval '2-2' year to month + tsval +from interval_arithmetic +-- !query schema 
+struct +-- !query output +2012-01-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 2014-03-01 00:00:00 2009-11-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 + + +-- !query +select + interval '2-2' year to month + interval '3-3' year to month, + interval '2-2' year to month - interval '3-3' year to month +from interval_arithmetic +-- !query schema +struct<(INTERVAL '2 years 2 months' + INTERVAL '3 years 3 months'):interval,(INTERVAL '2 years 2 months' - INTERVAL '3 years 3 months'):interval> +-- !query output +5 years 5 months -1 years -1 months + + +-- !query +select + dateval, + dateval - interval '99 11:22:33.123456789' day to second, + dateval - interval '-99 11:22:33.123456789' day to second, + dateval + interval '99 11:22:33.123456789' day to second, + dateval + interval '-99 11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + dateval, + interval '99 11:22:33.123456789' day to second + dateval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 2012-04-09 + + +-- !query +select + tsval, + tsval - interval '99 11:22:33.123456789' day to second, + tsval - interval '-99 11:22:33.123456789' day to second, + tsval + interval '99 11:22:33.123456789' day to second, + tsval + interval '-99 11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + tsval, + interval '99 11:22:33.123456789' day to second + tsval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 00:00:00 2011-09-23 12:37:26.876544 2012-04-09 11:22:33.123456 2012-04-09 11:22:33.123456 2011-09-23 12:37:26.876544 2011-09-23 12:37:26.876544 2012-04-09 11:22:33.123456 + + +-- !query +select + interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, + interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second +from interval_arithmetic +-- !query schema 
+struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval,(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' - INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval> +-- !query output +109 days 20 hours 30 minutes 40.246912 seconds 89 days 2 hours 14 minutes 26 seconds + + +-- !query +select interval '\t interval 1 day' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval 'interval \t 1\tday' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval 'interval\t1\tday' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval '1\t' day +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval '1 ' day +-- !query schema +struct +-- !query output +1 days + + +-- !query +select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select a - b from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select b + interval '1 month' from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 
1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') +-- !query schema +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> +-- !query output +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} + + +-- !query +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') +-- !query schema +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> +-- !query output +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out new file mode 100644 index 0000000000000..f6720f6c5faa4 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/ansi/literals.sql.out @@ -0,0 +1,481 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 50 + + +-- !query +select null, Null, nUll +-- !query schema +struct +-- !query output +NULL NULL NULL + + +-- !query +select true, tRue, false, fALse +-- !query schema +struct +-- !query output +true true false false + + +-- !query +select 1Y +-- !query schema +struct<1:tinyint> +-- !query output +1 + + +-- !query +select 127Y, -128Y +-- !query schema 
+struct<127:tinyint,-128:tinyint> +-- !query output +127 -128 + + +-- !query +select 128Y +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Numeric literal 128 does not fit in range [-128, 127] for type tinyint(line 1, pos 7) + +== SQL == +select 128Y +-------^^^ + + +-- !query +select 1S +-- !query schema +struct<1:smallint> +-- !query output +1 + + +-- !query +select 32767S, -32768S +-- !query schema +struct<32767:smallint,-32768:smallint> +-- !query output +32767 -32768 + + +-- !query +select 32768S +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Numeric literal 32768 does not fit in range [-32768, 32767] for type smallint(line 1, pos 7) + +== SQL == +select 32768S +-------^^^ + + +-- !query +select 1L, 2147483648L +-- !query schema +struct<1:bigint,2147483648:bigint> +-- !query output +1 2147483648 + + +-- !query +select 9223372036854775807L, -9223372036854775808L +-- !query schema +struct<9223372036854775807:bigint,-9223372036854775808:bigint> +-- !query output +9223372036854775807 -9223372036854775808 + + +-- !query +select 9223372036854775808L +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Numeric literal 9223372036854775808 does not fit in range [-9223372036854775808, 9223372036854775807] for type bigint(line 1, pos 7) + +== SQL == +select 9223372036854775808L +-------^^^ + + +-- !query +select 1, -1 +-- !query schema +struct<1:int,-1:int> +-- !query output +1 -1 + + +-- !query +select 2147483647, -2147483648 +-- !query schema +struct<2147483647:int,-2147483648:int> +-- !query output +2147483647 -2147483648 + + +-- !query +select 9223372036854775807, -9223372036854775808 +-- !query schema +struct<9223372036854775807:bigint,-9223372036854775808:bigint> +-- !query output +9223372036854775807 -9223372036854775808 + + +-- !query +select 9223372036854775808, -9223372036854775809 +-- !query schema 
+struct<9223372036854775808:decimal(19,0),-9223372036854775809:decimal(19,0)> +-- !query output +9223372036854775808 -9223372036854775809 + + +-- !query +select 1234567890123456789012345678901234567890 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +decimal can only support precision up to 38 +== SQL == +select 1234567890123456789012345678901234567890 + + +-- !query +select 1234567890123456789012345678901234567890.0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +decimal can only support precision up to 38 +== SQL == +select 1234567890123456789012345678901234567890.0 + + +-- !query +select 1D, 1.2D, 1e10, 1.5e5, .10D, 0.10D, .1e5, .9e+2, 0.9e+2, 900e-1, 9.e+1 +-- !query schema +struct<1.0:double,1.2:double,1.0E10:double,150000.0:double,0.1:double,0.1:double,10000.0:double,90.0:double,90.0:double,90.0:double,90.0:double> +-- !query output +1.0 1.2 1.0E10 150000.0 0.1 0.1 10000.0 90.0 90.0 90.0 90.0 + + +-- !query +select -1D, -1.2D, -1e10, -1.5e5, -.10D, -0.10D, -.1e5 +-- !query schema +struct<-1.0:double,-1.2:double,-1.0E10:double,-150000.0:double,-0.1:double,-0.1:double,-10000.0:double> +-- !query output +-1.0 -1.2 -1.0E10 -150000.0 -0.1 -0.1 -10000.0 + + +-- !query +select .e3 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'select .'(line 1, pos 7) + +== SQL == +select .e3 +-------^^^ + + +-- !query +select 1E309, -1E309 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Numeric literal 1E309 does not fit in range [-1.7976931348623157E+308, 1.7976931348623157E+308] for type double(line 1, pos 7) + +== SQL == +select 1E309, -1E309 +-------^^^ + + +-- !query +select 0.3, -0.8, .5, -.18, 0.1111, .1111 +-- !query schema 
+struct<0.3:decimal(1,1),-0.8:decimal(1,1),0.5:decimal(1,1),-0.18:decimal(2,2),0.1111:decimal(4,4),0.1111:decimal(4,4)> +-- !query output +0.3 -0.8 0.5 -0.18 0.1111 0.1111 + + +-- !query +select 123456789012345678901234567890123456789e10d, 123456789012345678901234567890123456789.1e10d +-- !query schema +struct<1.2345678901234568E48:double,1.2345678901234568E48:double> +-- !query output +1.2345678901234568E48 1.2345678901234568E48 + + +-- !query +select "Hello Peter!", 'hello lee!' +-- !query schema +struct +-- !query output +Hello Peter! hello lee! + + +-- !query +select 'hello' 'world', 'hello' " " 'lee' +-- !query schema +struct +-- !query output +helloworld hello lee + + +-- !query +select "hello 'peter'" +-- !query schema +struct +-- !query output +hello 'peter' + + +-- !query +select 'pattern%', 'no-pattern\%', 'pattern\\%', 'pattern\\\%' +-- !query schema +struct +-- !query output +pattern% no-pattern\% pattern\% pattern\\% + + +-- !query +select '\'', '"', '\n', '\r', '\t', 'Z' +-- !query schema +struct<':string,":string, +:string, :string, :string,Z:string> +-- !query output +' " + Z + + +-- !query +select '\110\145\154\154\157\041' +-- !query schema +struct +-- !query output +Hello! 
+ + +-- !query +select '\u0057\u006F\u0072\u006C\u0064\u0020\u003A\u0029' +-- !query schema +struct +-- !query output +World :) + + +-- !query +select dAte '2016-03-12' +-- !query schema +struct +-- !query output +2016-03-12 + + +-- !query +select date 'mar 11 2016' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: mar 11 2016(line 1, pos 7) + +== SQL == +select date 'mar 11 2016' +-------^^^ + + +-- !query +select tImEstAmp '2016-03-11 20:54:00.000' +-- !query schema +struct +-- !query output +2016-03-11 20:54:00 + + +-- !query +select timestamp '2016-33-11 20:54:00.000' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the TIMESTAMP value: 2016-33-11 20:54:00.000(line 1, pos 7) + +== SQL == +select timestamp '2016-33-11 20:54:00.000' +-------^^^ + + +-- !query +select GEO '(10,-6)' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Literals of type 'GEO' are currently not supported.(line 1, pos 7) + +== SQL == +select GEO '(10,-6)' +-------^^^ + + +-- !query +select 90912830918230182310293801923652346786BD, 123.0E-28BD, 123.08BD +-- !query schema +struct<90912830918230182310293801923652346786:decimal(38,0),1.230E-26:decimal(29,29),123.08:decimal(5,2)> +-- !query output +90912830918230182310293801923652346786 0.00000000000000000000000001230 123.08 + + +-- !query +select 1.20E-38BD +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +decimal can only support precision up to 38(line 1, pos 7) + +== SQL == +select 1.20E-38BD +-------^^^ + + +-- !query +select x'2379ACFe' +-- !query schema +struct +-- !query output +#y�� + + +-- !query +select X'XuZ' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +contains illegal character for hexBinary: 0XuZ(line 1, pos 7) + +== SQL == 
+select X'XuZ' +-------^^^ + + +-- !query +SELECT 3.14, -3.14, 3.14e8, 3.14e-8, -3.14e8, -3.14e-8, 3.14e+8, 3.14E8, 3.14E-8 +-- !query schema +struct<3.14:decimal(3,2),-3.14:decimal(3,2),3.14E8:double,3.14E-8:double,-3.14E8:double,-3.14E-8:double,3.14E8:double,3.14E8:double,3.14E-8:double> +-- !query output +3.14 -3.14 3.14E8 3.14E-8 -3.14E8 -3.14E-8 3.14E8 3.14E8 3.14E-8 + + +-- !query +select +date '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ DATE '1999-01-01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'DATE '1999-01-01'' is of date type.; line 1 pos 7 + + +-- !query +select +timestamp '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ TIMESTAMP '1999-01-01 00:00:00')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'TIMESTAMP '1999-01-01 00:00:00'' is of timestamp type.; line 1 pos 7 + + +-- !query +select +interval '1 day' +-- !query schema +struct<(+ INTERVAL '1 days'):interval> +-- !query output +1 days + + +-- !query +select +map(1, 2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ map(1, 2))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'map(1, 2)' is of map type.; line 1 pos 7 + + +-- !query +select +array(1,2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ array(1, 2))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'array(1, 2)' is of array type.; line 1 pos 7 + + +-- !query +select +named_struct('a', 1, 'b', 'spark') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ named_struct('a', 1, 'b', 'spark'))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 
'named_struct('a', 1, 'b', 'spark')' is of struct type.; line 1 pos 7 + + +-- !query +select +X'1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ X'01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'X'01'' is of binary type.; line 1 pos 7 + + +-- !query +select -date '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- DATE '1999-01-01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'DATE '1999-01-01'' is of date type.; line 1 pos 7 + + +-- !query +select -timestamp '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- TIMESTAMP '1999-01-01 00:00:00')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'TIMESTAMP '1999-01-01 00:00:00'' is of timestamp type.; line 1 pos 7 + + +-- !query +select -x'2379ACFe' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- X'2379ACFE')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'X'2379ACFE'' is of binary type.; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index 5f5d988771847..2c2b1a7856304 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -2,45 +2,45 @@ -- Number of queries: 12 --- !query 0 +-- !query create temporary view data as select * from values ("one", array(11, 12, 13), array(array(111, 112, 113), array(121, 122, 123))), ("two", array(21, 22, 23), array(array(211, 212, 213), array(221, 222, 223))) as data(a, b, c) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select * from data 
--- !query 1 schema +-- !query schema struct,c:array>> --- !query 1 output +-- !query output one [11,12,13] [[111,112,113],[121,122,123]] two [21,22,23] [[211,212,213],[221,222,223]] --- !query 2 +-- !query select a, b[0], b[0] + b[1] from data --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 11 23 two 21 43 --- !query 3 +-- !query select a, c[0][0] + c[0][0 + 1] from data --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 223 two 423 --- !query 4 +-- !query create temporary view primitive_arrays as select * from values ( array(true), array(2Y, 1Y), @@ -64,21 +64,21 @@ create temporary view primitive_arrays as select * from values ( date_array, timestamp_array ) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query select * from primitive_arrays --- !query 5 schema +-- !query schema struct,tinyint_array:array,smallint_array:array,int_array:array,bigint_array:array,decimal_array:array,double_array:array,float_array:array,date_array:array,timestamp_array:array> --- !query 5 output -[true] [2,1] [2,1] [2,1] [2,1] [9223372036854775809,9223372036854775808] [2.0,1.0] [2.0,1.0] [2016-03-14,2016-03-13] [2016-11-15 20:54:00.0,2016-11-12 20:54:00.0] +-- !query output +[true] [2,1] [2,1] [2,1] [2,1] [9223372036854775809,9223372036854775808] [2.0,1.0] [2.0,1.0] [2016-03-14,2016-03-13] [2016-11-15 20:54:00,2016-11-12 20:54:00] --- !query 6 +-- !query select array_contains(boolean_array, true), array_contains(boolean_array, false), array_contains(tinyint_array, 2Y), array_contains(tinyint_array, 0Y), @@ -91,22 +91,22 @@ select array_contains(date_array, date '2016-03-14'), array_contains(date_array, date '2016-01-01'), array_contains(timestamp_array, timestamp '2016-11-15 20:54:00.000'), array_contains(timestamp_array, timestamp '2016-01-01 20:54:00.000') from primitive_arrays --- !query 6 schema -struct --- !query 6 output +-- !query schema +struct 
+-- !query output true false true false true false true false true false true false true false true false true false true false --- !query 7 +-- !query select array_contains(b, 11), array_contains(c, array(111, 112, 113)) from data --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output false false true true --- !query 8 +-- !query select sort_array(boolean_array), sort_array(tinyint_array), @@ -119,31 +119,31 @@ select sort_array(date_array), sort_array(timestamp_array) from primitive_arrays --- !query 8 schema +-- !query schema struct,sort_array(tinyint_array, true):array,sort_array(smallint_array, true):array,sort_array(int_array, true):array,sort_array(bigint_array, true):array,sort_array(decimal_array, true):array,sort_array(double_array, true):array,sort_array(float_array, true):array,sort_array(date_array, true):array,sort_array(timestamp_array, true):array> --- !query 8 output -[true] [1,2] [1,2] [1,2] [1,2] [9223372036854775808,9223372036854775809] [1.0,2.0] [1.0,2.0] [2016-03-13,2016-03-14] [2016-11-12 20:54:00.0,2016-11-15 20:54:00.0] +-- !query output +[true] [1,2] [1,2] [1,2] [1,2] [9223372036854775808,9223372036854775809] [1.0,2.0] [1.0,2.0] [2016-03-13,2016-03-14] [2016-11-12 20:54:00,2016-11-15 20:54:00] --- !query 9 +-- !query select sort_array(array('b', 'd'), '1') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'sort_array(array('b', 'd'), '1')' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7 --- !query 10 +-- !query select sort_array(array('b', 'd'), cast(NULL as boolean)) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'sort_array(array('b', 'd'), CAST(NULL AS BOOLEAN))' due to data type mismatch: Sort order in second argument requires a boolean literal.; line 1 pos 7 --- !query 11 +-- 
!query select size(boolean_array), size(tinyint_array), @@ -156,7 +156,7 @@ select size(date_array), size(timestamp_array) from primitive_arrays --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 2 2 2 2 2 2 2 2 2 diff --git a/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out b/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out new file mode 100644 index 0000000000000..552b027df1bc0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/bitwise.sql.out @@ -0,0 +1,233 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 27 + + +-- !query +select bit_count(null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select bit_count(true) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(false) +-- !query schema +struct +-- !query output +0 + + +-- !query +select bit_count(cast(1 as tinyint)) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(cast(2 as tinyint)) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(cast(3 as tinyint)) +-- !query schema +struct +-- !query output +2 + + +-- !query +select bit_count(1S) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(2S) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(3S) +-- !query schema +struct +-- !query output +2 + + +-- !query +select bit_count(1) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(2) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(3) +-- !query schema +struct +-- !query output +2 + + +-- !query +select bit_count(1L) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(2L) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count(3L) +-- !query schema +struct +-- !query output +2 + + +-- !query +select bit_count(-1L) +-- !query schema +struct 
+-- !query output +64 + + +-- !query +select bit_count(9223372036854775807L) +-- !query schema +struct +-- !query output +63 + + +-- !query +select bit_count(-9223372036854775808L) +-- !query schema +struct +-- !query output +1 + + +-- !query +select bit_count("bit count") +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bit_count('bit count')' due to data type mismatch: argument 1 requires (integral or boolean) type, however, ''bit count'' is of string type.; line 1 pos 7 + + +-- !query +select bit_count('a') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bit_count('a')' due to data type mismatch: argument 1 requires (integral or boolean) type, however, ''a'' is of string type.; line 1 pos 7 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (2, 3, 4, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT BIT_XOR(b3) AS n1 FROM bitwise_test where 1 = 0 +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT BIT_XOR(b4) AS n1 FROM bitwise_test where b4 is null +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT + BIT_XOR(cast(b1 as tinyint)) AS a4, + BIT_XOR(cast(b2 as smallint)) AS b5, + BIT_XOR(b3) AS c2, + BIT_XOR(b4) AS d2, + BIT_XOR(distinct b4) AS e2 +FROM bitwise_test +-- !query schema +struct +-- !query output +4 5 2 2 2 + + +-- !query +SELECT bit_xor(b3) FROM bitwise_test GROUP BY b1 & 1 +-- !query schema +struct +-- !query output +4 +6 + + +-- !query +SELECT b1, bit_xor(b2) FROM bitwise_test GROUP BY b1 HAVING bit_and(b2) < 7 +-- !query schema +struct +-- !query output +1 1 +2 3 + + +-- !query +SELECT b1, b2, bit_xor(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test +-- !query schema +struct +-- !query output +1 1 1 +2 3 3 +7 7 7 diff --git 
a/sql/core/src/test/resources/sql-tests/results/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/cast.sql.out index adad21f049440..35b4c0e79720b 100644 --- a/sql/core/src/test/resources/sql-tests/results/cast.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cast.sql.out @@ -1,270 +1,270 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 35 +-- Number of queries: 46 --- !query 0 +-- !query SELECT CAST('1.23' AS int) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 --- !query 1 +-- !query SELECT CAST('1.23' AS long) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 --- !query 2 +-- !query SELECT CAST('-4.56' AS int) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output -4 --- !query 3 +-- !query SELECT CAST('-4.56' AS long) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output -4 --- !query 4 +-- !query SELECT CAST('abc' AS int) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output NULL --- !query 5 +-- !query SELECT CAST('abc' AS long) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output NULL --- !query 6 +-- !query SELECT CAST('1234567890123' AS int) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL --- !query 7 +-- !query SELECT CAST('12345678901234567890123' AS long) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL --- !query 8 +-- !query SELECT CAST('' AS int) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL --- !query 9 +-- !query SELECT CAST('' AS long) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output NULL --- !query 10 +-- !query SELECT CAST(NULL AS int) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL --- !query 11 +-- !query SELECT 
CAST(NULL AS long) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output NULL --- !query 12 +-- !query SELECT CAST('123.a' AS int) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output NULL --- !query 13 +-- !query SELECT CAST('123.a' AS long) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output NULL --- !query 14 +-- !query SELECT CAST('-2147483648' AS int) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output -2147483648 --- !query 15 +-- !query SELECT CAST('-2147483649' AS int) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output NULL --- !query 16 +-- !query SELECT CAST('2147483647' AS int) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2147483647 --- !query 17 +-- !query SELECT CAST('2147483648' AS int) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output NULL --- !query 18 +-- !query SELECT CAST('-9223372036854775808' AS long) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output -9223372036854775808 --- !query 19 +-- !query SELECT CAST('-9223372036854775809' AS long) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output NULL --- !query 20 +-- !query SELECT CAST('9223372036854775807' AS long) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 9223372036854775807 --- !query 21 +-- !query SELECT CAST('9223372036854775808' AS long) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL --- !query 22 +-- !query SELECT HEX(CAST('abc' AS binary)) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output 616263 --- !query 23 +-- !query SELECT HEX(CAST(CAST(123 AS byte) AS binary)) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 7B --- !query 24 +-- 
!query SELECT HEX(CAST(CAST(-123 AS byte) AS binary)) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 85 --- !query 25 +-- !query SELECT HEX(CAST(123S AS binary)) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 007B --- !query 26 +-- !query SELECT HEX(CAST(-123S AS binary)) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output FF85 --- !query 27 +-- !query SELECT HEX(CAST(123 AS binary)) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 0000007B --- !query 28 +-- !query SELECT HEX(CAST(-123 AS binary)) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output FFFFFF85 --- !query 29 +-- !query SELECT HEX(CAST(123L AS binary)) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 000000000000007B --- !query 30 +-- !query SELECT HEX(CAST(-123L AS binary)) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output FFFFFFFFFFFFFF85 --- !query 31 +-- !query DESC FUNCTION boolean --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output Class: org.apache.spark.sql.catalyst.expressions.Cast Function: boolean Usage: boolean(expr) - Casts the value `expr` to the target data type `boolean`. --- !query 32 +-- !query DESC FUNCTION EXTENDED boolean --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output Class: org.apache.spark.sql.catalyst.expressions.Cast Extended Usage: No example/argument for boolean. @@ -273,17 +273,108 @@ Function: boolean Usage: boolean(expr) - Casts the value `expr` to the target data type `boolean`. 
--- !query 33 +-- !query SELECT CAST('interval 3 month 1 hour' AS interval) --- !query 33 schema +-- !query schema struct --- !query 33 output -interval 3 months 1 hours +-- !query output +3 months 1 hours --- !query 34 +-- !query SELECT CAST(interval 3 month 1 hour AS string) --- !query 34 schema -struct --- !query 34 output -interval 3 months 1 hours +-- !query schema +struct +-- !query output +3 months 1 hours + + +-- !query +select cast(' 1' as tinyint) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast(' 1\t' as tinyint) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast(' 1' as smallint) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast(' 1' as INT) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast(' 1' as bigint) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast(' 1' as float) +-- !query schema +struct +-- !query output +1.0 + + +-- !query +select cast(' 1 ' as DOUBLE) +-- !query schema +struct +-- !query output +1.0 + + +-- !query +select cast('1.0 ' as DEC) +-- !query schema +struct +-- !query output +1 + + +-- !query +select cast('\t\t true \n\r ' as boolean) +-- !query schema +struct +-- !query output +true + + +-- !query +select cast('\t\n false \t\r' as boolean) +-- !query schema +struct +-- !query output +false + + +-- !query +select cast('\t\n xyz \t\r' as boolean) +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out index 114617873af47..b1a32ad1f63e9 100644 --- a/sql/core/src/test/resources/sql-tests/results/change-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/change-column.sql.out @@ -1,323 +1,257 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 34 +-- Number of queries: 28 --- !query 0 +-- !query CREATE TABLE test_change(a INT, b 
STRING, c INT) using parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query DESC test_change --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output a int b string c int --- !query 2 -ALTER TABLE test_change CHANGE a a1 INT --- !query 2 schema +-- !query +ALTER TABLE test_change CHANGE a +-- !query schema struct<> --- !query 2 output -org.apache.spark.sql.AnalysisException -ALTER TABLE CHANGE COLUMN is not supported for changing column 'a' with type 'IntegerType' to 'a1' with type 'IntegerType'; +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Operation not allowed: ALTER TABLE table CHANGE COLUMN requires a TYPE, a SET/DROP, a COMMENT, or a FIRST/AFTER(line 1, pos 0) +== SQL == +ALTER TABLE test_change CHANGE a +^^^ --- !query 3 + +-- !query DESC test_change --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output a int b string c int --- !query 4 -ALTER TABLE test_change CHANGE a a STRING --- !query 4 schema +-- !query +ALTER TABLE test_change RENAME COLUMN a TO a1 +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException -ALTER TABLE CHANGE COLUMN is not supported for changing column 'a' with type 'IntegerType' to 'a' with type 'StringType'; +RENAME COLUMN is only supported with v2 tables.; --- !query 5 +-- !query DESC test_change --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output a int b string c int --- !query 6 -ALTER TABLE test_change CHANGE a a INT AFTER b --- !query 6 schema +-- !query +ALTER TABLE test_change CHANGE a TYPE STRING +-- !query schema struct<> --- !query 6 output -org.apache.spark.sql.catalyst.parser.ParseException +-- !query output +org.apache.spark.sql.AnalysisException +ALTER TABLE CHANGE COLUMN is not supported for changing column 'a' with type 'IntegerType' to 'a' with type 'StringType'; -Operation not allowed: ALTER TABLE 
table [PARTITION partition_spec] CHANGE COLUMN ... FIRST | AFTER otherCol(line 1, pos 0) -== SQL == -ALTER TABLE test_change CHANGE a a INT AFTER b -^^^ +-- !query +DESC test_change +-- !query schema +struct +-- !query output +a int +b string +c int --- !query 7 -ALTER TABLE test_change CHANGE b b STRING FIRST --- !query 7 schema +-- !query +ALTER TABLE test_change CHANGE a AFTER b +-- !query schema struct<> --- !query 7 output -org.apache.spark.sql.catalyst.parser.ParseException +-- !query output +org.apache.spark.sql.AnalysisException +ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.; -Operation not allowed: ALTER TABLE table [PARTITION partition_spec] CHANGE COLUMN ... FIRST | AFTER otherCol(line 1, pos 0) -== SQL == -ALTER TABLE test_change CHANGE b b STRING FIRST -^^^ +-- !query +ALTER TABLE test_change CHANGE b FIRST +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +ALTER COLUMN ... FIRST | ALTER is only supported with v2 tables.; --- !query 8 +-- !query DESC test_change --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output a int b string c int --- !query 9 -ALTER TABLE test_change CHANGE a a INT COMMENT 'this is column a' --- !query 9 schema +-- !query +ALTER TABLE test_change CHANGE a COMMENT 'this is column a' +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 -ALTER TABLE test_change CHANGE b b STRING COMMENT '#*02?`' --- !query 10 schema +-- !query +ALTER TABLE test_change CHANGE b COMMENT '#*02?`' +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 -ALTER TABLE test_change CHANGE c c INT COMMENT '' --- !query 11 schema +-- !query +ALTER TABLE test_change CHANGE c COMMENT '' +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query DESC test_change --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output a int this is column a b string #*02?` c int --- !query 
13 -ALTER TABLE test_change CHANGE a a INT COMMENT 'this is column a' --- !query 13 schema +-- !query +ALTER TABLE test_change CHANGE a TYPE INT +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 -DESC test_change --- !query 14 schema -struct --- !query 14 output -a int this is column a -b string #*02?` -c int - - --- !query 15 -ALTER TABLE test_change CHANGE invalid_col invalid_col INT --- !query 15 schema +-- !query +ALTER TABLE test_change CHANGE a COMMENT 'this is column a' +-- !query schema struct<> --- !query 15 output -org.apache.spark.sql.AnalysisException -Can't find column `invalid_col` given table data columns [`a`, `b`, `c`]; +-- !query output --- !query 16 + +-- !query DESC test_change --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output a int this is column a b string #*02?` c int --- !query 17 -ALTER TABLE test_change CHANGE a a1 STRING COMMENT 'this is column a1' AFTER b --- !query 17 schema +-- !query +ALTER TABLE test_change CHANGE invalid_col TYPE INT +-- !query schema struct<> --- !query 17 output -org.apache.spark.sql.catalyst.parser.ParseException - -Operation not allowed: ALTER TABLE table [PARTITION partition_spec] CHANGE COLUMN ... 
FIRST | AFTER otherCol(line 1, pos 0) - -== SQL == -ALTER TABLE test_change CHANGE a a1 STRING COMMENT 'this is column a1' AFTER b -^^^ +-- !query output +org.apache.spark.sql.AnalysisException +Can't find column `invalid_col` given table data columns [`a`, `b`, `c`]; --- !query 18 +-- !query DESC test_change --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output a int this is column a b string #*02?` c int --- !query 19 -SET spark.sql.caseSensitive=false --- !query 19 schema -struct --- !query 19 output -spark.sql.caseSensitive false - - --- !query 20 -ALTER TABLE test_change CHANGE a A INT COMMENT 'this is column A' --- !query 20 schema +-- !query +ALTER TABLE test_change CHANGE A COMMENT 'case insensitivity' +-- !query schema struct<> --- !query 20 output - +-- !query output --- !query 21 -SET spark.sql.caseSensitive=true --- !query 21 schema -struct --- !query 21 output -spark.sql.caseSensitive true - --- !query 22 -ALTER TABLE test_change CHANGE a A INT COMMENT 'this is column A1' --- !query 22 schema -struct<> --- !query 22 output -org.apache.spark.sql.AnalysisException -ALTER TABLE CHANGE COLUMN is not supported for changing column 'a' with type 'IntegerType' to 'A' with type 'IntegerType'; - - --- !query 23 +-- !query DESC test_change --- !query 23 schema +-- !query schema struct --- !query 23 output -a int this is column A +-- !query output +a int case insensitivity b string #*02?` c int --- !query 24 +-- !query CREATE TEMPORARY VIEW temp_view(a, b) AS SELECT 1, "one" --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 -ALTER TABLE temp_view CHANGE a a INT COMMENT 'this is column a' --- !query 25 schema +-- !query +ALTER TABLE temp_view CHANGE a TYPE INT +-- !query schema struct<> --- !query 25 output -org.apache.spark.sql.catalyst.analysis.NoSuchTableException -Table or view 'temp_view' not found in database 'default'; +-- !query output +org.apache.spark.sql.AnalysisException 
+Invalid command: 'temp_view' is a view not a table.; line 1 pos 0 --- !query 26 +-- !query CREATE GLOBAL TEMPORARY VIEW global_temp_view(a, b) AS SELECT 1, "one" --- !query 26 schema -struct<> --- !query 26 output - - - --- !query 27 -ALTER TABLE global_temp.global_temp_view CHANGE a a INT COMMENT 'this is column a' --- !query 27 schema -struct<> --- !query 27 output -org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException -Database 'global_temp' not found; - - --- !query 28 -CREATE TABLE partition_table(a INT, b STRING, c INT, d STRING) USING parquet PARTITIONED BY (c, d) --- !query 28 schema +-- !query schema struct<> --- !query 28 output +-- !query output --- !query 29 -ALTER TABLE partition_table PARTITION (c = 1) CHANGE COLUMN a new_a INT --- !query 29 schema +-- !query +ALTER TABLE global_temp.global_temp_view CHANGE a TYPE INT +-- !query schema struct<> --- !query 29 output -org.apache.spark.sql.catalyst.parser.ParseException - -Operation not allowed: ALTER TABLE table PARTITION partition_spec CHANGE COLUMN(line 1, pos 0) - -== SQL == -ALTER TABLE partition_table PARTITION (c = 1) CHANGE COLUMN a new_a INT -^^^ - - --- !query 30 -ALTER TABLE partition_table CHANGE COLUMN c c INT COMMENT 'this is column C' --- !query 30 schema -struct<> --- !query 30 output +-- !query output org.apache.spark.sql.AnalysisException -Can't find column `c` given table data columns [`a`, `b`]; +Invalid command: 'global_temp.global_temp_view' is a view not a table.; line 1 pos 0 --- !query 31 +-- !query DROP TABLE test_change --- !query 31 schema -struct<> --- !query 31 output - - - --- !query 32 -DROP TABLE partition_table --- !query 32 schema +-- !query schema struct<> --- !query 32 output +-- !query output --- !query 33 +-- !query DROP VIEW global_temp.global_temp_view --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out 
b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out index 9fc97f0c39149..f34b75a379aae 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-negative.sql.out @@ -2,239 +2,239 @@ -- Number of queries: 28 --- !query 0 +-- !query CREATE DATABASE mydb1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query USE mydb1 --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TABLE t1 USING parquet AS SELECT 1 AS i1 --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE DATABASE mydb2 --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query USE mydb2 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TABLE t1 USING parquet AS SELECT 20 AS i1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SET spark.sql.crossJoin.enabled = true --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output spark.sql.crossJoin.enabled true --- !query 7 +-- !query USE mydb1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT i1 FROM t1, mydb1.t1 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i1' is ambiguous, could be: mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 --- !query 9 +-- !query SELECT t1.i1 FROM t1, mydb1.t1 --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException Reference 't1.i1' is ambiguous, could be: mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 --- !query 10 +-- !query 
SELECT mydb1.t1.i1 FROM t1, mydb1.t1 --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'mydb1.t1.i1' is ambiguous, could be: mydb1.t1.i1, mydb1.t1.i1.; line 1 pos 7 --- !query 11 +-- !query SELECT i1 FROM t1, mydb2.t1 --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i1' is ambiguous, could be: mydb1.t1.i1, mydb2.t1.i1.; line 1 pos 7 --- !query 12 +-- !query SELECT t1.i1 FROM t1, mydb2.t1 --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException Reference 't1.i1' is ambiguous, could be: mydb1.t1.i1, mydb2.t1.i1.; line 1 pos 7 --- !query 13 +-- !query USE mydb2 --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query SELECT i1 FROM t1, mydb1.t1 --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i1' is ambiguous, could be: mydb2.t1.i1, mydb1.t1.i1.; line 1 pos 7 --- !query 15 +-- !query SELECT t1.i1 FROM t1, mydb1.t1 --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException Reference 't1.i1' is ambiguous, could be: mydb2.t1.i1, mydb1.t1.i1.; line 1 pos 7 --- !query 16 +-- !query SELECT i1 FROM t1, mydb2.t1 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i1' is ambiguous, could be: mydb2.t1.i1, mydb2.t1.i1.; line 1 pos 7 --- !query 17 +-- !query SELECT t1.i1 FROM t1, mydb2.t1 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException Reference 't1.i1' is ambiguous, could be: mydb2.t1.i1, mydb2.t1.i1.; line 1 pos 7 --- !query 18 +-- !query SELECT db1.t1.i1 FROM t1, mydb2.t1 
--- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`db1.t1.i1`' given input columns: [mydb2.t1.i1, mydb2.t1.i1]; line 1 pos 7 --- !query 19 +-- !query SET spark.sql.crossJoin.enabled = false --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output spark.sql.crossJoin.enabled false --- !query 20 +-- !query USE mydb1 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query SELECT mydb1.t1 FROM t1 --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`mydb1.t1`' given input columns: [mydb1.t1.i1]; line 1 pos 7 --- !query 22 +-- !query SELECT t1.x.y.* FROM t1 --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 't1.x.y.*' given input columns 'i1'; --- !query 23 +-- !query SELECT t1 FROM mydb1.t1 --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`t1`' given input columns: [mydb1.t1.i1]; line 1 pos 7 --- !query 24 +-- !query USE mydb2 --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 +-- !query SELECT mydb1.t1.i1 FROM t1 --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`mydb1.t1.i1`' given input columns: [mydb2.t1.i1]; line 1 pos 7 --- !query 26 +-- !query DROP DATABASE mydb1 CASCADE --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output --- !query 27 +-- !query DROP DATABASE mydb2 CASCADE --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out 
b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out index 3d8fb661afe55..16ff4f51bd5f9 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution-views.sql.out @@ -2,137 +2,137 @@ -- Number of queries: 17 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW view1 AS SELECT 2 AS i1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT view1.* FROM view1 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 2 --- !query 2 +-- !query SELECT * FROM view1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 2 --- !query 3 +-- !query SELECT view1.i1 FROM view1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 2 --- !query 4 +-- !query SELECT i1 FROM view1 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 2 --- !query 5 +-- !query SELECT a.i1 FROM view1 AS a --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 2 --- !query 6 +-- !query SELECT i1 FROM view1 AS a --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 2 --- !query 7 +-- !query DROP VIEW view1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query CREATE OR REPLACE GLOBAL TEMPORARY VIEW view1 as SELECT 1 as i1 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query SELECT * FROM global_temp.view1 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 --- !query 10 +-- !query SELECT global_temp.view1.* FROM global_temp.view1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 --- !query 11 +-- !query SELECT i1 FROM global_temp.view1 --- !query 11 schema +-- !query schema 
struct --- !query 11 output +-- !query output 1 --- !query 12 +-- !query SELECT global_temp.view1.i1 FROM global_temp.view1 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 --- !query 13 +-- !query SELECT view1.i1 FROM global_temp.view1 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 --- !query 14 +-- !query SELECT a.i1 FROM global_temp.view1 AS a --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 --- !query 15 +-- !query SELECT i1 FROM global_temp.view1 AS a --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 --- !query 16 +-- !query DROP VIEW global_temp.view1 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out index 73e3fdc08232c..dcfd48b687b17 100644 --- a/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/columnresolution.sql.out @@ -2,442 +2,442 @@ -- Number of queries: 55 --- !query 0 +-- !query CREATE DATABASE mydb1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query USE mydb1 --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TABLE t1 USING parquet AS SELECT 1 AS i1 --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE DATABASE mydb2 --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query USE mydb2 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TABLE t1 USING parquet AS SELECT 20 AS i1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output 
+-- !query output --- !query 6 +-- !query USE mydb1 --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query SELECT i1 FROM t1 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 --- !query 8 +-- !query SELECT i1 FROM mydb1.t1 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 --- !query 9 +-- !query SELECT t1.i1 FROM t1 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 --- !query 10 +-- !query SELECT t1.i1 FROM mydb1.t1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 --- !query 11 +-- !query SELECT mydb1.t1.i1 FROM t1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 --- !query 12 +-- !query SELECT mydb1.t1.i1 FROM mydb1.t1 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 --- !query 13 +-- !query USE mydb2 --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query SELECT i1 FROM t1 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 20 --- !query 15 +-- !query SELECT i1 FROM mydb1.t1 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 --- !query 16 +-- !query SELECT t1.i1 FROM t1 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 20 --- !query 17 +-- !query SELECT t1.i1 FROM mydb1.t1 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 --- !query 18 +-- !query SELECT mydb1.t1.i1 FROM mydb1.t1 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 --- !query 19 +-- !query USE mydb1 --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT t1.* FROM t1 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- 
!query output 1 --- !query 21 +-- !query SELECT mydb1.t1.* FROM mydb1.t1 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 --- !query 22 +-- !query SELECT t1.* FROM mydb1.t1 --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output 1 --- !query 23 +-- !query USE mydb2 --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query SELECT t1.* FROM t1 --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 20 --- !query 25 +-- !query SELECT mydb1.t1.* FROM mydb1.t1 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 --- !query 26 +-- !query SELECT t1.* FROM mydb1.t1 --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1 --- !query 27 +-- !query SELECT a.* FROM mydb1.t1 AS a --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 1 --- !query 28 +-- !query USE mydb1 --- !query 28 schema +-- !query schema struct<> --- !query 28 output +-- !query output --- !query 29 +-- !query CREATE TABLE t3 USING parquet AS SELECT * FROM VALUES (4,1), (3,1) AS t3(c1, c2) --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output --- !query 30 +-- !query CREATE TABLE t4 USING parquet AS SELECT * FROM VALUES (4,1), (2,1) AS t4(c2, c3) --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output --- !query 31 +-- !query SELECT * FROM t3 WHERE c1 IN (SELECT c2 FROM t4 WHERE t4.c3 = t3.c2) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 4 1 --- !query 32 +-- !query SELECT * FROM mydb1.t3 WHERE c1 IN (SELECT mydb1.t4.c2 FROM mydb1.t4 WHERE mydb1.t4.c3 = mydb1.t3.c2) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 4 1 --- !query 33 +-- !query SET spark.sql.crossJoin.enabled = true --- !query 33 schema +-- !query schema struct --- 
!query 33 output +-- !query output spark.sql.crossJoin.enabled true --- !query 34 +-- !query SELECT mydb1.t1.i1 FROM t1, mydb2.t1 --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 1 --- !query 35 +-- !query SELECT mydb1.t1.i1 FROM mydb1.t1, mydb2.t1 --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 1 --- !query 36 +-- !query USE mydb2 --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output --- !query 37 +-- !query SELECT mydb1.t1.i1 FROM t1, mydb1.t1 --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 1 --- !query 38 +-- !query SET spark.sql.crossJoin.enabled = false --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output spark.sql.crossJoin.enabled false --- !query 39 +-- !query USE mydb1 --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output --- !query 40 +-- !query CREATE TABLE t5(i1 INT, t5 STRUCT) USING parquet --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output --- !query 41 +-- !query INSERT INTO t5 VALUES(1, (2, 3)) --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output --- !query 42 +-- !query SELECT t5.i1 FROM t5 --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output 1 --- !query 43 +-- !query SELECT t5.t5.i1 FROM t5 --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 2 --- !query 44 +-- !query SELECT t5.t5.i1 FROM mydb1.t5 --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 2 --- !query 45 +-- !query SELECT t5.i1 FROM mydb1.t5 --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 1 --- !query 46 +-- !query SELECT t5.* FROM mydb1.t5 --- !query 46 schema +-- !query schema struct> --- !query 46 output +-- !query output 1 {"i1":2,"i2":3} --- !query 47 +-- !query SELECT t5.t5.* 
FROM mydb1.t5 --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 2 3 --- !query 48 +-- !query SELECT mydb1.t5.t5.i1 FROM mydb1.t5 --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output 2 --- !query 49 +-- !query SELECT mydb1.t5.t5.i2 FROM mydb1.t5 --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output 3 --- !query 50 +-- !query SELECT mydb1.t5.* FROM mydb1.t5 --- !query 50 schema +-- !query schema struct> --- !query 50 output +-- !query output 1 {"i1":2,"i2":3} --- !query 51 +-- !query SELECT mydb1.t5.* FROM t5 --- !query 51 schema +-- !query schema struct> --- !query 51 output +-- !query output 1 {"i1":2,"i2":3} --- !query 52 +-- !query USE default --- !query 52 schema +-- !query schema struct<> --- !query 52 output +-- !query output --- !query 53 +-- !query DROP DATABASE mydb1 CASCADE --- !query 53 schema +-- !query schema struct<> --- !query 53 output +-- !query output --- !query 54 +-- !query DROP DATABASE mydb2 CASCADE --- !query 54 schema +-- !query schema struct<> --- !query 54 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/comparator.sql.out b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out index afc7b5448b7b6..721b56cc1da2f 100644 --- a/sql/core/src/test/resources/sql-tests/results/comparator.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/comparator.sql.out @@ -1,18 +1,82 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 2 +-- Number of queries: 10 --- !query 0 +-- !query select x'00' < x'0f' --- !query 0 schema +-- !query schema struct<(X'00' < X'0F'):boolean> --- !query 0 output +-- !query output true --- !query 1 +-- !query select x'00' < x'ff' --- !query 1 schema +-- !query schema struct<(X'00' < X'FF'):boolean> --- !query 1 output +-- !query output +true + + +-- !query +select '1 ' = 1Y +-- !query schema +struct<(CAST(1 AS TINYINT) = 1):boolean> +-- !query output +true 
+ + +-- !query +select '\t1 ' = 1Y +-- !query schema +struct<(CAST( 1 AS TINYINT) = 1):boolean> +-- !query output +true + + +-- !query +select '1 ' = 1S +-- !query schema +struct<(CAST(1 AS SMALLINT) = 1):boolean> +-- !query output +true + + +-- !query +select '1 ' = 1 +-- !query schema +struct<(CAST(1 AS INT) = 1):boolean> +-- !query output +true + + +-- !query +select ' 1' = 1L +-- !query schema +struct<(CAST( 1 AS BIGINT) = 1):boolean> +-- !query output +true + + +-- !query +select ' 1' = cast(1.0 as float) +-- !query schema +struct<(CAST( 1 AS FLOAT) = CAST(1.0 AS FLOAT)):boolean> +-- !query output +true + + +-- !query +select ' 1.0 ' = 1.0D +-- !query schema +struct<(CAST( 1.0 AS DOUBLE) = 1.0):boolean> +-- !query output +true + + +-- !query +select ' 1.0 ' = 1.0BD +-- !query schema +struct<(CAST( 1.0 AS DOUBLE) = CAST(1.0 AS DOUBLE)):boolean> +-- !query output true diff --git a/sql/core/src/test/resources/sql-tests/results/count.sql.out b/sql/core/src/test/resources/sql-tests/results/count.sql.out index b8a86d4c44594..68a5114bb5859 100644 --- a/sql/core/src/test/resources/sql-tests/results/count.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/count.sql.out @@ -2,27 +2,27 @@ -- Number of queries: 5 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (1, 1), (null, 2), (1, null), (null, null) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT count(*), count(1), count(null), count(a), count(b), count(a + b), count((a, b)) FROM testData --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 7 7 0 5 5 4 7 --- !query 2 +-- !query SELECT count(DISTINCT 1), count(DISTINCT null), @@ -31,25 +31,25 @@ SELECT count(DISTINCT (a + b)), count(DISTINCT (a, b)) FROM testData --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 0 2 2 2 6 --- !query 3 +-- !query 
SELECT count(a, b), count(b, a), count(testData.*) FROM testData --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 4 4 4 --- !query 4 +-- !query SELECT count(DISTINCT a, b), count(DISTINCT b, a), count(DISTINCT *), count(DISTINCT testData.*) FROM testData --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 3 3 3 3 diff --git a/sql/core/src/test/resources/sql-tests/results/cross-join.sql.out b/sql/core/src/test/resources/sql-tests/results/cross-join.sql.out index 3833c42bdfecf..ce2305cb7ec90 100644 --- a/sql/core/src/test/resources/sql-tests/results/cross-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cross-join.sql.out @@ -2,35 +2,35 @@ -- Number of queries: 13 --- !query 0 +-- !query create temporary view nt1 as select * from values ("one", 1), ("two", 2), ("three", 3) as nt1(k, v1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) as nt2(k, v2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM nt1 cross join nt2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 one 1 one 1 one 5 one 1 two 22 @@ -42,82 +42,82 @@ two 2 one 5 two 2 two 22 --- !query 3 +-- !query SELECT * FROM nt1 cross join nt2 where nt1.k = nt2.k --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 one 1 one 1 one 5 two 2 two 22 --- !query 4 +-- !query SELECT * FROM nt1 cross join nt2 on (nt1.k = nt2.k) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 one 1 one 1 one 5 two 2 two 22 --- !query 5 +-- !query SELECT * FROM nt1 cross join nt2 where nt1.v1 = 1 and nt2.v2 = 22 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one 1 two 22 --- !query 6 +-- !query 
SELECT a.key, b.key FROM (SELECT k key FROM nt1 WHERE v1 < 2) a CROSS JOIN (SELECT k key FROM nt2 WHERE v2 = 22) b --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output one two --- !query 7 +-- !query create temporary view A(a, va) as select * from nt1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query create temporary view B(b, vb) as select * from nt1 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query create temporary view C(c, vc) as select * from nt1 --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query create temporary view D(d, vd) as select * from nt1 --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query select * from ((A join B on (a = b)) cross join C) join D on (a = d) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output one 1 one 1 one 1 one 1 one 1 one 1 three 3 one 1 one 1 one 1 two 2 one 1 @@ -129,11 +129,11 @@ two 2 two 2 three 3 two 2 two 2 two 2 two 2 two 2 --- !query 12 +-- !query SELECT * FROM nt1 CROSS JOIN nt2 ON (nt1.k > nt2.k) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output three 3 one 1 three 3 one 5 two 2 one 1 diff --git a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out index 03d4bfffa8923..8495bef9122ef 100644 --- a/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/csv-functions.sql.out @@ -2,36 +2,36 @@ -- Number of queries: 17 --- !query 0 +-- !query select from_csv('1, 3.14', 'a INT, f FLOAT') --- !query 0 schema +-- !query schema struct> --- !query 0 output +-- !query output {"a":1,"f":3.14} --- !query 1 +-- !query select from_csv('26/08/2015', 'time 
Timestamp', map('timestampFormat', 'dd/MM/yyyy')) --- !query 1 schema +-- !query schema struct> --- !query 1 output -{"time":2015-08-26 00:00:00.0} +-- !query output +{"time":2015-08-26 00:00:00} --- !query 2 +-- !query select from_csv('1', 1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output org.apache.spark.sql.AnalysisException Schema should be specified in DDL format as a string literal or output of the schema_of_csv function instead of 1;; line 1 pos 7 --- !query 3 +-- !query select from_csv('1', 'a InvalidType') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException DataType invalidtype is not supported.(line 1, pos 2) @@ -42,112 +42,112 @@ a InvalidType ; line 1 pos 7 --- !query 4 +-- !query select from_csv('1', 'a INT', named_struct('mode', 'PERMISSIVE')) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException Must use a map() function for options;; line 1 pos 7 --- !query 5 +-- !query select from_csv('1', 'a INT', map('mode', 1)) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException A type of keys and values in map() must be string, but got map;; line 1 pos 7 --- !query 6 +-- !query select from_csv() --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException Invalid number of arguments for function from_csv. 
Expected: one of 2 and 3; Found: 0; line 1 pos 7 --- !query 7 +-- !query select from_csv('1,abc', schema_of_csv('1,abc')) --- !query 7 schema +-- !query schema struct> --- !query 7 output +-- !query output {"_c0":1,"_c1":"abc"} --- !query 8 +-- !query select schema_of_csv('1|abc', map('delimiter', '|')) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output struct<_c0:int,_c1:string> --- !query 9 +-- !query select schema_of_csv(null) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'schema_of_csv(NULL)' due to data type mismatch: The input csv should be a string literal and not null; however, got NULL.; line 1 pos 7 --- !query 10 +-- !query CREATE TEMPORARY VIEW csvTable(csvField, a) AS SELECT * FROM VALUES ('1,abc', 'a') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT schema_of_csv(csvField) FROM csvTable --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'schema_of_csv(csvtable.`csvField`)' due to data type mismatch: The input csv should be a string literal and not null; however, got csvtable.`csvField`.; line 1 pos 7 --- !query 12 +-- !query DROP VIEW IF EXISTS csvTable --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query select to_csv(named_struct('a', 1, 'b', 2)) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1,2 --- !query 14 +-- !query select to_csv(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy')) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 26/08/2015 --- !query 15 +-- !query select to_csv(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE')) --- !query 15 schema +-- !query schema 
struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException Must use a map() function for options;; line 1 pos 7 --- !query 16 +-- !query select to_csv(named_struct('a', 1, 'b', 2), map('mode', 1)) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException A type of keys and values in map() must be string, but got map;; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out index 5193e2536c0cc..a9709c4a79793 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out @@ -2,65 +2,65 @@ -- Number of queries: 17 --- !query 0 +-- !query create temporary view t as select * from values 0, 1, 2 as t(id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values 0, 1 as t(id) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SET spark.sql.legacy.ctePrecedence.enabled=true --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output spark.sql.legacy.ctePrecedence.enabled true --- !query 3 +-- !query WITH t as ( WITH t2 AS (SELECT 1) SELECT * FROM t2 ) SELECT * FROM t --- !query 3 schema +-- !query schema struct<1:int> --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT max(c) FROM ( WITH t(c) AS (SELECT 1) SELECT * FROM t ) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 --- !query 5 +-- !query SELECT ( WITH t AS (SELECT 1) SELECT * FROM t ) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 --- !query 6 +-- !query WITH t AS (SELECT 1), t2 AS ( @@ -68,13 +68,13 @@ WITH SELECT * FROM t ) SELECT * FROM t2 --- !query 6 schema +-- 
!query schema struct<1:int> --- !query 6 output +-- !query output 1 --- !query 7 +-- !query WITH t(c) AS (SELECT 1), t2 AS ( @@ -86,13 +86,13 @@ WITH ) ) SELECT * FROM t2 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 --- !query 8 +-- !query WITH t AS (SELECT 1), t2 AS ( @@ -104,25 +104,25 @@ WITH SELECT * FROM t2 ) SELECT * FROM t2 --- !query 8 schema +-- !query schema struct<2:int> --- !query 8 output +-- !query output 2 --- !query 9 +-- !query WITH t(c) AS (SELECT 1) SELECT max(c) FROM ( WITH t(c) AS (SELECT 2) SELECT * FROM t ) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 2 --- !query 10 +-- !query WITH t(c) AS (SELECT 1) SELECT sum(c) FROM ( SELECT max(c) AS c FROM ( @@ -130,13 +130,13 @@ SELECT sum(c) FROM ( SELECT * FROM t ) ) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 2 --- !query 11 +-- !query WITH t(c) AS (SELECT 1) SELECT sum(c) FROM ( WITH t(c) AS (SELECT 2) @@ -145,25 +145,25 @@ SELECT sum(c) FROM ( SELECT * FROM t ) ) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 3 --- !query 12 +-- !query WITH t AS (SELECT 1) SELECT ( WITH t AS (SELECT 2) SELECT * FROM t ) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 --- !query 13 +-- !query WITH t AS (SELECT 1) SELECT ( SELECT ( @@ -171,13 +171,13 @@ SELECT ( SELECT * FROM t ) ) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 --- !query 14 +-- !query WITH t AS (SELECT 1) SELECT ( WITH t AS (SELECT 2) @@ -186,23 +186,23 @@ SELECT ( SELECT * FROM t ) ) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 --- !query 15 +-- !query DROP VIEW IF EXISTS t --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query DROP VIEW IF EXISTS t2 --- !query 16 schema +-- !query schema struct<> --- !query 16 output 
+-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out new file mode 100644 index 0000000000000..2d87781193c25 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -0,0 +1,343 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 27 + + +-- !query +create temporary view t as select * from values 0, 1, 2 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view t2 as select * from values 0, 1 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +WITH s AS (SELECT 1 FROM s) SELECT * FROM s +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s; line 1 pos 25 + + +-- !query +WITH r AS (SELECT (SELECT * FROM r)) +SELECT * FROM r +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: r; line 1 pos 33 + + +-- !query +WITH t AS (SELECT 1 FROM t) SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 + + +-- !query +WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s2; line 1 pos 26 + + +-- !query +WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2 +-- !query schema +struct +-- !query output +0 2 +0 2 +1 2 +1 2 + + +-- !query +WITH CTE1 AS ( + SELECT b.id AS id + FROM T2 a + CROSS JOIN (SELECT id AS id FROM T2) b +) +SELECT t1.id AS c1, + t2.id AS c2 +FROM CTE1 t1 + CROSS JOIN CTE1 t2 +-- !query schema +struct +-- !query output +0 0 +0 0 +0 0 +0 0 +0 1 +0 1 +0 1 +0 1 +1 0 +1 0 +1 0 +1 0 +1 1 +1 1 +1 1 +1 1 + + +-- !query +WITH t(x) AS (SELECT 1) +SELECT * FROM t WHERE x = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH 
t(x, y) AS (SELECT 1, 2) +SELECT * FROM t WHERE x = 1 AND y = 2 +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t(x, x) AS (SELECT 1, 2) +SELECT * FROM t +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t() AS (SELECT 1) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'WITH t()'(line 1, pos 7) + +== SQL == +WITH t() AS (SELECT 1) +-------^^^ +SELECT * FROM t + + +-- !query +WITH + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +CTE definition can't have duplicate names: 't'.(line 1, pos 0) + +== SQL == +WITH +^^^ + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t + + +-- !query +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2 +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2 +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2 +-- !query schema +struct<3:int> +-- !query output +3 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query 
+WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +DROP VIEW IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index b7dd76c725209..1d50aa8f57505 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -2,72 +2,72 @@ -- Number of queries: 27 --- !query 0 +-- !query create temporary view t as select * from values 0, 1, 2 as t(id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values 0, 1 as t(id) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query WITH s AS (SELECT 1 FROM s) SELECT * FROM s --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output org.apache.spark.sql.AnalysisException Table or view not found: s; line 1 pos 25 --- !query 3 +-- !query WITH r AS (SELECT 
(SELECT * FROM r)) SELECT * FROM r --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException Table or view not found: r; line 1 pos 33 --- !query 4 +-- !query WITH t AS (SELECT 1 FROM t) SELECT * FROM t --- !query 4 schema +-- !query schema struct<1:int> --- !query 4 output +-- !query output 1 1 1 --- !query 5 +-- !query WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Table or view not found: s2; line 1 pos 26 --- !query 6 +-- !query WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 0 2 0 2 1 2 1 2 --- !query 7 +-- !query WITH CTE1 AS ( SELECT b.id AS id FROM T2 a @@ -77,9 +77,9 @@ SELECT t1.id AS c1, t2.id AS c2 FROM CTE1 t1 CROSS JOIN CTE1 t2 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 0 0 0 0 0 0 @@ -98,39 +98,39 @@ struct 1 1 --- !query 8 +-- !query WITH t(x) AS (SELECT 1) SELECT * FROM t WHERE x = 1 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 --- !query 9 +-- !query WITH t(x, y) AS (SELECT 1, 2) SELECT * FROM t WHERE x = 1 AND y = 2 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 2 --- !query 10 +-- !query WITH t(x, x) AS (SELECT 1, 2) SELECT * FROM t --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 2 --- !query 11 +-- !query WITH t() AS (SELECT 1) SELECT * FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'WITH t()'(line 1, pos 7) @@ -141,14 +141,14 @@ WITH t() AS (SELECT 1) SELECT * FROM t --- !query 12 +-- !query WITH t(x) AS (SELECT 1), t(x) AS 
(SELECT 2) SELECT * FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException CTE definition can't have duplicate names: 't'.(line 1, pos 0) @@ -161,41 +161,41 @@ WITH SELECT * FROM t --- !query 13 +-- !query WITH t as ( WITH t2 AS (SELECT 1) SELECT * FROM t2 ) SELECT * FROM t --- !query 13 schema +-- !query schema struct<1:int> --- !query 13 output +-- !query output 1 --- !query 14 +-- !query SELECT max(c) FROM ( WITH t(c) AS (SELECT 1) SELECT * FROM t ) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 --- !query 15 +-- !query SELECT ( WITH t AS (SELECT 1) SELECT * FROM t ) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 --- !query 16 +-- !query WITH t AS (SELECT 1), t2 AS ( @@ -203,13 +203,14 @@ WITH SELECT * FROM t ) SELECT * FROM t2 --- !query 16 schema -struct<2:int> --- !query 16 output -2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 17 +-- !query WITH t(c) AS (SELECT 1), t2 AS ( @@ -221,13 +222,14 @@ WITH ) ) SELECT * FROM t2 --- !query 17 schema -struct --- !query 17 output -2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 18 +-- !query WITH t AS (SELECT 1), t2 AS ( @@ -239,25 +241,26 @@ WITH SELECT * FROM t2 ) SELECT * FROM t2 --- !query 18 schema -struct<3:int> --- !query 18 output -3 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. 
Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 19 +-- !query WITH t(c) AS (SELECT 1) SELECT max(c) FROM ( WITH t(c) AS (SELECT 2) SELECT * FROM t ) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 2 --- !query 20 +-- !query WITH t(c) AS (SELECT 1) SELECT sum(c) FROM ( SELECT max(c) AS c FROM ( @@ -265,13 +268,13 @@ SELECT sum(c) FROM ( SELECT * FROM t ) ) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 2 --- !query 21 +-- !query WITH t(c) AS (SELECT 1) SELECT sum(c) FROM ( WITH t(c) AS (SELECT 2) @@ -280,25 +283,26 @@ SELECT sum(c) FROM ( SELECT * FROM t ) ) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 3 --- !query 22 +-- !query WITH t AS (SELECT 1) SELECT ( WITH t AS (SELECT 2) SELECT * FROM t ) --- !query 22 schema -struct --- !query 22 output -2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 23 +-- !query WITH t AS (SELECT 1) SELECT ( SELECT ( @@ -306,13 +310,14 @@ SELECT ( SELECT * FROM t ) ) --- !query 23 schema -struct --- !query 23 output -2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 24 +-- !query WITH t AS (SELECT 1) SELECT ( WITH t AS (SELECT 2) @@ -321,23 +326,24 @@ SELECT ( SELECT * FROM t ) ) --- !query 24 schema -struct --- !query 24 output -3 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. 
Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; --- !query 25 +-- !query DROP VIEW IF EXISTS t --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query DROP VIEW IF EXISTS t2 --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/date_part.sql.out b/sql/core/src/test/resources/sql-tests/results/date_part.sql.out new file mode 100644 index 0000000000000..b4cceedffd98b --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/date_part.sql.out @@ -0,0 +1,886 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 110 + + +-- !query +CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c +-- !query schema +struct<> +-- !query output + + + +-- !query +select date_part('millennium', c) from t +-- !query schema +struct +-- !query output +3 + + +-- !query +select date_part('millennia', c) from t +-- !query schema +struct +-- !query output +3 + + +-- !query +select date_part('mil', c) from t +-- !query schema +struct +-- !query output +3 + + +-- !query +select date_part('mils', c) from t +-- !query schema +struct +-- !query output +3 + + +-- !query +select date_part('century', c) from t +-- !query schema +struct +-- !query output +21 + + +-- !query +select date_part('centuries', c) from t +-- !query schema +struct +-- !query output +21 + + +-- !query +select date_part('c', c) from t +-- !query schema +struct +-- !query output +21 + + +-- !query +select date_part('cent', c) from t +-- !query schema +struct +-- !query output +21 + + +-- !query +select date_part('decade', c) from t +-- !query schema +struct +-- !query output +201 + + +-- !query +select date_part('decades', c) from t +-- !query schema +struct +-- !query output +201 + + +-- !query +select date_part('dec', c) from t +-- !query schema 
+struct +-- !query output +201 + + +-- !query +select date_part('decs', c) from t +-- !query schema +struct +-- !query output +201 + + +-- !query +select date_part('year', c) from t +-- !query schema +struct +-- !query output +2011 + + +-- !query +select date_part('y', c) from t +-- !query schema +struct +-- !query output +2011 + + +-- !query +select date_part('years', c) from t +-- !query schema +struct +-- !query output +2011 + + +-- !query +select date_part('yr', c) from t +-- !query schema +struct +-- !query output +2011 + + +-- !query +select date_part('yrs', c) from t +-- !query schema +struct +-- !query output +2011 + + +-- !query +select date_part('quarter', c) from t +-- !query schema +struct +-- !query output +2 + + +-- !query +select date_part('qtr', c) from t +-- !query schema +struct +-- !query output +2 + + +-- !query +select date_part('month', c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select date_part('mon', c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select date_part('mons', c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select date_part('months', c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select date_part('week', c) from t +-- !query schema +struct +-- !query output +18 + + +-- !query +select date_part('w', c) from t +-- !query schema +struct +-- !query output +18 + + +-- !query +select date_part('weeks', c) from t +-- !query schema +struct +-- !query output +18 + + +-- !query +select date_part('day', c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('d', c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('days', c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('dayofweek', c) from t +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('dow', c) from t +-- !query schema +struct +-- !query 
output +5 + + +-- !query +select date_part('isodow', c) from t +-- !query schema +struct +-- !query output +5 + + +-- !query +select date_part('doy', c) from t +-- !query schema +struct +-- !query output +126 + + +-- !query +select date_part('hour', c) from t +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('h', c) from t +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hours', c) from t +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hr', c) from t +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hrs', c) from t +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('minute', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('m', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('min', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('mins', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('minutes', c) from t +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('second', c) from t +-- !query schema +struct +-- !query output +9.123456 + + +-- !query +select date_part('s', c) from t +-- !query schema +struct +-- !query output +9.123456 + + +-- !query +select date_part('sec', c) from t +-- !query schema +struct +-- !query output +9.123456 + + +-- !query +select date_part('seconds', c) from t +-- !query schema +struct +-- !query output +9.123456 + + +-- !query +select date_part('secs', c) from t +-- !query schema +struct +-- !query output +9.123456 + + +-- !query +select date_part('not_supported', c) from t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Literals of type 'not_supported' are currently not supported for the string type.;; line 1 pos 7 + + +-- !query +select date_part(c, c) from 
t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The field parameter needs to be a foldable string value.;; line 1 pos 7 + + +-- !query +select date_part(null, c) from t +-- !query schema +struct +-- !query output +NULL + + +-- !query +CREATE TEMPORARY VIEW t2 AS select interval 1010 year 9 month 8 day 7 hour 6 minute 5 second 4 millisecond 3 microsecond as c +-- !query schema +struct<> +-- !query output + + + +-- !query +select date_part('millennium', c) from t2 +-- !query schema +struct +-- !query output +1 + + +-- !query +select date_part('millennia', c) from t2 +-- !query schema +struct +-- !query output +1 + + +-- !query +select date_part('mil', c) from t2 +-- !query schema +struct +-- !query output +1 + + +-- !query +select date_part('mils', c) from t2 +-- !query schema +struct +-- !query output +1 + + +-- !query +select date_part('century', c) from t2 +-- !query schema +struct +-- !query output +10 + + +-- !query +select date_part('centuries', c) from t2 +-- !query schema +struct +-- !query output +10 + + +-- !query +select date_part('c', c) from t2 +-- !query schema +struct +-- !query output +10 + + +-- !query +select date_part('cent', c) from t2 +-- !query schema +struct +-- !query output +10 + + +-- !query +select date_part('decade', c) from t2 +-- !query schema +struct +-- !query output +101 + + +-- !query +select date_part('decades', c) from t2 +-- !query schema +struct +-- !query output +101 + + +-- !query +select date_part('dec', c) from t2 +-- !query schema +struct +-- !query output +101 + + +-- !query +select date_part('decs', c) from t2 +-- !query schema +struct +-- !query output +101 + + +-- !query +select date_part('year', c) from t2 +-- !query schema +struct +-- !query output +1010 + + +-- !query +select date_part('y', c) from t2 +-- !query schema +struct +-- !query output +1010 + + +-- !query +select date_part('years', c) from t2 +-- !query schema +struct +-- !query output +1010 + + +-- !query +select 
date_part('yr', c) from t2 +-- !query schema +struct +-- !query output +1010 + + +-- !query +select date_part('yrs', c) from t2 +-- !query schema +struct +-- !query output +1010 + + +-- !query +select date_part('quarter', c) from t2 +-- !query schema +struct +-- !query output +4 + + +-- !query +select date_part('qtr', c) from t2 +-- !query schema +struct +-- !query output +4 + + +-- !query +select date_part('month', c) from t2 +-- !query schema +struct +-- !query output +9 + + +-- !query +select date_part('mon', c) from t2 +-- !query schema +struct +-- !query output +9 + + +-- !query +select date_part('mons', c) from t2 +-- !query schema +struct +-- !query output +9 + + +-- !query +select date_part('months', c) from t2 +-- !query schema +struct +-- !query output +9 + + +-- !query +select date_part('day', c) from t2 +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('d', c) from t2 +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('days', c) from t2 +-- !query schema +struct +-- !query output +8 + + +-- !query +select date_part('hour', c) from t2 +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('h', c) from t2 +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hours', c) from t2 +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hr', c) from t2 +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('hrs', c) from t2 +-- !query schema +struct +-- !query output +7 + + +-- !query +select date_part('minute', c) from t2 +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('m', c) from t2 +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('min', c) from t2 +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('mins', c) from t2 +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('minutes', c) 
from t2 +-- !query schema +struct +-- !query output +6 + + +-- !query +select date_part('second', c) from t2 +-- !query schema +struct +-- !query output +5.004003 + + +-- !query +select date_part('s', c) from t2 +-- !query schema +struct +-- !query output +5.004003 + + +-- !query +select date_part('sec', c) from t2 +-- !query schema +struct +-- !query output +5.004003 + + +-- !query +select date_part('seconds', c) from t2 +-- !query schema +struct +-- !query output +5.004003 + + +-- !query +select date_part('secs', c) from t2 +-- !query schema +struct +-- !query output +5.004003 + + +-- !query +select date_part('milliseconds', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('msec', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('msecs', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('millisecon', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('mseconds', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('ms', c) from t2 +-- !query schema +struct +-- !query output +5004.003 + + +-- !query +select date_part('microseconds', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('usec', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('usecs', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('useconds', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('microsecon', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('us', c) from t2 +-- !query schema +struct +-- !query output +5004003 + + +-- !query +select date_part('epoch', c) from t2 +-- !query schema +struct +-- !query output 
+31897220765.004003 + + +-- !query +select date_part('not_supported', c) from t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Literals of type 'not_supported' are currently not supported for the interval type.;; line 1 pos 7 + + +-- !query +select date_part(c, c) from t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The field parameter needs to be a foldable string value.;; line 1 pos 7 + + +-- !query +select date_part(null, c) from t2 +-- !query schema +struct +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out old mode 100644 new mode 100755 index 178400e5706b8..a7b098d79a706 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -1,131 +1,393 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 15 +-- Number of queries: 47 --- !query 0 +-- !query select current_date = current_date(), current_timestamp = current_timestamp() --- !query 0 schema +-- !query schema struct<(current_date() = current_date()):boolean,(current_timestamp() = current_timestamp()):boolean> --- !query 0 output +-- !query output true true --- !query 1 +-- !query select to_date(null), to_date('2016-12-31'), to_date('2016-12-31', 'yyyy-MM-dd') --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL 2016-12-31 2016-12-31 --- !query 2 +-- !query select to_timestamp(null), to_timestamp('2016-12-31 00:12:00'), to_timestamp('2016-12-31', 'yyyy-MM-dd') --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output NULL 2016-12-31 00:12:00 2016-12-31 00:00:00 --- !query 3 +-- !query select dayofweek('2007-02-03'), dayofweek('2009-07-30'), dayofweek('2017-05-27'), dayofweek(null), dayofweek('1582-10-15 13:10:15') --- !query 3 schema +-- !query schema 
struct --- !query 3 output +-- !query output 7 5 7 NULL 6 --- !query 4 +-- !query create temporary view ttf1 as select * from values (1, 2), (2, 3) as ttf1(current_date, current_timestamp) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query select current_date, current_timestamp from ttf1 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 2 2 3 --- !query 6 +-- !query create temporary view ttf2 as select * from values (1, 2), (2, 3) as ttf2(a, b) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query select current_date = current_date(), current_timestamp = current_timestamp(), a, b from ttf2 --- !query 7 schema +-- !query schema struct<(current_date() = current_date()):boolean,(current_timestamp() = current_timestamp()):boolean,a:int,b:int> --- !query 7 output +-- !query output true true 1 2 true true 2 3 --- !query 8 +-- !query select a, b from ttf2 order by a, current_date --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 2 2 3 --- !query 9 +-- !query select weekday('2007-02-03'), weekday('2009-07-30'), weekday('2017-05-27'), weekday(null), weekday('1582-10-15 13:10:15') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 5 3 5 NULL 4 --- !query 10 +-- !query select year('1500-01-01'), month('1500-01-01'), dayOfYear('1500-01-01') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1500 1 1 --- !query 11 -select date '2001-09-28' + 7 --- !query 11 schema +-- !query +select date '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 + + +-- !query +select timestamp '2019-01-01\t' +-- !query schema +struct +-- !query output +2019-01-01 00:00:00 + + +-- !query +select timestamp'2011-11-11 11:11:11' + interval '2' day +-- !query schema +struct +-- !query output +2011-11-13 11:11:11 + + +-- !query +select 
timestamp'2011-11-11 11:11:11' - interval '2' day +-- !query schema +struct +-- !query output +2011-11-09 11:11:11 + + +-- !query +select date'2011-11-11 11:11:11' + interval '2' second +-- !query schema +struct +-- !query output +2011-11-11 + + +-- !query +select date'2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select '2011-11-11' - interval '2' day +-- !query schema +struct +-- !query output +2011-11-09 00:00:00 + + +-- !query +select '2011-11-11 11:11:11' - interval '2' second +-- !query schema +struct +-- !query output +2011-11-11 11:11:09 + + +-- !query +select '1' - interval '2' second +-- !query schema +struct +-- !query output +NULL + + +-- !query +select 1 - interval '2' second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '1 - INTERVAL '2 seconds'' due to data type mismatch: argument 1 requires timestamp type, however, '1' is of int type.; line 1 pos 7 + + +-- !query +select date'2020-01-01' - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +2078 hours 48 minutes 47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - date'2020-01-01' +-- !query schema +struct +-- !query output +-2078 hours -48 minutes -47.654322 seconds + + +-- !query +select timestamp'2019-10-06 10:11:12.345678' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null - timestamp'2019-10-06 10:11:12.345678' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date_add('2011-11-11', 1Y) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1S) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add('2011-11-11', 1L) +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1L)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1L' is of bigint type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 1.0BD)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '1.0BD' is of decimal(2,1) type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', 1E1) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select date_add('2011-11-11', '1') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'date_add(CAST('2011-11-11' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 + + +-- !query +select date_add(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_add(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-12 + + +-- !query +select date_sub(date'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(timestamp'2011-11-11', 1) +-- !query schema +struct +-- !query output +2011-11-10 + + +-- !query +select date_sub(null, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date_sub(date'2011-11-11', null) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date'2011-11-11' + 1E1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException 
+cannot resolve 'date_add(DATE '2011-11-11', 10.0D)' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, '10.0D' is of double type.; line 1 pos 7 + + +-- !query +select null + date '2001-09-28' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' + 7Y +-- !query schema struct --- !query 11 output +-- !query output 2001-10-05 --- !query 12 -select 7 + date '2001-09-28' --- !query 12 schema +-- !query +select 7S + date '2001-09-28' +-- !query schema struct --- !query 12 output +-- !query output 2001-10-05 --- !query 13 +-- !query select date '2001-10-01' - 7 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 2001-09-24 --- !query 14 +-- !query +select date '2001-09-28' + null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select date '2001-09-28' - null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null - date '2019-10-06' +-- !query schema +struct +-- !query output +NULL + + +-- !query select date '2001-10-01' - date '2001-09-28' --- !query 14 schema -struct --- !query 14 output -3 +-- !query schema +struct +-- !query output +3 days diff --git a/sql/core/src/test/resources/sql-tests/results/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/decimalArithmeticOperations.sql.out index 217233bfad378..72e46ef493a5d 100644 --- a/sql/core/src/test/resources/sql-tests/results/decimalArithmeticOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/decimalArithmeticOperations.sql.out @@ -1,458 +1,335 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 54 +-- Number of queries: 40 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select a / b from t --- !query 1 schema +-- !query schema struct<(CAST(a AS DECIMAL(2,1)) / 
CAST(b AS DECIMAL(2,1))):decimal(8,6)> --- !query 1 output +-- !query output NULL --- !query 2 +-- !query select a % b from t --- !query 2 schema +-- !query schema struct<(CAST(a AS DECIMAL(2,1)) % CAST(b AS DECIMAL(2,1))):decimal(1,1)> --- !query 2 output +-- !query output NULL --- !query 3 +-- !query select pmod(a, b) from t --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output NULL --- !query 4 +-- !query create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query insert into decimals_test values(1, 100.0, 999.0), (2, 12345.123, 12345.123), (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query select id, a+b, a-b, a*b, a/b from decimals_test order by id --- !query 6 schema +-- !query schema struct --- !query 6 output -1 1099 -899 99900 0.1001 -2 24690.246 0 152402061.885129 1 -3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001 +-- !query output +1 1099.00000000000000000 -899.00000000000000000 99900.000000 0.100100 +2 24690.24600000000000000 0.00000000000000000 152402061.885129 1.000000 +3 1234.22345678910110000 -1233.97654321089890000 152.358023 0.000100 4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109 --- !query 7 +-- !query select id, a*10, b/10 from decimals_test order by id --- !query 7 schema +-- !query schema struct --- !query 7 output -1 1000 99.9 -2 123451.23 1234.5123 -3 1.234567891011 123.41 -4 1234567891234567890 0.112345678912345679 +-- !query output +1 1000.000000000000000 99.900000000000000000 +2 123451.230000000000000 1234.512300000000000000 +3 1.234567891011000 123.410000000000000000 +4 1234567891234567890.000000000000000 0.112345678912345679 --- !query 8 +-- !query 
select 10.3 * 3.0 --- !query 8 schema +-- !query schema struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> --- !query 8 output -30.9 +-- !query output +30.90 --- !query 9 +-- !query select 10.3000 * 3.0 --- !query 9 schema +-- !query schema struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> --- !query 9 output -30.9 +-- !query output +30.90000 --- !query 10 +-- !query select 10.30000 * 30.0 --- !query 10 schema +-- !query schema struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> --- !query 10 output -309 +-- !query output +309.000000 --- !query 11 +-- !query select 10.300000000000000000 * 3.000000000000000000 --- !query 11 schema +-- !query schema struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,34)> --- !query 11 output -30.9 +-- !query output +30.9000000000000000000000000000000000 --- !query 12 +-- !query select 10.300000000000000000 * 3.0000000000000000000 --- !query 12 schema +-- !query schema struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,34)> --- !query 12 output -30.9 +-- !query output +30.9000000000000000000000000000000000 --- !query 13 +-- !query select 2.35E10 * 1.0 --- !query 13 schema -struct<(CAST(2.35E+10 AS DECIMAL(12,1)) * CAST(1.0 AS DECIMAL(12,1))):decimal(6,-7)> --- !query 13 output -23500000000 +-- !query schema +struct<(2.35E10 * CAST(1.0 AS DOUBLE)):double> +-- !query output +2.35E10 --- !query 14 -select (5e36 + 0.1) + 5e36 --- !query 14 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> --- !query 14 output +-- !query +select (5e36BD + 0.1) + 5e36BD +-- !query schema +struct<(CAST((CAST(5000000000000000000000000000000000000 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + 
CAST(5000000000000000000000000000000000000 AS DECIMAL(38,1))):decimal(38,1)> +-- !query output NULL --- !query 15 -select (-4e36 - 0.1) - 7e36 --- !query 15 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> --- !query 15 output +-- !query +select (-4e36BD - 0.1) - 7e36BD +-- !query schema +struct<(CAST((CAST(-4000000000000000000000000000000000000 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7000000000000000000000000000000000000 AS DECIMAL(38,1))):decimal(38,1)> +-- !query output NULL --- !query 16 +-- !query select 12345678901234567890.0 * 12345678901234567890.0 --- !query 16 schema +-- !query schema struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> --- !query 16 output +-- !query output NULL --- !query 17 -select 1e35 / 0.1 --- !query 17 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> --- !query 17 output +-- !query +select 1e35BD / 0.1 +-- !query schema +struct<(CAST(100000000000000000000000000000000000 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,6)> +-- !query output NULL --- !query 18 -select 1.2345678901234567890E30 * 1.2345678901234567890E25 --- !query 18 schema -struct<(CAST(1.2345678901234567890E+30 AS DECIMAL(25,-6)) * CAST(1.2345678901234567890E+25 AS DECIMAL(25,-6))):decimal(38,-17)> --- !query 18 output +-- !query +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD +-- !query schema +struct<(CAST(1234567890123456789000000000000 AS DECIMAL(31,0)) * CAST(12345678901234567890000000 AS DECIMAL(31,0))):decimal(38,0)> +-- !query output NULL --- !query 19 +-- !query select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 --- !query 19 schema +-- !query schema struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,6)) + CAST(9999999999999999999999999999999.12345 AS DECIMAL(38,6))):decimal(38,6)> --- !query 19 
output +-- !query output 10012345678912345678912345678911.246907 --- !query 20 +-- !query select 123456789123456789.1234567890 * 1.123456789123456789 --- !query 20 schema +-- !query schema struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,18)> --- !query 20 output +-- !query output 138698367904130467.654320988515622621 --- !query 21 +-- !query select 12345678912345.123456789123 / 0.000000012345678 --- !query 21 schema +-- !query schema struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,9)> --- !query 21 output +-- !query output 1000000073899961059796.725866332 --- !query 22 +-- !query set spark.sql.decimalOperations.allowPrecisionLoss=false --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output spark.sql.decimalOperations.allowPrecisionLoss false --- !query 23 +-- !query select id, a+b, a-b, a*b, a/b from decimals_test order by id --- !query 23 schema +-- !query schema struct --- !query 23 output -1 1099 -899 NULL 0.1001001001001001 -2 24690.246 0 NULL 1 -3 1234.2234567891011 -1233.9765432108989 NULL 0.000100037913541123 +-- !query output +1 1099.000000000000000000 -899.000000000000000000 NULL 0.100100100100100100 +2 24690.246000000000000000 0.000000000000000000 NULL 1.000000000000000000 +3 1234.223456789101100000 -1233.976543210898900000 NULL 0.000100037913541123 4 123456789123456790.123456789123456789 123456789123456787.876543210876543211 NULL 109890109097814272.043109406191131436 --- !query 24 +-- !query select id, a*10, b/10 from decimals_test order by id --- !query 24 schema +-- !query schema struct --- !query 24 output -1 1000 99.9 -2 123451.23 1234.5123 -3 1.234567891011 123.41 -4 1234567891234567890 0.1123456789123456789 +-- !query output +1 1000.000000000000000000 99.9000000000000000000 +2 123451.230000000000000000 1234.5123000000000000000 +3 1.234567891011000000 123.4100000000000000000 +4 
1234567891234567890.000000000000000000 0.1123456789123456789 --- !query 25 +-- !query select 10.3 * 3.0 --- !query 25 schema +-- !query schema struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> --- !query 25 output -30.9 +-- !query output +30.90 --- !query 26 +-- !query select 10.3000 * 3.0 --- !query 26 schema +-- !query schema struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> --- !query 26 output -30.9 +-- !query output +30.90000 --- !query 27 +-- !query select 10.30000 * 30.0 --- !query 27 schema +-- !query schema struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> --- !query 27 output -309 +-- !query output +309.000000 --- !query 28 +-- !query select 10.300000000000000000 * 3.000000000000000000 --- !query 28 schema +-- !query schema struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,36)> --- !query 28 output -30.9 +-- !query output +30.900000000000000000000000000000000000 --- !query 29 +-- !query select 10.300000000000000000 * 3.0000000000000000000 --- !query 29 schema +-- !query schema struct<(CAST(10.300000000000000000 AS DECIMAL(21,19)) * CAST(3.0000000000000000000 AS DECIMAL(21,19))):decimal(38,37)> --- !query 29 output +-- !query output NULL --- !query 30 +-- !query select 2.35E10 * 1.0 --- !query 30 schema -struct<(CAST(2.35E+10 AS DECIMAL(12,1)) * CAST(1.0 AS DECIMAL(12,1))):decimal(6,-7)> --- !query 30 output -23500000000 +-- !query schema +struct<(2.35E10 * CAST(1.0 AS DOUBLE)):double> +-- !query output +2.35E10 --- !query 31 -select (5e36 + 0.1) + 5e36 --- !query 31 schema -struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> --- !query 31 output +-- !query +select (5e36BD + 0.1) + 5e36BD +-- !query schema +struct<(CAST((CAST(5000000000000000000000000000000000000 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS 
DECIMAL(38,1)) + CAST(5000000000000000000000000000000000000 AS DECIMAL(38,1))):decimal(38,1)> +-- !query output NULL --- !query 32 -select (-4e36 - 0.1) - 7e36 --- !query 32 schema -struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> --- !query 32 output +-- !query +select (-4e36BD - 0.1) - 7e36BD +-- !query schema +struct<(CAST((CAST(-4000000000000000000000000000000000000 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7000000000000000000000000000000000000 AS DECIMAL(38,1))):decimal(38,1)> +-- !query output NULL --- !query 33 +-- !query select 12345678901234567890.0 * 12345678901234567890.0 --- !query 33 schema +-- !query schema struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> --- !query 33 output +-- !query output NULL --- !query 34 -select 1e35 / 0.1 --- !query 34 schema -struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> --- !query 34 output +-- !query +select 1e35BD / 0.1 +-- !query schema +struct<(CAST(100000000000000000000000000000000000 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> +-- !query output NULL --- !query 35 -select 1.2345678901234567890E30 * 1.2345678901234567890E25 --- !query 35 schema -struct<(CAST(1.2345678901234567890E+30 AS DECIMAL(25,-6)) * CAST(1.2345678901234567890E+25 AS DECIMAL(25,-6))):decimal(38,-17)> --- !query 35 output +-- !query +select 1.2345678901234567890E30BD * 1.2345678901234567890E25BD +-- !query schema +struct<(CAST(1234567890123456789000000000000 AS DECIMAL(31,0)) * CAST(12345678901234567890000000 AS DECIMAL(31,0))):decimal(38,0)> +-- !query output NULL --- !query 36 +-- !query select 12345678912345678912345678912.1234567 + 9999999999999999999999999999999.12345 --- !query 36 schema +-- !query schema struct<(CAST(12345678912345678912345678912.1234567 AS DECIMAL(38,7)) + CAST(9999999999999999999999999999999.12345 AS 
DECIMAL(38,7))):decimal(38,7)> --- !query 36 output +-- !query output NULL --- !query 37 +-- !query select 123456789123456789.1234567890 * 1.123456789123456789 --- !query 37 schema +-- !query schema struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> --- !query 37 output +-- !query output NULL --- !query 38 +-- !query select 12345678912345.123456789123 / 0.000000012345678 --- !query 38 schema +-- !query schema struct<(CAST(12345678912345.123456789123 AS DECIMAL(29,15)) / CAST(1.2345678E-8 AS DECIMAL(29,15))):decimal(38,18)> --- !query 38 output +-- !query output NULL --- !query 39 -set spark.sql.decimalOperations.nullOnOverflow=false --- !query 39 schema -struct --- !query 39 output -spark.sql.decimalOperations.nullOnOverflow false - - --- !query 40 -select id, a*10, b/10 from decimals_test order by id --- !query 40 schema -struct --- !query 40 output -1 1000 99.9 -2 123451.23 1234.5123 -3 1.234567891011 123.41 -4 1234567891234567890 0.1123456789123456789 - - --- !query 41 -select 10.3 * 3.0 --- !query 41 schema -struct<(CAST(10.3 AS DECIMAL(3,1)) * CAST(3.0 AS DECIMAL(3,1))):decimal(6,2)> --- !query 41 output -30.9 - - --- !query 42 -select 10.3000 * 3.0 --- !query 42 schema -struct<(CAST(10.3000 AS DECIMAL(6,4)) * CAST(3.0 AS DECIMAL(6,4))):decimal(9,5)> --- !query 42 output -30.9 - - --- !query 43 -select 10.30000 * 30.0 --- !query 43 schema -struct<(CAST(10.30000 AS DECIMAL(7,5)) * CAST(30.0 AS DECIMAL(7,5))):decimal(11,6)> --- !query 43 output -309 - - --- !query 44 -select 10.300000000000000000 * 3.000000000000000000 --- !query 44 schema -struct<(CAST(10.300000000000000000 AS DECIMAL(20,18)) * CAST(3.000000000000000000 AS DECIMAL(20,18))):decimal(38,36)> --- !query 44 output -30.9 - - --- !query 45 -select 10.300000000000000000 * 3.0000000000000000000 --- !query 45 schema -struct<> --- !query 45 output -java.lang.ArithmeticException 
-Decimal(expanded,30.900000000000000000000000000000000000,38,36}) cannot be represented as Decimal(38, 37). - - --- !query 46 -select (5e36 + 0.1) + 5e36 --- !query 46 schema -struct<> --- !query 46 output -java.lang.ArithmeticException -Decimal(expanded,10000000000000000000000000000000000000.1,39,1}) cannot be represented as Decimal(38, 1). - - --- !query 47 -select (-4e36 - 0.1) - 7e36 --- !query 47 schema -struct<> --- !query 47 output -java.lang.ArithmeticException -Decimal(expanded,-11000000000000000000000000000000000000.1,39,1}) cannot be represented as Decimal(38, 1). - - --- !query 48 -select 12345678901234567890.0 * 12345678901234567890.0 --- !query 48 schema -struct<> --- !query 48 output -java.lang.ArithmeticException -Decimal(expanded,1.5241578753238836750190519987501905210E+38,38,-1}) cannot be represented as Decimal(38, 2). - - --- !query 49 -select 1e35 / 0.1 --- !query 49 schema -struct<> --- !query 49 output -java.lang.ArithmeticException -Decimal(expanded,1000000000000000000000000000000000000,37,0}) cannot be represented as Decimal(38, 3). - - --- !query 50 -select 123456789123456789.1234567890 * 1.123456789123456789 --- !query 50 schema -struct<> --- !query 50 output -java.lang.ArithmeticException -Decimal(expanded,138698367904130467.65432098851562262075,38,20}) cannot be represented as Decimal(38, 28). - - --- !query 51 -select 123456789123456789.1234567890 * 1.123456789123456789 --- !query 51 schema -struct<> --- !query 51 output -java.lang.ArithmeticException -Decimal(expanded,138698367904130467.65432098851562262075,38,20}) cannot be represented as Decimal(38, 28). - - --- !query 52 -select 12345678912345.123456789123 / 0.000000012345678 --- !query 52 schema -struct<> --- !query 52 output -java.lang.ArithmeticException -Decimal(expanded,1000000073899961059796.7258663315210392,38,16}) cannot be represented as Decimal(38, 18). 
- - --- !query 53 +-- !query drop table decimals_test --- !query 53 schema +-- !query schema struct<> --- !query 53 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out index 17dd317f63b70..24927c34c57b4 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-part-after-analyze.sql.out @@ -2,47 +2,47 @@ -- Number of queries: 15 --- !query 0 +-- !query CREATE TABLE t (key STRING, value STRING, ds STRING, hr INT) USING parquet PARTITIONED BY (ds, hr) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO TABLE t PARTITION (ds='2017-08-01', hr=10) VALUES ('k1', 100), ('k2', 200), ('k3', 300) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO TABLE t PARTITION (ds='2017-08-01', hr=11) VALUES ('k1', 101), ('k2', 201), ('k3', 301), ('k4', 401) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO TABLE t PARTITION (ds='2017-09-01', hr=5) VALUES ('k1', 102), ('k2', 202) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=10) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output key string value string ds string @@ -56,27 +56,27 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=10] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=10 Created Time [not included in comparison] Last Access [not included in comparison] # Storage Information -Location [not included in 
comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 5 +-- !query ANALYZE TABLE t PARTITION (ds='2017-08-01', hr=10) COMPUTE STATISTICS --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=10) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output key string value string ds string @@ -90,28 +90,28 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=10] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=10 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 3 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 7 +-- !query ANALYZE TABLE t PARTITION (ds='2017-08-01') COMPUTE STATISTICS --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=10) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output key string value string ds string @@ -125,20 +125,20 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=10] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=10 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 3 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 9 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=11) 
--- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output key string value string ds string @@ -152,28 +152,28 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=11] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=11 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 4 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 10 +-- !query ANALYZE TABLE t PARTITION (ds, hr) COMPUTE STATISTICS --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=10) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output key string value string ds string @@ -187,20 +187,20 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=10] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=10 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=10 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 3 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 12 +-- !query DESC EXTENDED t PARTITION (ds='2017-08-01', hr=11) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output key string value string ds string @@ -214,20 +214,20 @@ hr int Database default Table t Partition Values [ds=2017-08-01, hr=11] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-08-01/hr=11 +Location [not included in 
comparison]/{warehouse_dir}/t/ds=2017-08-01/hr=11 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 4 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 13 +-- !query DESC EXTENDED t PARTITION (ds='2017-09-01', hr=5) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output key string value string ds string @@ -241,18 +241,18 @@ hr int Database default Table t Partition Values [ds=2017-09-01, hr=5] -Location [not included in comparison]sql/core/spark-warehouse/t/ds=2017-09-01/hr=5 +Location [not included in comparison]/{warehouse_dir}/t/ds=2017-09-01/hr=5 Created Time [not included in comparison] Last Access [not included in comparison] Partition Statistics [not included in comparison] bytes, 2 rows # Storage Information -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t --- !query 14 +-- !query DROP TABLE t --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out index e41534681dc91..6b16aba268f50 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-query.sql.out @@ -2,114 +2,114 @@ -- Number of queries: 19 --- !query 0 +-- !query CREATE table desc_temp1 (key int COMMENT 'column_comment', val string) USING PARQUET --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE table desc_temp2 (key int, val string) USING PARQUET --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query DESC SELECT key, 
key + 1 as plusone FROM desc_temp1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output key int column_comment plusone int --- !query 3 +-- !query DESC QUERY SELECT * FROM desc_temp2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output key int val string --- !query 4 +-- !query DESC SELECT key, COUNT(*) as count FROM desc_temp1 group by key --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output key int column_comment count bigint --- !query 5 +-- !query DESC SELECT 10.00D as col1 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output col1 double --- !query 6 +-- !query DESC QUERY SELECT key FROM desc_temp1 UNION ALL select CAST(1 AS DOUBLE) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output key double --- !query 7 +-- !query DESC QUERY VALUES(1.00D, 'hello') as tab1(col1, col2) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output col1 double col2 string --- !query 8 +-- !query DESC QUERY FROM desc_temp1 a SELECT * --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output key int column_comment val string --- !query 9 +-- !query DESC WITH s AS (SELECT 'hello' as col1) SELECT * FROM s --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output col1 string --- !query 10 +-- !query DESCRIBE QUERY WITH s AS (SELECT * from desc_temp1) SELECT * FROM s --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output key int column_comment val string --- !query 11 +-- !query DESCRIBE SELECT * FROM (FROM desc_temp2 select * select *) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output key int val string --- !query 12 +-- !query DESCRIBE INSERT INTO desc_temp1 values (1, 'val1') --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output 
org.apache.spark.sql.catalyst.parser.ParseException mismatched input 'desc_temp1' expecting {, '.'}(line 1, pos 21) @@ -119,11 +119,11 @@ DESCRIBE INSERT INTO desc_temp1 values (1, 'val1') ---------------------^^^ --- !query 13 +-- !query DESCRIBE INSERT INTO desc_temp1 SELECT * FROM desc_temp2 --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException mismatched input 'desc_temp1' expecting {, '.'}(line 1, pos 21) @@ -133,14 +133,14 @@ DESCRIBE INSERT INTO desc_temp1 SELECT * FROM desc_temp2 ---------------------^^^ --- !query 14 +-- !query DESCRIBE FROM desc_temp1 a insert into desc_temp1 select * insert into desc_temp2 select * --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException mismatched input 'insert' expecting {'MAP', 'REDUCE', 'SELECT'}(line 3, pos 5) @@ -153,21 +153,21 @@ DESCRIBE insert into desc_temp2 select * --- !query 15 +-- !query EXPLAIN DESC QUERY SELECT * FROM desc_temp2 WHERE key > 0 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output == Physical Plan == Execute DescribeQueryCommand +- DescribeQueryCommand SELECT * FROM desc_temp2 WHERE key > 0 --- !query 16 +-- !query EXPLAIN EXTENDED DESC WITH s AS (SELECT 'hello' as col1) SELECT * FROM s --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output == Parsed Logical Plan == DescribeQueryCommand WITH s AS (SELECT 'hello' as col1) SELECT * FROM s @@ -183,17 +183,17 @@ Execute DescribeQueryCommand +- DescribeQueryCommand WITH s AS (SELECT 'hello' as col1) SELECT * FROM s --- !query 17 +-- !query DROP TABLE desc_temp1 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query DROP TABLE desc_temp2 --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output diff --git 
a/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out index 7873085da5069..3029fa8e83077 100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-table-after-alter-table.sql.out @@ -2,19 +2,19 @@ -- Number of queries: 12 --- !query 0 +-- !query CREATE TABLE table_with_comment (a STRING, b INT, c STRING, d STRING) USING parquet COMMENT 'added' --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query DESC FORMATTED table_with_comment --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output a string b int c string @@ -29,22 +29,22 @@ Created By [not included in comparison] Type MANAGED Provider parquet Comment added -Location [not included in comparison]sql/core/spark-warehouse/table_with_comment +Location [not included in comparison]/{warehouse_dir}/table_with_comment --- !query 2 +-- !query ALTER TABLE table_with_comment SET TBLPROPERTIES("comment"= "modified comment", "type"= "parquet") --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query DESC FORMATTED table_with_comment --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output a string b int c string @@ -60,30 +60,30 @@ Type MANAGED Provider parquet Comment modified comment Table Properties [type=parquet] -Location [not included in comparison]sql/core/spark-warehouse/table_with_comment +Location [not included in comparison]/{warehouse_dir}/table_with_comment --- !query 4 +-- !query DROP TABLE table_with_comment --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TABLE table_comment (a STRING, b INT) USING parquet --- !query 5 schema +-- !query schema struct<> --- !query 
5 output +-- !query output --- !query 6 +-- !query DESC FORMATTED table_comment --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output a string b int @@ -95,22 +95,22 @@ Last Access [not included in comparison] Created By [not included in comparison] Type MANAGED Provider parquet -Location [not included in comparison]sql/core/spark-warehouse/table_comment +Location [not included in comparison]/{warehouse_dir}/table_comment --- !query 7 +-- !query ALTER TABLE table_comment SET TBLPROPERTIES(comment = "added comment") --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query DESC formatted table_comment --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output a string b int @@ -123,22 +123,22 @@ Created By [not included in comparison] Type MANAGED Provider parquet Comment added comment -Location [not included in comparison]sql/core/spark-warehouse/table_comment +Location [not included in comparison]/{warehouse_dir}/table_comment --- !query 9 +-- !query ALTER TABLE table_comment UNSET TBLPROPERTIES IF EXISTS ('comment') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query DESC FORMATTED table_comment --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output a string b int @@ -150,12 +150,12 @@ Last Access [not included in comparison] Created By [not included in comparison] Type MANAGED Provider parquet -Location [not included in comparison]sql/core/spark-warehouse/table_comment +Location [not included in comparison]/{warehouse_dir}/table_comment --- !query 11 +-- !query DROP TABLE table_comment --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out index 6ef8af6574e98..ae9240ec588da 
100644 --- a/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe-table-column.sql.out @@ -1,30 +1,30 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 23 +-- Number of queries: 28 --- !query 0 +-- !query CREATE TEMPORARY VIEW desc_col_temp_view (key int COMMENT 'column_comment') USING PARQUET --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query DESC desc_col_temp_view key --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output col_name key data_type int comment column_comment --- !query 2 +-- !query DESC EXTENDED desc_col_temp_view key --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output col_name key data_type int comment column_comment @@ -37,11 +37,11 @@ max_col_len NULL histogram NULL --- !query 3 +-- !query DESC FORMATTED desc_col_temp_view key --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output col_name key data_type int comment column_comment @@ -54,11 +54,11 @@ max_col_len NULL histogram NULL --- !query 4 +-- !query DESC FORMATTED desc_col_temp_view desc_col_temp_view.key --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output col_name key data_type int comment column_comment @@ -71,46 +71,46 @@ max_col_len NULL histogram NULL --- !query 5 +-- !query DESC desc_col_temp_view key1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Column key1 does not exist; --- !query 6 +-- !query CREATE TABLE desc_col_table (key int COMMENT 'column_comment') USING PARQUET --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- 
!query 8 +-- !query DESC desc_col_table key --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output col_name key data_type int comment column_comment --- !query 9 +-- !query DESC EXTENDED desc_col_table key --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output col_name key data_type int comment column_comment @@ -123,11 +123,11 @@ max_col_len 4 histogram NULL --- !query 10 +-- !query DESC FORMATTED desc_col_table key --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output col_name key data_type int comment column_comment @@ -140,19 +140,19 @@ max_col_len 4 histogram NULL --- !query 11 +-- !query CREATE TABLE desc_complex_col_table (`a.b` int, col struct) USING PARQUET --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query DESC FORMATTED desc_complex_col_table `a.b` --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output col_name a.b data_type int comment NULL @@ -165,11 +165,11 @@ max_col_len NULL histogram NULL --- !query 13 +-- !query DESC FORMATTED desc_complex_col_table col --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output col_name col data_type struct comment NULL @@ -182,52 +182,52 @@ max_col_len NULL histogram NULL --- !query 14 +-- !query DESC FORMATTED desc_complex_col_table col.x --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException DESC TABLE COLUMN command does not support nested data types: col.x; --- !query 15 +-- !query SET spark.sql.statistics.histogram.enabled=true --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output spark.sql.statistics.histogram.enabled true --- !query 16 +-- !query SET spark.sql.statistics.histogram.numBins=2 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 
spark.sql.statistics.histogram.numBins 2 --- !query 17 +-- !query INSERT INTO desc_col_table values 1, 2, 3, 4 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query ANALYZE TABLE desc_col_table COMPUTE STATISTICS FOR COLUMNS key --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query DESC EXTENDED desc_col_table key --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output col_name key data_type int comment column_comment @@ -242,25 +242,74 @@ bin_0 lower_bound: 1.0, upper_bound: 2.0, distinct_count: 2 bin_1 lower_bound: 2.0, upper_bound: 4.0, distinct_count: 2 --- !query 20 +-- !query DROP VIEW desc_col_temp_view --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query DROP TABLE desc_col_table --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query DROP TABLE desc_complex_col_table --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output + + + +-- !query +CREATE TABLE customer(CName STRING) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO customer VALUES('Maria') +-- !query schema +struct<> +-- !query output + + + +-- !query +ANALYZE TABLE customer COMPUTE STATISTICS FOR COLUMNS cname +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC EXTENDED customer cname +-- !query schema +struct +-- !query output +col_name cname +data_type string +comment NULL +min NULL +max NULL +num_nulls 0 +distinct_count 1 +avg_col_len 5 +max_col_len 5 +histogram NULL + + +-- !query +DROP TABLE customer +-- !query schema +struct<> +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/describe.sql.out b/sql/core/src/test/resources/sql-tests/results/describe.sql.out index f58bdb5446b64..697e006544acf 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/describe.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/describe.sql.out @@ -2,68 +2,68 @@ -- Number of queries: 41 --- !query 0 +-- !query CREATE TABLE t (a STRING, b INT, c STRING, d STRING) USING parquet OPTIONS (a '1', b '2') PARTITIONED BY (c, d) CLUSTERED BY (a) SORTED BY (b ASC) INTO 2 BUCKETS COMMENT 'table_comment' TBLPROPERTIES (t 'test') --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW temp_v AS SELECT * FROM t --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW temp_Data_Source_View USING org.apache.spark.sql.sources.DDLScanSource OPTIONS ( From '1', To '10', Table 'test1') --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE VIEW v AS SELECT * FROM t --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query ALTER TABLE t SET TBLPROPERTIES (e = '3') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query ALTER TABLE t ADD PARTITION (c='Us', d=1) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query DESCRIBE t --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output a string b int c string @@ -74,11 +74,11 @@ c string d string --- !query 7 +-- !query DESC default.t --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output a string b int c string @@ -89,11 +89,11 @@ c string d string --- !query 8 +-- !query DESC TABLE t --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output a string b int c string @@ -104,11 +104,11 @@ c string d string --- !query 9 +-- !query DESC FORMATTED t --- !query 9 schema +-- !query schema struct --- !query 9 output 
+-- !query output a string b int c string @@ -131,16 +131,16 @@ Bucket Columns [`a`] Sort Columns [`b`] Comment table_comment Table Properties [t=test, e=3] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] Partition Provider Catalog --- !query 10 +-- !query DESC EXTENDED t --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output a string b int c string @@ -163,24 +163,24 @@ Bucket Columns [`a`] Sort Columns [`b`] Comment table_comment Table Properties [t=test, e=3] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] Partition Provider Catalog --- !query 11 +-- !query ALTER TABLE t UNSET TBLPROPERTIES (e) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query DESC EXTENDED t --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output a string b int c string @@ -203,24 +203,24 @@ Bucket Columns [`a`] Sort Columns [`b`] Comment table_comment Table Properties [t=test] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] Partition Provider Catalog --- !query 13 +-- !query ALTER TABLE t UNSET TBLPROPERTIES (comment) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query DESC EXTENDED t --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output a string b int c string @@ -242,16 +242,16 @@ Num Buckets 2 Bucket Columns [`a`] Sort Columns [`b`] Table Properties [t=test] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] Partition Provider Catalog --- !query 15 +-- !query DESC t PARTITION 
(c='Us', d=1) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output a string b int c string @@ -262,11 +262,11 @@ c string d string --- !query 16 +-- !query DESC EXTENDED t PARTITION (c='Us', d=1) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output a string b int c string @@ -280,7 +280,7 @@ d string Database default Table t Partition Values [c=Us, d=1] -Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1 +Location [not included in comparison]/{warehouse_dir}/t/c=Us/d=1 Storage Properties [a=1, b=2] Created Time [not included in comparison] Last Access [not included in comparison] @@ -289,15 +289,15 @@ Last Access [not included in comparison] Num Buckets 2 Bucket Columns [`a`] Sort Columns [`b`] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] --- !query 17 +-- !query DESC FORMATTED t PARTITION (c='Us', d=1) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output a string b int c string @@ -311,7 +311,7 @@ d string Database default Table t Partition Values [c=Us, d=1] -Location [not included in comparison]sql/core/spark-warehouse/t/c=Us/d=1 +Location [not included in comparison]/{warehouse_dir}/t/c=Us/d=1 Storage Properties [a=1, b=2] Created Time [not included in comparison] Last Access [not included in comparison] @@ -320,35 +320,35 @@ Last Access [not included in comparison] Num Buckets 2 Bucket Columns [`a`] Sort Columns [`b`] -Location [not included in comparison]sql/core/spark-warehouse/t +Location [not included in comparison]/{warehouse_dir}/t Storage Properties [a=1, b=2] --- !query 18 +-- !query DESC t PARTITION (c='Us', d=2) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException Partition not found in table 't' database 'default': c -> Us d -> 2; 
--- !query 19 +-- !query DESC t PARTITION (c='Us') --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException Partition spec is invalid. The spec (c) must match the partition spec (c, d) defined in table '`default`.`t`'; --- !query 20 +-- !query DESC t PARTITION (c='Us', d) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException PARTITION specification is incomplete: `d`(line 1, pos 0) @@ -358,55 +358,55 @@ DESC t PARTITION (c='Us', d) ^^^ --- !query 21 +-- !query DESC temp_v --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output a string b int c string d string --- !query 22 +-- !query DESC TABLE temp_v --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output a string b int c string d string --- !query 23 +-- !query DESC FORMATTED temp_v --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output a string b int c string d string --- !query 24 +-- !query DESC EXTENDED temp_v --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output a string b int c string d string --- !query 25 +-- !query DESC temp_Data_Source_View --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output intType int test comment test1 stringType string dateType date @@ -425,42 +425,42 @@ arrayType array structType struct --- !query 26 +-- !query DESC temp_v PARTITION (c='Us', d=1) --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output org.apache.spark.sql.AnalysisException DESC PARTITION is not allowed on a temporary view: temp_v; --- !query 27 +-- !query DESC v --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output a string b int c string d string --- !query 28 +-- !query DESC TABLE v --- !query 28 schema +-- !query schema struct --- !query 28 output +-- 
!query output a string b int c string d string --- !query 29 +-- !query DESC FORMATTED v --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output a string b int c string @@ -475,16 +475,16 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t -View Default Database default +View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] -Table Properties [view.query.out.col.3=d, view.query.out.col.0=a, view.query.out.numCols=4, view.default.database=default, view.query.out.col.1=b, view.query.out.col.2=c] +Table Properties [view.query.out.col.3=d, view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.catalogAndNamespace.part.1=default] --- !query 30 +-- !query DESC EXTENDED v --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output a string b int c string @@ -499,47 +499,48 @@ Created By [not included in comparison] Type VIEW View Text SELECT * FROM t View Original Text SELECT * FROM t -View Default Database default +View Catalog and Namespace spark_catalog.default View Query Output Columns [a, b, c, d] -Table Properties [view.query.out.col.3=d, view.query.out.col.0=a, view.query.out.numCols=4, view.default.database=default, view.query.out.col.1=b, view.query.out.col.2=c] +Table Properties [view.query.out.col.3=d, view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=4, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=c, view.catalogAndNamespace.part.1=default] --- !query 31 +-- !query DESC v PARTITION (c='Us', d=1) --- !query 31 schema +-- !query schema struct<> --- !query 31 output +-- !query output org.apache.spark.sql.AnalysisException DESC PARTITION is not allowed on a view: v; --- !query 32 +-- !query EXPLAIN DESC t --- 
!query 32 schema +-- !query schema struct --- !query 32 output +-- !query output == Physical Plan == Execute DescribeTableCommand +- DescribeTableCommand `t`, false --- !query 33 +-- !query EXPLAIN DESC EXTENDED t --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output == Physical Plan == Execute DescribeTableCommand +- DescribeTableCommand `t`, true --- !query 34 +-- !query EXPLAIN EXTENDED DESC t --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output == Parsed Logical Plan == -'DescribeTableStatement [t], false +'DescribeRelation false ++- 'UnresolvedTableOrView [t] == Analyzed Logical Plan == col_name: string, data_type: string, comment: string @@ -553,53 +554,53 @@ Execute DescribeTableCommand +- DescribeTableCommand `t`, false --- !query 35 +-- !query EXPLAIN DESCRIBE t b --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output == Physical Plan == Execute DescribeColumnCommand +- DescribeColumnCommand `t`, [b], false --- !query 36 +-- !query EXPLAIN DESCRIBE t PARTITION (c='Us', d=2) --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output == Physical Plan == Execute DescribeTableCommand +- DescribeTableCommand `t`, Map(c -> Us, d -> 2), false --- !query 37 +-- !query DROP TABLE t --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output --- !query 38 +-- !query DROP VIEW temp_v --- !query 38 schema +-- !query schema struct<> --- !query 38 output +-- !query output --- !query 39 +-- !query DROP VIEW temp_Data_Source_View --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output --- !query 40 +-- !query DROP VIEW v --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/except-all.sql.out b/sql/core/src/test/resources/sql-tests/results/except-all.sql.out index 01091a2f751ce..601ff8f024214 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/except-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/except-all.sql.out @@ -2,25 +2,25 @@ -- Number of queries: 27 --- !query 0 +-- !query CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1), (2), (2), (3), (5), (5), (null) AS tab2(c1) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW tab3 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -28,13 +28,13 @@ CREATE TEMPORARY VIEW tab3 AS SELECT * FROM VALUES (2, 3), (2, 2) AS tab3(k, v) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TEMPORARY VIEW tab4 AS SELECT * FROM VALUES (1, 2), (2, 3), @@ -42,45 +42,45 @@ CREATE TEMPORARY VIEW tab4 AS SELECT * FROM VALUES (2, 2), (2, 20) AS tab4(k, v) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT * FROM tab1 EXCEPT ALL SELECT * FROM tab2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 0 2 2 NULL --- !query 5 +-- !query SELECT * FROM tab1 MINUS ALL SELECT * FROM tab2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0 2 2 NULL --- !query 6 +-- !query SELECT * FROM tab1 EXCEPT ALL SELECT * FROM tab2 WHERE c1 IS NOT NULL --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 0 2 2 @@ -88,23 +88,23 @@ NULL NULL --- !query 7 +-- !query SELECT * FROM tab1 WHERE c1 > 5 EXCEPT ALL SELECT * FROM tab2 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT * FROM tab1 EXCEPT ALL SELECT * FROM tab2 WHERE c1 > 6 --- !query 8 schema +-- !query 
schema struct --- !query 8 output +-- !query output 0 1 2 @@ -116,13 +116,13 @@ NULL NULL --- !query 9 +-- !query SELECT * FROM tab1 EXCEPT ALL SELECT CAST(1 AS BIGINT) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 0 2 2 @@ -133,65 +133,65 @@ NULL NULL --- !query 10 +-- !query SELECT * FROM tab1 EXCEPT ALL SELECT array(1) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException ExceptAll can only be performed on tables with the compatible column types. array <> int at the first column of the second table; --- !query 11 +-- !query SELECT * FROM tab3 EXCEPT ALL SELECT * FROM tab4 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 2 1 3 --- !query 12 +-- !query SELECT * FROM tab4 EXCEPT ALL SELECT * FROM tab3 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 2 2 2 20 --- !query 13 +-- !query SELECT * FROM tab4 EXCEPT ALL SELECT * FROM tab3 INTERSECT DISTINCT SELECT * FROM tab4 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 2 2 2 20 --- !query 14 +-- !query SELECT * FROM tab4 EXCEPT ALL SELECT * FROM tab3 EXCEPT DISTINCT SELECT * FROM tab4 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT * FROM tab3 EXCEPT ALL SELECT * FROM tab4 @@ -199,24 +199,24 @@ UNION ALL SELECT * FROM tab3 EXCEPT DISTINCT SELECT * FROM tab4 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 3 --- !query 16 +-- !query SELECT k FROM tab3 EXCEPT ALL SELECT k, v FROM tab4 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException ExceptAll can only be performed on tables with the same number of columns, but the first table has 1 columns and the second table has 2 columns; --- !query 17 +-- !query SELECT * FROM tab3 EXCEPT 
ALL SELECT * FROM tab4 @@ -224,13 +224,13 @@ UNION SELECT * FROM tab3 EXCEPT DISTINCT SELECT * FROM tab4 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 3 --- !query 18 +-- !query SELECT * FROM tab3 MINUS ALL SELECT * FROM tab4 @@ -238,13 +238,13 @@ UNION SELECT * FROM tab3 MINUS DISTINCT SELECT * FROM tab4 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 3 --- !query 19 +-- !query SELECT * FROM tab3 EXCEPT ALL SELECT * FROM tab4 @@ -252,13 +252,13 @@ EXCEPT DISTINCT SELECT * FROM tab3 EXCEPT DISTINCT SELECT * FROM tab4 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT * FROM (SELECT tab3.k, tab4.v @@ -272,13 +272,13 @@ FROM (SELECT tab3.k, FROM tab3 JOIN tab4 ON tab3.k = tab4.k) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output --- !query 21 +-- !query SELECT * FROM (SELECT tab3.k, tab4.v @@ -292,9 +292,9 @@ FROM (SELECT tab4.v AS k, FROM tab3 JOIN tab4 ON tab3.k = tab4.k) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 2 1 2 1 2 @@ -304,43 +304,43 @@ struct 2 3 --- !query 22 +-- !query SELECT v FROM tab3 GROUP BY v EXCEPT ALL SELECT k FROM tab4 GROUP BY k --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output 3 --- !query 23 +-- !query DROP VIEW IF EXISTS tab1 --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query DROP VIEW IF EXISTS tab2 --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 +-- !query DROP VIEW IF EXISTS tab3 --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query DROP VIEW IF EXISTS tab4 --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output diff --git 
a/sql/core/src/test/resources/sql-tests/results/except.sql.out b/sql/core/src/test/resources/sql-tests/results/except.sql.out index c9b712d4d2949..62d695219d01d 100644 --- a/sql/core/src/test/resources/sql-tests/results/except.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/except.sql.out @@ -2,20 +2,20 @@ -- Number of queries: 9 --- !query 0 +-- !query create temporary view t1 as select * from values ("one", 1), ("two", 2), ("three", 3), ("one", NULL) as t1(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("one", 1), ("two", 22), @@ -23,71 +23,71 @@ create temporary view t2 as select * from values ("one", NULL), (NULL, 5) as t2(k, v) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM t1 EXCEPT SELECT * FROM t2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output three 3 two 2 --- !query 3 +-- !query SELECT * FROM t1 EXCEPT SELECT * FROM t1 where v <> 1 and v <> 2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 one NULL two 2 --- !query 4 +-- !query SELECT * FROM t1 where v <> 1 and v <> 22 EXCEPT SELECT * FROM t1 where v <> 2 and v >= 3 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output two 2 --- !query 5 +-- !query SELECT t1.* FROM t1, t2 where t1.k = t2.k EXCEPT SELECT t1.* FROM t1, t2 where t1.k = t2.k and t1.k != 'one' --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one 1 one NULL --- !query 6 +-- !query SELECT * FROM t2 where v >= 1 and v <> 22 EXCEPT SELECT * FROM t1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL 5 one 5 --- !query 7 +-- !query SELECT (SELECT min(k) FROM t2 WHERE t2.k = t1.k) min_t2 FROM t1 MINUS SELECT (SELECT min(k) FROM t2) abs_min_t2 FROM t1 WHERE t1.k = 'one' --- 
!query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL two --- !query 8 +-- !query SELECT t1.k FROM t1 WHERE t1.v <= (SELECT max(t2.v) @@ -99,7 +99,7 @@ FROM t1 WHERE t1.v >= (SELECT min(t2.v) FROM t2 WHERE t2.k = t1.k) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output two diff --git a/sql/core/src/test/resources/sql-tests/results/explain.sql.out b/sql/core/src/test/resources/sql-tests/results/explain.sql.out index 4a08cfada292d..bc28d7f87bf00 100644 --- a/sql/core/src/test/resources/sql-tests/results/explain.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/explain.sql.out @@ -1,49 +1,57 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 18 +-- Number of queries: 22 --- !query 0 +-- !query CREATE table explain_temp1 (key int, val int) USING PARQUET --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE table explain_temp2 (key int, val int) USING PARQUET --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE table explain_temp3 (key int, val int) USING PARQUET --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query +CREATE table explain_temp4 (key int, val string) USING PARQUET +-- !query schema +struct<> +-- !query output + + + +-- !query SET spark.sql.codegen.wholeStage = true --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output spark.sql.codegen.wholeStage true --- !query 4 +-- !query EXPLAIN FORMATTED SELECT key, max(val) FROM explain_temp1 WHERE key > 0 GROUP BY key ORDER BY key --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output == Physical Plan == * Sort (9) +- Exchange (8) @@ -58,6 +66,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in 
comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -72,12 +84,20 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 2] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (8) Exchange Input: [key#x, max(val)#x] @@ -86,16 +106,16 @@ Input: [key#x, max(val)#x] Input: [key#x, max(val)#x] --- !query 5 +-- !query EXPLAIN FORMATTED SELECT key, max(val) FROM explain_temp1 WHERE key > 0 GROUP BY key HAVING max(val) > 0 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output == Physical Plan == * Project (9) +- * Filter (8) @@ -110,6 +130,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -124,12 +148,20 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 2] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x, max(val#x)#x AS max(val#x)#x] (8) Filter [codegen id : 2] Input : [key#x, max(val)#x, max(val#x)#x] @@ -140,14 +172,14 @@ Output : [key#x, max(val)#x] Input : [key#x, max(val)#x, max(val#x)#x] --- !query 6 +-- !query EXPLAIN FORMATTED SELECT key, val FROM explain_temp1 WHERE key > 0 UNION SELECT key, 
val FROM explain_temp1 WHERE key > 0 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output == Physical Plan == * HashAggregate (12) +- Exchange (11) @@ -165,6 +197,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -179,6 +215,10 @@ Input : [key#x, val#x] (5) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,0)] +ReadSchema: struct (6) ColumnarToRow [codegen id : 2] Input: [key#x, val#x] @@ -195,23 +235,31 @@ Input : [key#x, val#x] (10) HashAggregate [codegen id : 3] Input: [key#x, val#x] +Keys: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results: [key#x, val#x] (11) Exchange Input: [key#x, val#x] (12) HashAggregate [codegen id : 4] Input: [key#x, val#x] +Keys: [key#x, val#x] +Functions: [] +Aggregate Attributes: [] +Results: [key#x, val#x] --- !query 7 +-- !query EXPLAIN FORMATTED SELECT * FROM explain_temp1 a, explain_temp2 b WHERE a.key = b.key --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output == Physical Plan == * BroadcastHashJoin Inner BuildRight (10) :- * Project (4) @@ -227,6 +275,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 2] Input: [key#x, val#x] @@ -241,6 +293,10 @@ Input : [key#x, val#x] (5) Scan parquet default.explain_temp2 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct (6) ColumnarToRow 
[codegen id : 1] Input: [key#x, val#x] @@ -262,15 +318,15 @@ Right keys: List(key#x) Join condition: None --- !query 8 +-- !query EXPLAIN FORMATTED SELECT * FROM explain_temp1 a LEFT OUTER JOIN explain_temp2 b ON a.key = b.key --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output == Physical Plan == * BroadcastHashJoin LeftOuter BuildRight (8) :- * ColumnarToRow (2) @@ -284,12 +340,19 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct (2) ColumnarToRow [codegen id : 2] Input: [key#x, val#x] (3) Scan parquet default.explain_temp2 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(key)] +ReadSchema: struct (4) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -311,7 +374,7 @@ Right keys: List(key#x) Join condition: None --- !query 9 +-- !query EXPLAIN FORMATTED SELECT * FROM explain_temp1 @@ -322,9 +385,9 @@ EXPLAIN FORMATTED WHERE val > 0) AND val = 2) AND val > 3 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output == Physical Plan == * Project (4) +- * Filter (3) @@ -334,6 +397,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), IsNotNull(val), GreaterThan(val,3)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -360,6 +427,10 @@ Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery (5) Scan parquet default.explain_temp2 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(key), IsNotNull(val), EqualTo(val,2)] +ReadSchema: struct (6) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -374,12 +445,20 @@ Input : 
[key#x, val#x] (9) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (10) Exchange Input: [max#x] (11) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] Subquery:2 Hosting operator id = 7 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (18) @@ -393,6 +472,10 @@ Subquery:2 Hosting operator id = 7 Hosting Expression = Subquery scalar-subquery (12) Scan parquet default.explain_temp3 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp3] +PushedFilters: [IsNotNull(val), GreaterThan(val,0)] +ReadSchema: struct (13) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -407,15 +490,23 @@ Input : [key#x, val#x] (16) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (17) Exchange Input: [max#x] (18) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] --- !query 10 +-- !query EXPLAIN FORMATTED SELECT * FROM explain_temp1 @@ -423,12 +514,12 @@ EXPLAIN FORMATTED FROM explain_temp2 WHERE val > 0) OR - key = (SELECT max(key) + key = (SELECT avg(key) FROM explain_temp3 WHERE val > 0) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output == Physical Plan == * Filter (3) +- * ColumnarToRow (2) @@ -437,13 +528,16 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] (3) Filter [codegen id : 1] Input : [key#x, val#x] -Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (key#x = Subquery scalar-subquery#x, 
[id=#x])) +Condition : ((key#x = Subquery scalar-subquery#x, [id=#x]) OR (cast(key#x as double) = Subquery scalar-subquery#x, [id=#x])) ===== Subqueries ===== @@ -459,6 +553,10 @@ Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery (4) Scan parquet default.explain_temp2 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp2] +PushedFilters: [IsNotNull(val), GreaterThan(val,0)] +ReadSchema: struct (5) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -473,12 +571,20 @@ Input : [key#x, val#x] (8) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_max(key#x)] +Aggregate Attributes: [max#x] +Results: [max#x] (9) Exchange Input: [max#x] (10) HashAggregate [codegen id : 2] Input: [max#x] +Keys: [] +Functions: [max(key#x)] +Aggregate Attributes: [max(key#x)#x] +Results: [max(key#x)#x AS max(key)#x] Subquery:2 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery#x, [id=#x] * HashAggregate (17) @@ -492,6 +598,10 @@ Subquery:2 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery (11) Scan parquet default.explain_temp3 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp3] +PushedFilters: [IsNotNull(val), GreaterThan(val,0)] +ReadSchema: struct (12) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -506,21 +616,29 @@ Input : [key#x, val#x] (15) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes: [sum#x, count#xL] +Results: [sum#x, count#xL] (16) Exchange -Input: [max#x] +Input: [sum#x, count#xL] (17) HashAggregate [codegen id : 2] -Input: [max#x] +Input: [sum#x, count#xL] +Keys: [] +Functions: [avg(cast(key#x as bigint))] +Aggregate Attributes: [avg(cast(key#x as bigint))#x] +Results: [avg(cast(key#x as bigint))#x AS avg(key)#x] --- !query 11 +-- !query EXPLAIN FORMATTED SELECT (SELECT Avg(key) FROM 
explain_temp1) + (SELECT Avg(key) FROM explain_temp1) FROM explain_temp1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output == Physical Plan == * Project (3) +- * ColumnarToRow (2) @@ -529,6 +647,9 @@ struct (1) Scan parquet default.explain_temp1 Output: [] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct<> (2) ColumnarToRow [codegen id : 1] Input: [] @@ -549,23 +670,34 @@ Subquery:1 Hosting operator id = 3 Hosting Expression = Subquery scalar-subquery (4) Scan parquet default.explain_temp1 Output: [key#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct (5) ColumnarToRow [codegen id : 1] Input: [key#x] (6) HashAggregate [codegen id : 1] Input: [key#x] +Keys: [] +Functions: [partial_avg(cast(key#x as bigint))] +Aggregate Attributes: [sum#x, count#xL] +Results: [sum#x, count#xL] (7) Exchange Input: [sum#x, count#xL] (8) HashAggregate [codegen id : 2] Input: [sum#x, count#xL] +Keys: [] +Functions: [avg(cast(key#x as bigint))] +Aggregate Attributes: [avg(cast(key#x as bigint))#x] +Results: [avg(cast(key#x as bigint))#x AS avg(key)#x] Subquery:2 Hosting operator id = 3 Hosting Expression = ReusedSubquery Subquery scalar-subquery#x, [id=#x] --- !query 12 +-- !query EXPLAIN FORMATTED WITH cte1 AS ( SELECT * @@ -573,9 +705,9 @@ EXPLAIN FORMATTED WHERE key > 10 ) SELECT * FROM cte1 a, cte1 b WHERE a.key = b.key --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output == Physical Plan == * BroadcastHashJoin Inner BuildRight (10) :- * Project (4) @@ -591,6 +723,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 2] Input: [key#x, val#x] @@ -605,6 +741,10 @@ Input : [key#x, val#x] (5) Scan 
parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct (6) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -626,7 +766,7 @@ Right keys: List(key#x) Join condition: None --- !query 13 +-- !query EXPLAIN FORMATTED WITH cte1 AS ( SELECT key, max(val) @@ -635,9 +775,9 @@ EXPLAIN FORMATTED GROUP BY key ) SELECT * FROM cte1 a, cte1 b WHERE a.key = b.key --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output == Physical Plan == * BroadcastHashJoin Inner BuildRight (11) :- * HashAggregate (7) @@ -654,6 +794,10 @@ struct (1) Scan parquet default.explain_temp1 Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +PushedFilters: [IsNotNull(key), GreaterThan(key,10)] +ReadSchema: struct (2) ColumnarToRow [codegen id : 1] Input: [key#x, val#x] @@ -668,18 +812,30 @@ Input : [key#x, val#x] (5) HashAggregate [codegen id : 1] Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_max(val#x)] +Aggregate Attributes: [max#x] +Results: [key#x, max#x] (6) Exchange Input: [key#x, max#x] (7) HashAggregate [codegen id : 4] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (8) ReusedExchange [Reuses operator id: 6] Output : ArrayBuffer(key#x, max#x) (9) HashAggregate [codegen id : 3] Input: [key#x, max#x] +Keys: [key#x] +Functions: [max(val#x)] +Aggregate Attributes: [max(val#x)#x] +Results: [key#x, max(val#x)#x AS max(val)#x] (10) BroadcastExchange Input: [key#x, max(val)#x] @@ -690,13 +846,13 @@ Right keys: List(key#x) Join condition: None --- !query 14 +-- !query EXPLAIN FORMATTED CREATE VIEW explain_view AS SELECT key, val FROM explain_temp1 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output == Physical Plan == 
Execute CreateViewCommand (1) +- CreateViewCommand (2) @@ -714,25 +870,163 @@ Output: [] (4) Project --- !query 15 +-- !query +EXPLAIN FORMATTED + SELECT + COUNT(val) + SUM(key) as TOTAL, + COUNT(key) FILTER (WHERE val > 1) + FROM explain_temp1 +-- !query schema +struct +-- !query output +== Physical Plan == +* HashAggregate (5) ++- Exchange (4) + +- HashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp1 (1) + + +(1) Scan parquet default.explain_temp1 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp1] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) HashAggregate +Input: [key#x, val#x] +Keys: [] +Functions: [partial_count(val#x), partial_sum(cast(key#x as bigint)), partial_count(key#x) FILTER (WHERE (val#x > 1))] +Aggregate Attributes: [count#xL, sum#xL, count#xL] +Results: [count#xL, sum#xL, count#xL] + +(4) Exchange +Input: [count#xL, sum#xL, count#xL] + +(5) HashAggregate [codegen id : 2] +Input: [count#xL, sum#xL, count#xL] +Keys: [] +Functions: [count(val#x), sum(cast(key#x as bigint)), count(key#x)] +Aggregate Attributes: [count(val#x)#xL, sum(cast(key#x as bigint))#xL, count(key#x)#xL] +Results: [(count(val#x)#xL + sum(cast(key#x as bigint))#xL) AS TOTAL#xL, count(key#x)#xL AS count(key) FILTER (WHERE (val > 1))#xL] + + +-- !query +EXPLAIN FORMATTED + SELECT key, sort_array(collect_set(val))[0] + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +ObjectHashAggregate (5) ++- Exchange (4) + +- ObjectHashAggregate (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) ObjectHashAggregate +Input: [key#x, val#x] +Keys: [key#x] +Functions: 
[partial_collect_set(val#x, 0, 0)] +Aggregate Attributes: [buf#x] +Results: [key#x, buf#x] + +(4) Exchange +Input: [key#x, buf#x] + +(5) ObjectHashAggregate +Input: [key#x, buf#x] +Keys: [key#x] +Functions: [collect_set(val#x, 0, 0)] +Aggregate Attributes: [collect_set(val#x, 0, 0)#x] +Results: [key#x, sort_array(collect_set(val#x, 0, 0)#x, true)[0] AS sort_array(collect_set(val), true)[0]#x] + + +-- !query +EXPLAIN FORMATTED + SELECT key, MIN(val) + FROM explain_temp4 + GROUP BY key +-- !query schema +struct +-- !query output +== Physical Plan == +SortAggregate (7) ++- * Sort (6) + +- Exchange (5) + +- SortAggregate (4) + +- * Sort (3) + +- * ColumnarToRow (2) + +- Scan parquet default.explain_temp4 (1) + + +(1) Scan parquet default.explain_temp4 +Output: [key#x, val#x] +Batched: true +Location [not included in comparison]/{warehouse_dir}/explain_temp4] +ReadSchema: struct + +(2) ColumnarToRow [codegen id : 1] +Input: [key#x, val#x] + +(3) Sort [codegen id : 1] +Input: [key#x, val#x] + +(4) SortAggregate +Input: [key#x, val#x] +Keys: [key#x] +Functions: [partial_min(val#x)] +Aggregate Attributes: [min#x] +Results: [key#x, min#x] + +(5) Exchange +Input: [key#x, min#x] + +(6) Sort [codegen id : 2] +Input: [key#x, min#x] + +(7) SortAggregate +Input: [key#x, min#x] +Keys: [key#x] +Functions: [min(val#x)] +Aggregate Attributes: [min(val#x)#x] +Results: [key#x, min(val#x)#x AS min(val)#x] + + +-- !query DROP TABLE explain_temp1 --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query DROP TABLE explain_temp2 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output --- !query 17 +-- !query DROP TABLE explain_temp3 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out index b02dfe054344b..583459f9037b8 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/extract.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out @@ -2,518 +2,518 @@ -- Number of queries: 64 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS select '2011-05-06 07:08:09.1234567' as c --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select extract(millennium from c) from t --- !query 1 schema -struct --- !query 1 output +-- !query schema +struct +-- !query output 3 --- !query 2 +-- !query select extract(millennia from c) from t --- !query 2 schema -struct --- !query 2 output +-- !query schema +struct +-- !query output 3 --- !query 3 +-- !query select extract(mil from c) from t --- !query 3 schema -struct --- !query 3 output +-- !query schema +struct +-- !query output 3 --- !query 4 +-- !query select extract(mils from c) from t --- !query 4 schema -struct --- !query 4 output +-- !query schema +struct +-- !query output 3 --- !query 5 +-- !query select extract(century from c) from t --- !query 5 schema -struct --- !query 5 output +-- !query schema +struct +-- !query output 21 --- !query 6 +-- !query select extract(centuries from c) from t --- !query 6 schema -struct --- !query 6 output +-- !query schema +struct +-- !query output 21 --- !query 7 +-- !query select extract(c from c) from t --- !query 7 schema -struct --- !query 7 output +-- !query schema +struct +-- !query output 21 --- !query 8 +-- !query select extract(cent from c) from t --- !query 8 schema -struct --- !query 8 output +-- !query schema +struct +-- !query output 21 --- !query 9 +-- !query select extract(decade from c) from t --- !query 9 schema -struct --- !query 9 output +-- !query schema +struct +-- !query output 201 --- !query 10 +-- !query select extract(decades from c) from t --- !query 10 schema -struct --- !query 10 output +-- !query schema +struct +-- !query output 201 --- !query 11 +-- !query select extract(dec from c) from t --- !query 11 schema 
-struct --- !query 11 output +-- !query schema +struct +-- !query output 201 --- !query 12 +-- !query select extract(decs from c) from t --- !query 12 schema -struct --- !query 12 output +-- !query schema +struct +-- !query output 201 --- !query 13 +-- !query select extract(year from c) from t --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output 2011 --- !query 14 +-- !query select extract(y from c) from t --- !query 14 schema -struct --- !query 14 output +-- !query schema +struct +-- !query output 2011 --- !query 15 +-- !query select extract(years from c) from t --- !query 15 schema -struct --- !query 15 output +-- !query schema +struct +-- !query output 2011 --- !query 16 +-- !query select extract(yr from c) from t --- !query 16 schema -struct --- !query 16 output +-- !query schema +struct +-- !query output 2011 --- !query 17 +-- !query select extract(yrs from c) from t --- !query 17 schema -struct --- !query 17 output +-- !query schema +struct +-- !query output 2011 --- !query 18 +-- !query select extract(isoyear from c) from t --- !query 18 schema -struct --- !query 18 output +-- !query schema +struct +-- !query output 2011 --- !query 19 +-- !query select extract(quarter from c) from t --- !query 19 schema -struct --- !query 19 output +-- !query schema +struct +-- !query output 2 --- !query 20 +-- !query select extract(qtr from c) from t --- !query 20 schema -struct --- !query 20 output +-- !query schema +struct +-- !query output 2 --- !query 21 +-- !query select extract(month from c) from t --- !query 21 schema -struct --- !query 21 output +-- !query schema +struct +-- !query output 5 --- !query 22 +-- !query select extract(mon from c) from t --- !query 22 schema -struct --- !query 22 output +-- !query schema +struct +-- !query output 5 --- !query 23 +-- !query select extract(mons from c) from t --- !query 23 schema -struct --- !query 23 output +-- !query schema +struct +-- !query output 5 --- !query 24 +-- !query 
select extract(months from c) from t --- !query 24 schema -struct --- !query 24 output +-- !query schema +struct +-- !query output 5 --- !query 25 +-- !query select extract(week from c) from t --- !query 25 schema -struct --- !query 25 output +-- !query schema +struct +-- !query output 18 --- !query 26 +-- !query select extract(w from c) from t --- !query 26 schema -struct --- !query 26 output +-- !query schema +struct +-- !query output 18 --- !query 27 +-- !query select extract(weeks from c) from t --- !query 27 schema -struct --- !query 27 output +-- !query schema +struct +-- !query output 18 --- !query 28 +-- !query select extract(day from c) from t --- !query 28 schema -struct --- !query 28 output +-- !query schema +struct +-- !query output 6 --- !query 29 +-- !query select extract(d from c) from t --- !query 29 schema -struct --- !query 29 output +-- !query schema +struct +-- !query output 6 --- !query 30 +-- !query select extract(days from c) from t --- !query 30 schema -struct --- !query 30 output +-- !query schema +struct +-- !query output 6 --- !query 31 +-- !query select extract(dayofweek from c) from t --- !query 31 schema -struct --- !query 31 output +-- !query schema +struct +-- !query output 6 --- !query 32 +-- !query select extract(dow from c) from t --- !query 32 schema -struct<(dayofweek(CAST(c AS DATE)) - 1):int> --- !query 32 output +-- !query schema +struct +-- !query output 5 --- !query 33 +-- !query select extract(isodow from c) from t --- !query 33 schema -struct<(weekday(CAST(c AS DATE)) + 1):int> --- !query 33 output +-- !query schema +struct +-- !query output 5 --- !query 34 +-- !query select extract(doy from c) from t --- !query 34 schema -struct --- !query 34 output +-- !query schema +struct +-- !query output 126 --- !query 35 +-- !query select extract(hour from c) from t --- !query 35 schema -struct --- !query 35 output +-- !query schema +struct +-- !query output 7 --- !query 36 +-- !query select extract(h from c) from t --- !query 36 
schema -struct --- !query 36 output +-- !query schema +struct +-- !query output 7 --- !query 37 +-- !query select extract(hours from c) from t --- !query 37 schema -struct --- !query 37 output +-- !query schema +struct +-- !query output 7 --- !query 38 +-- !query select extract(hr from c) from t --- !query 38 schema -struct --- !query 38 output +-- !query schema +struct +-- !query output 7 --- !query 39 +-- !query select extract(hrs from c) from t --- !query 39 schema -struct --- !query 39 output +-- !query schema +struct +-- !query output 7 --- !query 40 +-- !query select extract(minute from c) from t --- !query 40 schema -struct --- !query 40 output +-- !query schema +struct +-- !query output 8 --- !query 41 +-- !query select extract(m from c) from t --- !query 41 schema -struct --- !query 41 output +-- !query schema +struct +-- !query output 8 --- !query 42 +-- !query select extract(min from c) from t --- !query 42 schema -struct --- !query 42 output +-- !query schema +struct +-- !query output 8 --- !query 43 +-- !query select extract(mins from c) from t --- !query 43 schema -struct --- !query 43 output +-- !query schema +struct +-- !query output 8 --- !query 44 +-- !query select extract(minutes from c) from t --- !query 44 schema -struct --- !query 44 output +-- !query schema +struct +-- !query output 8 --- !query 45 +-- !query select extract(second from c) from t --- !query 45 schema -struct --- !query 45 output -9 +-- !query schema +struct +-- !query output +9.123456 --- !query 46 +-- !query select extract(s from c) from t --- !query 46 schema -struct --- !query 46 output -9 +-- !query schema +struct +-- !query output +9.123456 --- !query 47 +-- !query select extract(sec from c) from t --- !query 47 schema -struct --- !query 47 output -9 +-- !query schema +struct +-- !query output +9.123456 --- !query 48 +-- !query select extract(seconds from c) from t --- !query 48 schema -struct --- !query 48 output -9 +-- !query schema +struct +-- !query output +9.123456 
--- !query 49 +-- !query select extract(secs from c) from t --- !query 49 schema -struct --- !query 49 output -9 +-- !query schema +struct +-- !query output +9.123456 --- !query 50 +-- !query select extract(milliseconds from c) from t --- !query 50 schema -struct --- !query 50 output +-- !query schema +struct +-- !query output 9123.456 --- !query 51 +-- !query select extract(msec from c) from t --- !query 51 schema -struct --- !query 51 output +-- !query schema +struct +-- !query output 9123.456 --- !query 52 +-- !query select extract(msecs from c) from t --- !query 52 schema -struct --- !query 52 output +-- !query schema +struct +-- !query output 9123.456 --- !query 53 +-- !query select extract(millisecon from c) from t --- !query 53 schema -struct --- !query 53 output +-- !query schema +struct +-- !query output 9123.456 --- !query 54 +-- !query select extract(mseconds from c) from t --- !query 54 schema -struct --- !query 54 output +-- !query schema +struct +-- !query output 9123.456 --- !query 55 +-- !query select extract(ms from c) from t --- !query 55 schema -struct --- !query 55 output +-- !query schema +struct +-- !query output 9123.456 --- !query 56 +-- !query select extract(microseconds from c) from t --- !query 56 schema -struct --- !query 56 output +-- !query schema +struct +-- !query output 9123456 --- !query 57 +-- !query select extract(usec from c) from t --- !query 57 schema -struct --- !query 57 output +-- !query schema +struct +-- !query output 9123456 --- !query 58 +-- !query select extract(usecs from c) from t --- !query 58 schema -struct --- !query 58 output +-- !query schema +struct +-- !query output 9123456 --- !query 59 +-- !query select extract(useconds from c) from t --- !query 59 schema -struct --- !query 59 output +-- !query schema +struct +-- !query output 9123456 --- !query 60 +-- !query select extract(microsecon from c) from t --- !query 60 schema -struct --- !query 60 output +-- !query schema +struct +-- !query output 9123456 --- 
!query 61 +-- !query select extract(us from c) from t --- !query 61 schema -struct --- !query 61 output +-- !query schema +struct +-- !query output 9123456 --- !query 62 +-- !query select extract(epoch from c) from t --- !query 62 schema -struct --- !query 62 output +-- !query schema +struct +-- !query output 1304665689.123456 --- !query 63 +-- !query select extract(not_supported from c) from t --- !query 63 schema +-- !query schema struct<> --- !query 63 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException -Literals of type 'NOT_SUPPORTED' are currently not supported.(line 1, pos 7) +Literals of type 'not_supported' are currently not supported.(line 1, pos 7) == SQL == select extract(not_supported from c) from t diff --git a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out index 3439a05727f95..4584b823a6e70 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-analytics.sql.out @@ -2,21 +2,21 @@ -- Number of queries: 29 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH CUBE --- !query 1 schema +-- !query schema struct<(a + b):int,b:int,sum((a - b)):bigint> --- !query 1 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -32,11 +32,11 @@ NULL 2 0 NULL NULL 3 --- !query 2 +-- !query SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH CUBE --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 1 1 1 2 2 1 NULL 3 @@ -51,11 +51,11 @@ NULL 2 6 NULL NULL 9 --- !query 3 +-- !query SELECT a + b, b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP --- !query 3 schema 
+-- !query schema struct<(a + b):int,b:int,sum((a - b)):bigint> --- !query 3 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -69,11 +69,11 @@ struct<(a + b):int,b:int,sum((a - b)):bigint> NULL NULL 3 --- !query 4 +-- !query SELECT a, b, SUM(b) FROM testData GROUP BY a, b WITH ROLLUP --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 1 1 1 2 2 1 NULL 3 @@ -86,21 +86,21 @@ struct NULL NULL 9 --- !query 5 +-- !query CREATE OR REPLACE TEMPORARY VIEW courseSales AS SELECT * FROM VALUES ("dotNET", 2012, 10000), ("Java", 2012, 20000), ("dotNET", 2012, 5000), ("dotNET", 2013, 48000), ("Java", 2013, 30000) AS courseSales(course, year, earnings) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY course, year --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL NULL 113000 Java NULL 50000 Java 2012 20000 @@ -110,11 +110,11 @@ dotNET 2012 15000 dotNET 2013 48000 --- !query 7 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, year --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL NULL 113000 NULL 2012 35000 NULL 2013 78000 @@ -126,41 +126,41 @@ dotNET 2012 15000 dotNET 2013 48000 --- !query 8 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output Java NULL 50000 NULL 2012 35000 NULL 2013 78000 dotNET NULL 63000 --- !query 9 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output Java NULL 50000 dotNET NULL 63000 --- !query 10 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY 
course, year GROUPING SETS(year) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL 2012 35000 NULL 2013 78000 --- !query 11 +-- !query SELECT course, SUM(earnings) AS sum FROM courseSales GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output NULL 113000 Java 20000 Java 30000 @@ -171,12 +171,12 @@ dotNET 48000 dotNET 63000 --- !query 12 +-- !query SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, sum --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output NULL 113000 3 Java 20000 0 Java 30000 0 @@ -187,12 +187,12 @@ dotNET 48000 0 dotNET 63000 1 --- !query 13 +-- !query SELECT course, year, GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output Java 2012 0 0 0 Java 2013 0 0 0 Java NULL 0 1 1 @@ -204,29 +204,29 @@ dotNET 2013 0 0 0 dotNET NULL 0 1 1 --- !query 14 +-- !query SELECT course, year, GROUPING(course) FROM courseSales GROUP BY course, year --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException grouping() can only be used with GroupingSets/Cube/Rollup; --- !query 15 +-- !query SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY course, year --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 16 +-- !query SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year --- !query 16 schema +-- !query schema struct --- !query 16 
output +-- !query output Java 2012 0 Java 2013 0 dotNET 2012 0 @@ -238,40 +238,40 @@ NULL 2013 2 NULL NULL 3 --- !query 17 +-- !query SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, year --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output NULL NULL Java NULL dotNET NULL --- !query 18 +-- !query SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING(course) > 0 --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 19 +-- !query SELECT course, year FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0 --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 20 +-- !query SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output Java NULL NULL 2012 NULL 2013 @@ -279,12 +279,12 @@ NULL NULL dotNET NULL --- !query 21 +-- !query SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) ORDER BY GROUPING(course), GROUPING(year), course, year --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output Java 2012 0 0 Java 2013 0 0 dotNET 2012 0 0 @@ -296,12 +296,12 @@ NULL 2013 1 0 NULL NULL 1 1 --- !query 22 +-- !query SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) ORDER BY GROUPING(course), GROUPING(year), course, year --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output Java 2012 0 Java 2013 0 dotNET 2012 0 @@ -313,29 +313,29 @@ NULL 2013 2 NULL NULL 3 
--- !query 23 +-- !query SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING(course) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 24 +-- !query SELECT course, year FROM courseSales GROUP BY course, year ORDER BY GROUPING_ID(course) --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 25 +-- !query SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, year --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output Java 2012 Java 2013 dotNET 2012 @@ -347,11 +347,11 @@ NULL 2013 NULL NULL --- !query 26 +-- !query SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -367,11 +367,11 @@ NULL 2 0 NULL NULL 3 --- !query 27 +-- !query SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -385,10 +385,10 @@ struct NULL NULL 3 --- !query 28 +-- !query SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k) --- !query 28 schema +-- !query schema struct<(a + b):int,k:int,sum((a - b)):bigint> --- !query 28 output +-- !query output NULL 1 3 NULL 2 0 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out new file mode 100644 index 0000000000000..a4c7c2cf90cd7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/group-by-filter.sql.out @@ -0,0 +1,464 @@ +-- Automatically generated by 
SQLQueryTestSuite +-- Number of queries: 37 + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES +(1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) +AS testData(a, b) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW EMP AS SELECT * FROM VALUES + (100, "emp 1", date "2005-01-01", 100.00D, 10), + (100, "emp 1", date "2005-01-01", 100.00D, 10), + (200, "emp 2", date "2003-01-01", 200.00D, 10), + (300, "emp 3", date "2002-01-01", 300.00D, 20), + (400, "emp 4", date "2005-01-01", 400.00D, 30), + (500, "emp 5", date "2001-01-01", 400.00D, NULL), + (600, "emp 6 - no dept", date "2001-01-01", 400.00D, 100), + (700, "emp 7", date "2010-01-01", 400.00D, 100), + (800, "emp 8", date "2016-01-01", 150.00D, 70) +AS EMP(id, emp_name, hiredate, salary, dept_id) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES + (10, "dept 1", "CA"), + (20, "dept 2", "NY"), + (30, "dept 3", "TX"), + (40, "dept 4 - unassigned", "OR"), + (50, "dept 5 - unassigned", "NJ"), + (70, "dept 7", "FL") +AS DEPT(dept_id, dept_name, state) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. 
Wrap '(count(testdata.`b`) FILTER (WHERE (testdata.`a` >= 2)) AS `count(b) FILTER (WHERE (a >= 2))`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.; + + +-- !query +SELECT COUNT(a) FILTER (WHERE a = 1), COUNT(b) FILTER (WHERE a > 1) FROM testData +-- !query schema +struct 1)):bigint> +-- !query output +2 4 + + +-- !query +SELECT COUNT(id) FILTER (WHERE hiredate = date "2001-01-01") FROM emp +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT COUNT(id) FILTER (WHERE hiredate = to_date('2001-01-01 00:00:00')) FROM emp +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT COUNT(id) FILTER (WHERE hiredate = to_timestamp("2001-01-01 00:00:00")) FROM emp +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT COUNT(id) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") = "2001-01-01") FROM emp +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT a, COUNT(b) FILTER (WHERE a >= 2) FROM testData GROUP BY a +-- !query schema +struct= 2)):bigint> +-- !query output +1 0 +2 2 +3 2 +NULL 0 + + +-- !query +SELECT a, COUNT(b) FILTER (WHERE a != 2) FROM testData GROUP BY b +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. 
Add to group by or wrap in first() (or first_value) if you don't care which value you get.; + + +-- !query +SELECT COUNT(a) FILTER (WHERE a >= 0), COUNT(b) FILTER (WHERE a >= 3) FROM testData GROUP BY a +-- !query schema +struct= 0)):bigint,count(b) FILTER (WHERE (a >= 3)):bigint> +-- !query output +0 0 +2 0 +2 0 +3 2 + + +-- !query +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > date "2003-01-01") FROM emp GROUP BY dept_id +-- !query schema +struct DATE '2003-01-01')):double> +-- !query output +10 200.0 +100 400.0 +20 NULL +30 400.0 +70 150.0 +NULL NULL + + +-- !query +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_date("2003-01-01")) FROM emp GROUP BY dept_id +-- !query schema +struct to_date('2003-01-01'))):double> +-- !query output +10 200.0 +100 400.0 +20 NULL +30 400.0 +70 150.0 +NULL NULL + + +-- !query +SELECT dept_id, SUM(salary) FILTER (WHERE hiredate > to_timestamp("2003-01-01 00:00:00")) FROM emp GROUP BY dept_id +-- !query schema +struct to_timestamp('2003-01-01 00:00:00'))):double> +-- !query output +10 200.0 +100 400.0 +20 NULL +30 400.0 +70 150.0 +NULL NULL + + +-- !query +SELECT dept_id, SUM(salary) FILTER (WHERE date_format(hiredate, "yyyy-MM-dd") > "2003-01-01") FROM emp GROUP BY dept_id +-- !query schema +struct 2003-01-01)):double> +-- !query output +10 200.0 +100 400.0 +20 NULL +30 400.0 +70 150.0 +NULL NULL + + +-- !query +SELECT 'foo', COUNT(a) FILTER (WHERE b <= 2) FROM testData GROUP BY 1 +-- !query schema +struct +-- !query output +foo 6 + + +-- !query +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= date "2003-01-01") FROM emp GROUP BY 1 +-- !query schema +struct= DATE '2003-01-01')):double> +-- !query output +foo 1350.0 + + +-- !query +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_date("2003-01-01")) FROM emp GROUP BY 1 +-- !query schema +struct= to_date('2003-01-01'))):double> +-- !query output +foo 1350.0 + + +-- !query +SELECT 'foo', SUM(salary) FILTER (WHERE hiredate >= to_timestamp("2003-01-01")) FROM 
emp GROUP BY 1 +-- !query schema +struct= to_timestamp('2003-01-01'))):double> +-- !query output +foo 1350.0 + + +-- !query +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id > 200) from emp group by dept_id +-- !query schema +struct 200)):double> +-- !query output +10 2 2 400.0 NULL +100 2 2 800.0 800.0 +20 1 1 300.0 300.0 +30 1 1 400.0 400.0 +70 1 1 150.0 150.0 +NULL 1 1 400.0 400.0 + + +-- !query +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id +-- !query schema +struct 500)):double> +-- !query output +10 2 2 400.0 NULL +100 2 2 800.0 800.0 +20 1 1 300.0 NULL +30 1 1 400.0 NULL +70 1 1 150.0 150.0 +NULL 1 1 400.0 NULL + + +-- !query +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id > 200) from emp group by dept_id +-- !query schema +struct 200)):double> +-- !query output +10 2 2 400.0 NULL +100 2 2 NULL 800.0 +20 1 1 300.0 300.0 +30 1 1 NULL 400.0 +70 1 1 150.0 150.0 +NULL 1 1 NULL 400.0 + + +-- !query +select dept_id, count(distinct emp_name), count(distinct hiredate), sum(salary) filter (where salary < 400.00D), sum(salary) filter (where id + dept_id > 500) from emp group by dept_id +-- !query schema +struct 500)):double> +-- !query output +10 2 2 400.0 NULL +100 2 2 NULL 800.0 +20 1 1 300.0 NULL +30 1 1 NULL NULL +70 1 1 150.0 150.0 +NULL 1 1 NULL NULL + + +-- !query +SELECT 'foo', APPROX_COUNT_DISTINCT(a) FILTER (WHERE b >= 0) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct= 0)):bigint> +-- !query output + + + +-- !query +SELECT 'foo', MAX(STRUCT(a)) FILTER (WHERE b >= 1) FROM testData WHERE a = 0 GROUP BY 1 +-- !query schema +struct= 1)):struct> +-- !query output + + + +-- !query +SELECT a + b, COUNT(b) FILTER (WHERE b >= 2) FROM testData GROUP BY a + b +-- !query schema +struct<(a + 
b):int,count(b) FILTER (WHERE (b >= 2)):bigint> +-- !query output +2 0 +3 1 +4 1 +5 1 +NULL 0 + + +-- !query +SELECT a + 2, COUNT(b) FILTER (WHERE b IN (1, 2)) FROM testData GROUP BY a + 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; + + +-- !query +SELECT a + 1 + 1, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY a + 1 +-- !query schema +struct<((a + 1) + 1):int,count(b) FILTER (WHERE (b > 0)):bigint> +-- !query output +3 2 +4 2 +5 2 +NULL 1 + + +-- !query +SELECT a AS k, COUNT(b) FILTER (WHERE b > 0) FROM testData GROUP BY k +-- !query schema +struct 0)):bigint> +-- !query output +1 2 +2 2 +3 2 +NULL 1 + + +-- !query +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE id > (SELECT 200)) +FROM emp +GROUP BY dept_id +-- !query schema +struct scalarsubquery())):double> +-- !query output +10 133.33333333333334 NULL +100 400.0 400.0 +20 300.0 300.0 +30 400.0 400.0 +70 150.0 150.0 +NULL 400.0 400.0 + + +-- !query +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE emp.dept_id = (SELECT dept_id FROM dept LIMIT 1)) +FROM emp +GROUP BY dept_id +-- !query schema +struct +-- !query output +10 133.33333333333334 133.33333333333334 +100 400.0 NULL +20 300.0 NULL +30 400.0 NULL +70 150.0 NULL +NULL 400.0 NULL + + +-- !query +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE EXISTS (SELECT state + FROM dept + WHERE dept.dept_id = emp.dept_id)) +FROM emp +GROUP BY dept_id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, avg(salary#x) FILTER (WHERE exists#x [dept_id#x]) AS avg(salary) FILTER (WHERE exists(dept_id))#x] +: +- 
Project [state#x] +: +- Filter (dept_id#x = outer(dept_id#x)) +: +- SubqueryAlias dept +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] ++- SubqueryAlias emp + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +; + + +-- !query +SELECT emp.dept_id, + Sum(salary), + Sum(salary) FILTER (WHERE NOT EXISTS (SELECT state + FROM dept + WHERE dept.dept_id = emp.dept_id)) +FROM emp +GROUP BY dept_id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, sum(salary#x) AS sum(salary)#x, sum(salary#x) FILTER (WHERE NOT exists#x [dept_id#x]) AS sum(salary) FILTER (WHERE (NOT exists(dept_id)))#x] +: +- Project [state#x] +: +- Filter (dept_id#x = outer(dept_id#x)) +: +- SubqueryAlias dept +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] ++- SubqueryAlias emp + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +; + + +-- !query +SELECT emp.dept_id, + avg(salary), + avg(salary) FILTER (WHERE emp.dept_id IN (SELECT DISTINCT dept_id + FROM dept)) +FROM emp +GROUP BY dept_id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, avg(salary#x) AS avg(salary)#x, avg(salary#x) FILTER (WHERE dept_id#x IN (list#x [])) AS avg(salary) FILTER (WHERE (dept_id IN (listquery())))#x] +: +- Distinct +: +- Project [dept_id#x] +: +- SubqueryAlias dept +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- 
LocalRelation [dept_id#x, dept_name#x, state#x] ++- SubqueryAlias emp + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +; + + +-- !query +SELECT emp.dept_id, + Sum(salary), + Sum(salary) FILTER (WHERE emp.dept_id NOT IN (SELECT DISTINCT dept_id + FROM dept)) +FROM emp +GROUP BY dept_id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +IN/EXISTS predicate sub-queries can only be used in Filter/Join and a few commands: Aggregate [dept_id#x], [dept_id#x, sum(salary#x) AS sum(salary)#x, sum(salary#x) FILTER (WHERE NOT dept_id#x IN (list#x [])) AS sum(salary) FILTER (WHERE (NOT (dept_id IN (listquery()))))#x] +: +- Distinct +: +- Project [dept_id#x] +: +- SubqueryAlias dept +: +- Project [dept_id#x, dept_name#x, state#x] +: +- SubqueryAlias DEPT +: +- LocalRelation [dept_id#x, dept_name#x, state#x] ++- SubqueryAlias emp + +- Project [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] + +- SubqueryAlias EMP + +- LocalRelation [id#x, emp_name#x, hiredate#x, salary#x, dept_id#x] +; + + +-- !query +SELECT t1.b FROM (SELECT COUNT(b) FILTER (WHERE a >= 2) AS b FROM testData) t1 +-- !query schema +struct +-- !query output +4 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out index 09e2c632f6386..bf9f606a2224e 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 20 --- !query 0 +-- !query create temporary view data as select * from values (1, 1), (1, 2), @@ -11,55 +11,55 @@ create temporary view data as select * from values (3, 1), (3, 2) as data(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select a, sum(b) from data 
group by 1 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 3 2 3 3 3 --- !query 2 +-- !query select 1, 2, sum(b) from data group by 1, 2 --- !query 2 schema +-- !query schema struct<1:int,2:int,sum(b):bigint> --- !query 2 output +-- !query output 1 2 9 --- !query 3 +-- !query select a, 1, sum(b) from data group by a, 1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 1 3 2 1 3 3 1 3 --- !query 4 +-- !query select a, 1, sum(b) from data group by 1, 2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 1 3 2 1 3 3 1 3 --- !query 5 +-- !query select a, b + 2, count(2) from data group by a, 2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 3 1 1 4 1 2 3 1 @@ -68,11 +68,11 @@ struct 3 4 1 --- !query 6 +-- !query select a as aa, b + 2 as bb, count(2) from data group by 1, 2 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 3 1 1 4 1 2 3 1 @@ -81,66 +81,66 @@ struct 3 4 1 --- !query 7 +-- !query select sum(b) from data group by 1 + 0 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 9 --- !query 8 +-- !query select a, b from data group by -1 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException GROUP BY position -1 is not in select list (valid range is [1, 2]); line 1 pos 31 --- !query 9 +-- !query select a, b from data group by 0 --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException GROUP BY position 0 is not in select list (valid range is [1, 2]); line 1 pos 31 --- !query 10 +-- !query select a, b from data group by 3 --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 31 --- 
!query 11 +-- !query select a, b, sum(b) from data group by 3 --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException aggregate functions are not allowed in GROUP BY, but found sum(CAST(data.`b` AS BIGINT)); --- !query 12 +-- !query select a, b, sum(b) + 2 from data group by 3 --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException aggregate functions are not allowed in GROUP BY, but found (sum(CAST(data.`b` AS BIGINT)) + CAST(2 AS BIGINT)); --- !query 13 +-- !query select a, rand(0), sum(b) from (select /*+ REPARTITION(1) */ a, b from data) group by a, 2 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 0.5234194256885571 2 1 0.7604953758285915 1 2 0.0953472826424725 1 @@ -149,52 +149,52 @@ struct 3 0.7141011170991605 1 --- !query 14 +-- !query select * from data group by a, b, 1 --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Star (*) is not allowed in select list when GROUP BY ordinal position is used; --- !query 15 +-- !query select a, count(a) from (select 1 as a) tmp group by 1 order by 1 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 1 --- !query 16 +-- !query select count(a), a from (select 1 as a) tmp group by 2 having a > 0 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1 1 --- !query 17 +-- !query select a, a AS k, count(b) from data group by k, 1 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 1 2 2 2 2 3 3 2 --- !query 18 +-- !query set spark.sql.groupByOrdinal=false --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output spark.sql.groupByOrdinal false --- !query 19 +-- !query select sum(b) from data group by -1 --- !query 19 schema +-- !query schema struct 
--- !query 19 output +-- !query output 9 diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 3a5df254f2cd9..7bfdd0ad53a95 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,102 +1,102 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 52 +-- Number of queries: 56 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT a, COUNT(b) FROM testData --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. Wrap '(count(testdata.`b`) AS `count(b)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.; --- !query 2 +-- !query SELECT COUNT(a), COUNT(b) FROM testData --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 7 7 --- !query 3 +-- !query SELECT a, COUNT(b) FROM testData GROUP BY a --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 2 2 3 2 NULL 1 --- !query 4 +-- !query SELECT a, COUNT(b) FROM testData GROUP BY b --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. 
Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 5 +-- !query SELECT COUNT(a), COUNT(b) FROM testData GROUP BY a --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0 1 2 2 2 2 3 2 --- !query 6 +-- !query SELECT 'foo', COUNT(a) FROM testData GROUP BY 1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output foo 7 --- !query 7 +-- !query SELECT 'foo' FROM testData WHERE a = 0 GROUP BY 1 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT 'foo', APPROX_COUNT_DISTINCT(a) FROM testData WHERE a = 0 GROUP BY 1 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output --- !query 9 +-- !query SELECT 'foo', MAX(STRUCT(a)) FROM testData WHERE a = 0 GROUP BY 1 --- !query 9 schema +-- !query schema struct> --- !query 9 output +-- !query output --- !query 10 +-- !query SELECT a + b, COUNT(b) FROM testData GROUP BY a + b --- !query 10 schema +-- !query schema struct<(a + b):int,count(b):bigint> --- !query 10 output +-- !query output 2 1 3 2 4 2 @@ -104,132 +104,132 @@ struct<(a + b):int,count(b):bigint> NULL 1 --- !query 11 +-- !query SELECT a + 2, COUNT(b) FROM testData GROUP BY a + 1 --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. 
Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 12 +-- !query SELECT a + 1 + 1, COUNT(b) FROM testData GROUP BY a + 1 --- !query 12 schema +-- !query schema struct<((a + 1) + 1):int,count(b):bigint> --- !query 12 output +-- !query output 3 2 4 2 5 2 NULL 1 --- !query 13 +-- !query SELECT SKEWNESS(a), KURTOSIS(a), MIN(a), MAX(a), AVG(a), VARIANCE(a), STDDEV(a), SUM(a), COUNT(a) FROM testData --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output -0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 --- !query 14 +-- !query SELECT COUNT(DISTINCT b), COUNT(DISTINCT b, c) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY a --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 1 --- !query 15 +-- !query SELECT a AS k, COUNT(b) FROM testData GROUP BY k --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 2 2 2 3 2 NULL 1 --- !query 16 +-- !query SELECT a AS k, COUNT(b) FROM testData GROUP BY k HAVING k > 1 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2 2 3 2 --- !query 17 +-- !query SELECT COUNT(b) AS k FROM testData GROUP BY k --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException aggregate functions are not allowed in GROUP BY, but found count(testdata.`b`); --- !query 18 +-- !query CREATE OR REPLACE TEMPORARY VIEW testDataHasSameNameWithAlias AS SELECT * FROM VALUES (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT k AS a, COUNT(v) FROM testDataHasSameNameWithAlias GROUP BY a --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException expression 
'testdatahassamenamewithalias.`k`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 20 +-- !query set spark.sql.groupByAliases=false --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output spark.sql.groupByAliases false --- !query 21 +-- !query SELECT a AS k, COUNT(b) FROM testData GROUP BY k --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 47 --- !query 22 +-- !query SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output --- !query 23 +-- !query SELECT COUNT(1) FROM testData WHERE false --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 0 --- !query 24 +-- !query SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t --- !query 24 schema +-- !query schema struct<1:int> --- !query 24 output +-- !query output 1 --- !query 25 +-- !query SELECT 1 from ( SELECT 1 AS z, MIN(a.x) @@ -237,114 +237,114 @@ SELECT 1 from ( WHERE false ) b where b.z != b.z --- !query 25 schema +-- !query schema struct<1:int> --- !query 25 output +-- !query output --- !query 26 +-- !query SELECT corr(DISTINCT x, y), corr(DISTINCT y, x), count(*) FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, y) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1.0 1.0 3 --- !query 27 +-- !query SELECT 1 FROM range(10) HAVING true --- !query 27 schema +-- !query schema struct<1:int> --- !query 27 output +-- !query output 1 --- !query 28 +-- !query SELECT 1 FROM range(10) HAVING MAX(id) > 0 --- !query 28 schema +-- !query schema struct<1:int> --- !query 28 output +-- !query output 1 --- !query 29 +-- !query SELECT id FROM range(10) HAVING id > 0 --- !query 
29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.; --- !query 30 +-- !query CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (1, true), (1, false), (2, true), (3, false), (3, null), (4, null), (4, null), (5, null), (5, true), (5, false) AS test_agg(k, v) --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output --- !query 31 -SELECT every(v), some(v), any(v) FROM test_agg WHERE 1 = 0 --- !query 31 schema -struct --- !query 31 output -NULL NULL NULL +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL --- !query 32 -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 4 --- !query 32 schema -struct --- !query 32 output -NULL NULL NULL +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 4 +-- !query schema +struct +-- !query output +NULL NULL NULL NULL NULL --- !query 33 -SELECT every(v), some(v), any(v) FROM test_agg WHERE k = 5 --- !query 33 schema -struct --- !query 33 output -false true true +-- !query +SELECT every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg WHERE k = 5 +-- !query schema +struct +-- !query output +false true true false true --- !query 34 -SELECT k, every(v), some(v), any(v) FROM test_agg GROUP BY k --- !query 34 schema -struct --- !query 34 output -1 false true true -2 true true true -3 false false false -4 NULL NULL NULL -5 false true true +-- !query +SELECT k, every(v), some(v), any(v), bool_and(v), bool_or(v) FROM test_agg GROUP BY k +-- !query schema +struct +-- !query output +1 false true true false true +2 true true true true true +3 false false false false false +4 
NULL NULL NULL NULL NULL +5 false true true false true --- !query 35 +-- !query SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) = false --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 1 false 3 false 5 false --- !query 36 +-- !query SELECT k, every(v) FROM test_agg GROUP BY k HAVING every(v) IS NULL --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 4 NULL --- !query 37 +-- !query SELECT k, Every(v) AS every FROM test_agg @@ -353,13 +353,13 @@ WHERE k = 2 FROM test_agg WHERE k = 1) GROUP BY k --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 2 true --- !query 38 +-- !query SELECT k, Every(v) AS every FROM test_agg @@ -368,53 +368,71 @@ WHERE k = 2 FROM test_agg WHERE k = 1) GROUP BY k --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output --- !query 39 +-- !query SELECT every(1) --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'every(1)' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 --- !query 40 +-- !query SELECT some(1S) --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'some(1S)' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 --- !query 41 +-- !query SELECT any(1L) --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'any(1L)' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 --- !query 42 +-- !query SELECT every("true") --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'every('true')' due to data type mismatch: Input to function 'every' should have been boolean, but it's [string].; line 1 pos 7 --- !query 43 +-- !query +SELECT bool_and(1.0) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_and(1.0BD)' due to data type mismatch: Input to function 'bool_and' should have been boolean, but it's [decimal(2,1)].; line 1 pos 7 + + +-- !query +SELECT bool_or(1.0D) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'bool_or(1.0D)' due to data type mismatch: Input to function 'bool_or' should have been boolean, but it's [double].; line 1 pos 7 + + +-- !query SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 1 false false 1 true false 2 true true @@ -427,11 +445,11 @@ struct --- !query 44 output +-- !query output 1 false false 1 true true 2 true true @@ -444,11 +462,45 @@ struct --- !query 45 output +-- !query output +1 false false +1 true true +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true true + + +-- !query +SELECT k, v, bool_and(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output +1 false false +1 true false +2 true true +3 NULL NULL +3 false false +4 NULL NULL +4 NULL NULL +5 NULL NULL +5 false false +5 true false + + +-- !query +SELECT k, v, bool_or(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg +-- !query schema +struct +-- !query output 1 false false 1 true true 2 true true @@ -461,37 +513,37 @@ struct 1L --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output 10 --- !query 47 +-- !query SELECT k, max(v) FROM test_agg GROUP BY k HAVING max(v) = true --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 1 true 2 true 5 true --- !query 48 +-- !query SELECT * FROM 
(SELECT COUNT(*) AS cnt FROM test_agg) WHERE cnt > 1L --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output 10 --- !query 49 +-- !query SELECT count(*) FROM test_agg WHERE count(*) > 1L --- !query 49 schema +-- !query schema struct<> --- !query 49 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -499,11 +551,11 @@ Expression in where clause: [(count(1) > 1L)] Invalid expressions: [count(1)]; --- !query 50 +-- !query SELECT count(*) FROM test_agg WHERE count(*) + 1L > 1L --- !query 50 schema +-- !query schema struct<> --- !query 50 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -511,11 +563,11 @@ Expression in where clause: [((count(1) + 1L) > 1L)] Invalid expressions: [count(1)]; --- !query 51 +-- !query SELECT count(*) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 --- !query 51 schema +-- !query schema struct<> --- !query 51 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. 
diff --git a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out index 34ab09c5e3bba..8eeabb34b4fab 100644 --- a/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/grouping_set.sql.out @@ -1,139 +1,141 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 15 +-- Number of queries: 18 --- !query 0 +-- !query CREATE TEMPORARY VIEW grouping AS SELECT * FROM VALUES ("1", "2", "3", 1), ("4", "5", "6", 1), ("7", "8", "9", 1) as grouping(a, b, c, d) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS (()) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL NULL NULL 3 --- !query 2 +-- !query SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((a)) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 NULL NULL 1 4 NULL NULL 1 7 NULL NULL 1 --- !query 3 +-- !query SELECT a, b, c, count(d) FROM grouping GROUP BY a, b, c GROUPING SETS ((c)) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output NULL NULL 3 1 NULL NULL 6 1 NULL NULL 9 1 --- !query 4 +-- !query SELECT c1, sum(c2) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output x 10 y 20 --- !query 5 +-- !query SELECT c1, sum(c2), grouping(c1) FROM (VALUES ('x', 10, 0), ('y', 20, 0)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output x 10 0 y 20 0 --- !query 6 +-- !query SELECT c1, c2, Sum(c3), grouping__id FROM (VALUES ('x', 'a', 10), ('y', 'b', 20) ) AS t (c1, c2, c3) GROUP BY GROUPING SETS ( ( c1 ), ( c2 ) ) HAVING 
GROUPING__ID > 1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL a 10 2 NULL b 20 2 --- !query 7 +-- !query SELECT grouping(c1) FROM (VALUES ('x', 'a', 10), ('y', 'b', 20)) AS t (c1, c2, c3) GROUP BY GROUPING SETS (c1,c2) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 0 0 1 1 --- !query 8 +-- !query SELECT -c1 AS c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS ((c1), (c1, c2)) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output -1 -1 -3 -3 --- !query 9 +-- !query SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b)) --- !query 9 schema +-- !query schema struct<(a + b):int,b:int,sum(c):bigint> --- !query 9 output +-- !query output 2 NULL 1 4 NULL 2 NULL 1 1 NULL 2 2 --- !query 10 +-- !query SELECT a + b, b, sum(c) FROM (VALUES (1,1,1),(2,2,2)) AS t(a,b,c) GROUP BY GROUPING SETS ( (a + b), (b + a), (b)) --- !query 10 schema +-- !query schema struct<(a + b):int,b:int,sum(c):bigint> --- !query 10 output -2 NULL 2 -4 NULL 4 +-- !query output +2 NULL 1 +2 NULL 1 +4 NULL 2 +4 NULL 2 NULL 1 1 NULL 2 2 --- !query 11 +-- !query SELECT c1 AS col1, c2 AS col2 FROM (VALUES (1, 2), (3, 2)) t(c1, c2) GROUP BY GROUPING SETS ( ( c1 ), ( c1, c2 ) ) HAVING col2 IS NOT NULL ORDER BY -col1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 3 2 1 2 --- !query 12 +-- !query SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException extraneous input 'ROLLUP' expecting (line 1, pos 53) @@ -143,11 +145,11 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY WITH ROLLUP -----------------------------------------------------^^^ --- !query 13 +-- !query SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE --- !query 13 schema +-- !query schema 
struct<> --- !query 13 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException extraneous input 'CUBE' expecting (line 1, pos 53) @@ -157,10 +159,49 @@ SELECT a, b, c, count(d) FROM grouping GROUP BY WITH CUBE -----------------------------------------------------^^^ --- !query 14 +-- !query SELECT c1 FROM (values (1,2), (3,2)) t(c1, c2) GROUP BY GROUPING SETS (()) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException expression '`c1`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; + + +-- !query +SELECT k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)) +-- !query schema +struct +-- !query output +1 1 1.0 +1 1 1.0 +1 NULL 1.0 +2 2 2.0 +2 2 2.0 +2 NULL 2.0 + + +-- !query +SELECT grouping__id, k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)) +-- !query schema +struct +-- !query output +0 1 1 1.0 +0 1 1 1.0 +0 2 2 2.0 +0 2 2 2.0 +1 1 NULL 1.0 +1 2 NULL 2.0 + + +-- !query +SELECT grouping(k1), k1, k2, avg(v) FROM (VALUES (1,1,1),(2,2,2)) AS t(k1,k2,v) GROUP BY GROUPING SETS ((k1),(k1,k2),(k2,k1)) +-- !query schema +struct +-- !query output +0 1 1 1.0 +0 1 1 1.0 +0 1 NULL 1.0 +0 2 2 2.0 +0 2 2 2.0 +0 2 NULL 2.0 diff --git a/sql/core/src/test/resources/sql-tests/results/having.sql.out b/sql/core/src/test/resources/sql-tests/results/having.sql.out index d87ee5221647f..5bd185d7b815d 100644 --- a/sql/core/src/test/resources/sql-tests/results/having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/having.sql.out @@ -2,48 +2,48 @@ -- Number of queries: 5 --- !query 0 +-- !query create temporary view hav as select * from values ("one", 1), ("two", 2), ("three", 3), ("one", 5) as hav(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output 
+-- !query output --- !query 1 +-- !query SELECT k, sum(v) FROM hav GROUP BY k HAVING sum(v) > 2 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output one 6 three 3 --- !query 2 +-- !query SELECT count(k) FROM hav GROUP BY v + 1 HAVING v + 1 = 2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 --- !query 3 +-- !query SELECT MIN(t.v) FROM (SELECT * FROM hav WHERE v > 0) t HAVING(COUNT(1) > 0) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT a + b FROM VALUES (1L, 2), (3L, 4) AS T(a, b) GROUP BY a + b HAVING a + b > 1 --- !query 4 schema +-- !query schema struct<(a + CAST(b AS BIGINT)):bigint> --- !query 4 output +-- !query output 3 7 diff --git a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out index 1b7c6f4f76250..d35d0d5d944bb 100644 --- a/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/higher-order-functions.sql.out @@ -1,256 +1,272 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 27 +-- Number of queries: 29 --- !query 0 +-- !query create or replace temporary view nested as values (1, array(32, 97), array(array(12, 99), array(123, 42), array(1))), (2, array(77, -76), array(array(6, 96, 65), array(-1, -2))), (3, array(12), array(array(17))) as t(x, ys, zs) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select upper(x -> x) as v --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output org.apache.spark.sql.AnalysisException A lambda function should only be used in a higher order function. 
However, its class is org.apache.spark.sql.catalyst.expressions.Upper, which is not a higher order function.; line 1 pos 7 --- !query 2 +-- !query select transform(zs, z -> z) as v from nested --- !query 2 schema +-- !query schema struct>> --- !query 2 output +-- !query output [[12,99],[123,42],[1]] [[17]] [[6,96,65],[-1,-2]] --- !query 3 +-- !query select transform(ys, y -> y * y) as v from nested --- !query 3 schema +-- !query schema struct> --- !query 3 output +-- !query output [1024,9409] [144] [5929,5776] --- !query 4 +-- !query select transform(ys, (y, i) -> y + i) as v from nested --- !query 4 schema +-- !query schema struct> --- !query 4 output +-- !query output [12] [32,98] [77,-75] --- !query 5 +-- !query select transform(zs, z -> concat(ys, z)) as v from nested --- !query 5 schema +-- !query schema struct>> --- !query 5 output +-- !query output [[12,17]] [[32,97,12,99],[32,97,123,42],[32,97,1]] [[77,-76,6,96,65],[77,-76,-1,-2]] --- !query 6 +-- !query select transform(ys, 0) as v from nested --- !query 6 schema +-- !query schema struct> --- !query 6 output +-- !query output [0,0] [0,0] [0] --- !query 7 +-- !query select transform(cast(null as array), x -> x + 1) as v --- !query 7 schema +-- !query schema struct> --- !query 7 output +-- !query output NULL --- !query 8 +-- !query select filter(ys, y -> y > 30) as v from nested --- !query 8 schema +-- !query schema struct> --- !query 8 output +-- !query output [32,97] [77] [] --- !query 9 +-- !query select filter(cast(null as array), y -> true) as v --- !query 9 schema +-- !query schema struct> --- !query 9 output +-- !query output NULL --- !query 10 +-- !query select transform(zs, z -> filter(z, zz -> zz > 50)) as v from nested --- !query 10 schema +-- !query schema struct>> --- !query 10 output +-- !query output [[96,65],[]] [[99],[123],[]] [[]] --- !query 11 +-- !query select aggregate(ys, 0, (y, a) -> y + a + x) as v from nested --- !query 11 schema +-- !query schema struct --- !query 11 output +-- 
!query output 131 15 5 --- !query 12 +-- !query select aggregate(ys, (0 as sum, 0 as n), (acc, x) -> (acc.sum + x, acc.n + 1), acc -> acc.sum / acc.n) as v from nested --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 0.5 12.0 64.5 --- !query 13 +-- !query select transform(zs, z -> aggregate(z, 1, (acc, val) -> acc * val * size(z))) as v from nested --- !query 13 schema +-- !query schema struct> --- !query 13 output +-- !query output [1010880,8] [17] [4752,20664,1] --- !query 14 +-- !query select aggregate(cast(null as array), 0, (a, y) -> a + y + 1, a -> a + 2) as v --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output NULL --- !query 15 +-- !query select exists(ys, y -> y > 30) as v from nested --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output false true true --- !query 16 +-- !query select exists(cast(null as array), y -> y > 30) as v --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output NULL --- !query 17 +-- !query select zip_with(ys, zs, (a, b) -> a + size(b)) as v from nested --- !query 17 schema +-- !query schema struct> --- !query 17 output +-- !query output [13] [34,99,null] [80,-74] --- !query 18 +-- !query select zip_with(array('a', 'b', 'c'), array('d', 'e', 'f'), (x, y) -> concat(x, y)) as v --- !query 18 schema +-- !query schema struct> --- !query 18 output +-- !query output ["ad","be","cf"] --- !query 19 +-- !query select zip_with(array('a'), array('d', null, 'f'), (x, y) -> coalesce(x, y)) as v --- !query 19 schema +-- !query schema struct> --- !query 19 output +-- !query output ["a",null,"f"] --- !query 20 +-- !query create or replace temporary view nested as values (1, map(1, 1, 2, 2, 3, 3)), (2, map(4, 4, 5, 5, 6, 6)) as t(x, ys) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query select transform_keys(ys, (k, v) -> k) as v from nested --- !query 21 schema 
+-- !query schema struct> --- !query 21 output +-- !query output {1:1,2:2,3:3} {4:4,5:5,6:6} --- !query 22 +-- !query select transform_keys(ys, (k, v) -> k + 1) as v from nested --- !query 22 schema +-- !query schema struct> --- !query 22 output +-- !query output {2:1,3:2,4:3} {5:4,6:5,7:6} --- !query 23 +-- !query select transform_keys(ys, (k, v) -> k + v) as v from nested --- !query 23 schema +-- !query schema struct> --- !query 23 output +-- !query output {10:5,12:6,8:4} {2:1,4:2,6:3} --- !query 24 +-- !query select transform_values(ys, (k, v) -> v) as v from nested --- !query 24 schema +-- !query schema struct> --- !query 24 output +-- !query output {1:1,2:2,3:3} {4:4,5:5,6:6} --- !query 25 +-- !query select transform_values(ys, (k, v) -> v + 1) as v from nested --- !query 25 schema +-- !query schema struct> --- !query 25 output +-- !query output {1:2,2:3,3:4} {4:5,5:6,6:7} --- !query 26 +-- !query select transform_values(ys, (k, v) -> k + v) as v from nested --- !query 26 schema +-- !query schema struct> --- !query 26 output +-- !query output {1:2,2:4,3:6} {4:8,5:10,6:12} + + +-- !query +select transform(ys, all -> all * all) as v from values (array(32, 97)) as t(ys) +-- !query schema +struct> +-- !query output +[1024,9409] + + +-- !query +select transform(ys, (all, i) -> all + i) as v from values (array(32, 97)) as t(ys) +-- !query schema +struct> +-- !query output +[32,98] diff --git a/sql/core/src/test/resources/sql-tests/results/inline-table.sql.out b/sql/core/src/test/resources/sql-tests/results/inline-table.sql.out index 4e80f0bda5513..9943b93c431df 100644 --- a/sql/core/src/test/resources/sql-tests/results/inline-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/inline-table.sql.out @@ -2,152 +2,152 @@ -- Number of queries: 17 --- !query 0 +-- !query select * from values ("one", 1) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output one 1 --- !query 1 +-- !query select * from values ("one", 1) as data --- 
!query 1 schema +-- !query schema struct --- !query 1 output +-- !query output one 1 --- !query 2 +-- !query select * from values ("one", 1) as data(a, b) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 --- !query 3 +-- !query select * from values 1, 2, 3 as data(a) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 3 --- !query 4 +-- !query select * from values ("one", 1), ("two", 2), ("three", null) as data(a, b) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 three NULL two 2 --- !query 5 +-- !query select * from values ("one", null), ("two", null) as data(a, b) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one NULL two NULL --- !query 6 +-- !query select * from values ("one", 1), ("two", 2L) as data(a, b) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output one 1 two 2 --- !query 7 +-- !query select * from values ("one", 1 + 0), ("two", 1 + 3L) as data(a, b) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output one 1 two 4 --- !query 8 +-- !query select * from values ("one", array(0, 1)), ("two", array(2, 3)) as data(a, b) --- !query 8 schema +-- !query schema struct> --- !query 8 output +-- !query output one [0,1] two [2,3] --- !query 9 +-- !query select * from values ("one", 2.0), ("two", 3.0D) as data(a, b) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output one 2.0 two 3.0 --- !query 10 +-- !query select * from values ("one", rand(5)), ("two", 3.0D) as data(a, b) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot evaluate expression rand(5) in inline table definition; line 1 pos 29 --- !query 11 +-- !query select * from values ("one", 2.0), ("two") as data(a, b) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- 
!query output org.apache.spark.sql.AnalysisException expected 2 columns but found 1 columns in row 1; line 1 pos 14 --- !query 12 +-- !query select * from values ("one", array(0, 1)), ("two", struct(1, 2)) as data(a, b) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException incompatible types found in column b for inline table; line 1 pos 14 --- !query 13 +-- !query select * from values ("one"), ("two") as data(a, b) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException expected 2 columns but found 1 columns in row 0; line 1 pos 14 --- !query 14 +-- !query select * from values ("one", random_not_exist_func(1)), ("two", 2) as data(a, b) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Undefined function: 'random_not_exist_func'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 29 --- !query 15 +-- !query select * from values ("one", count(1)), ("two", 2) as data(a, b) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot evaluate expression count(1) in inline table definition; line 1 pos 29 --- !query 16 +-- !query select * from values (timestamp('1991-12-06 00:00:00.0'), array(timestamp('1991-12-06 01:00:00.0'), timestamp('1991-12-06 12:00:00.0'))) as data(a, b) --- !query 16 schema +-- !query schema struct> --- !query 16 output -1991-12-06 00:00:00 [1991-12-06 01:00:00.0,1991-12-06 12:00:00.0] +-- !query output +1991-12-06 00:00:00 [1991-12-06 01:00:00,1991-12-06 12:00:00] diff --git a/sql/core/src/test/resources/sql-tests/results/inner-join.sql.out b/sql/core/src/test/resources/sql-tests/results/inner-join.sql.out index 8d56ebe9fd3b4..7c3774c8bd7fb 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/inner-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/inner-join.sql.out @@ -2,65 +2,65 @@ -- Number of queries: 7 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE TEMPORARY VIEW ta AS SELECT a, 'a' AS tag FROM t1 UNION ALL SELECT a, 'b' AS tag FROM t2 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TEMPORARY VIEW tb AS SELECT a, 'a' AS tag FROM t3 UNION ALL SELECT a, 'b' AS tag FROM t4 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT tb.* FROM ta INNER JOIN tb ON ta.a = tb.a AND ta.tag = tb.tag --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 a 1 a 1 b diff --git a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out index 63dd56ce468bc..4762082dc3be2 100644 --- a/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/intersect-all.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 22 --- !query 0 +-- !query CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -12,13 +12,13 @@ CREATE 
TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (null, null), (null, null) AS tab1(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -27,19 +27,19 @@ CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (null, null), (null, null) AS tab2(k, v) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 2 1 2 2 3 @@ -47,80 +47,80 @@ NULL NULL NULL NULL --- !query 3 +-- !query SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab1 WHERE k = 1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 1 2 1 3 1 3 --- !query 4 +-- !query SELECT * FROM tab1 WHERE k > 2 INTERSECT ALL SELECT * FROM tab2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 WHERE k > 3 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT * FROM tab1 INTERSECT ALL SELECT CAST(1 AS BIGINT), CAST(2 AS BIGINT) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 2 --- !query 7 +-- !query SELECT * FROM tab1 INTERSECT ALL SELECT array(1), 2 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException IntersectAll can only be performed on tables with the compatible column types. 
array <> int at the first column of the second table; --- !query 8 +-- !query SELECT k FROM tab1 INTERSECT ALL SELECT k, v FROM tab2 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException IntersectAll can only be performed on tables with the same number of columns, but the first table has 1 columns and the second table has 2 columns; --- !query 9 +-- !query SELECT * FROM tab2 INTERSECT ALL SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 2 1 2 2 3 @@ -128,7 +128,7 @@ NULL NULL NULL NULL --- !query 10 +-- !query SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 @@ -136,9 +136,9 @@ UNION ALL SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 2 1 2 1 3 @@ -147,7 +147,7 @@ NULL NULL NULL NULL --- !query 11 +-- !query SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 @@ -155,13 +155,13 @@ EXCEPT SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 3 --- !query 12 +-- !query ( ( ( @@ -175,13 +175,13 @@ struct INTERSECT ALL SELECT * FROM tab2 ) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output --- !query 13 +-- !query SELECT * FROM (SELECT tab1.k, tab2.v @@ -195,9 +195,9 @@ FROM (SELECT tab1.k, FROM tab1 JOIN tab2 ON tab1.k = tab2.k) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 2 1 2 1 2 @@ -209,7 +209,7 @@ struct 2 3 --- !query 14 +-- !query SELECT * FROM (SELECT tab1.k, tab2.v @@ -223,33 +223,33 @@ FROM (SELECT tab2.v AS k, FROM tab1 JOIN tab2 ON tab1.k = tab2.k) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT v FROM tab1 GROUP BY v INTERSECT ALL SELECT k FROM tab2 GROUP BY k --- !query 15 schema +-- !query 
schema struct --- !query 15 output +-- !query output 2 3 NULL --- !query 16 +-- !query SET spark.sql.legacy.setopsPrecedence.enabled= true --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output spark.sql.legacy.setopsPrecedence.enabled true --- !query 17 +-- !query SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 @@ -257,9 +257,9 @@ UNION ALL SELECT * FROM tab1 INTERSECT ALL SELECT * FROM tab2 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 2 1 2 2 3 @@ -267,7 +267,7 @@ NULL NULL NULL NULL --- !query 18 +-- !query SELECT * FROM tab1 EXCEPT SELECT * FROM tab2 @@ -275,33 +275,33 @@ UNION ALL SELECT * FROM tab1 INTERSECT SELECT * FROM tab2 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 2 2 3 NULL NULL --- !query 19 +-- !query SET spark.sql.legacy.setopsPrecedence.enabled = false --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output spark.sql.legacy.setopsPrecedence.enabled false --- !query 20 +-- !query DROP VIEW IF EXISTS tab1 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query DROP VIEW IF EXISTS tab2 --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/interval.sql.out new file mode 100644 index 0000000000000..3c4b4301d0025 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/interval.sql.out @@ -0,0 +1,992 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 99 + + +-- !query +select 3 * (timestamp'2019-10-15 10:11:12.001002' - date'2019-10-15') +-- !query schema +struct +-- !query output +30 hours 33 minutes 36.003006 seconds + + +-- !query +select interval 4 month 2 weeks 3 microseconds * 1.5 +-- !query schema +struct +-- !query output +6 months 21 days 0.000005 seconds + + +-- 
!query +select (timestamp'2019-10-15' - timestamp'2019-10-14') / 1.5 +-- !query schema +struct +-- !query output +16 hours + + +-- !query +select interval '2 seconds' / 0 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +divide by zero + + +-- !query +select interval '2 seconds' / null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select interval '2 seconds' * null +-- !query schema +struct +-- !query output +NULL + + +-- !query +select null * interval '2 seconds' +-- !query schema +struct +-- !query output +NULL + + +-- !query +select -interval '-1 month 1 day -1 second' +-- !query schema +struct<(- INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +1 months -1 days 1 seconds + + +-- !query +select -interval -1 month 1 day -1 second +-- !query schema +struct<(- INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +1 months -1 days 1 seconds + + +-- !query +select +interval '-1 month 1 day -1 second' +-- !query schema +struct<(+ INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +-1 months 1 days -1 seconds + + +-- !query +select +interval -1 month 1 day -1 second +-- !query schema +struct<(+ INTERVAL '-1 months 1 days -1 seconds'):interval> +-- !query output +-1 months 1 days -1 seconds + + +-- !query +select make_interval(1) +-- !query schema +struct +-- !query output +1 years + + +-- !query +select make_interval(1, 2) +-- !query schema +struct +-- !query output +1 years 2 months + + +-- !query +select make_interval(1, 2, 3) +-- !query schema +struct +-- !query output +1 years 2 months 21 days + + +-- !query +select make_interval(1, 2, 3, 4) +-- !query schema +struct +-- !query output +1 years 2 months 25 days + + +-- !query +select make_interval(1, 2, 3, 4, 5) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours + + +-- !query +select make_interval(1, 2, 3, 4, 5, 6) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 
hours 6 minutes + + +-- !query +select make_interval(1, 2, 3, 4, 5, 6, 7.008009) +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds + + +-- !query +select cast('1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('+1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('-1 second' as interval) +-- !query schema +struct +-- !query output +-1 seconds + + +-- !query +select cast('+ 1 second' as interval) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select cast('- 1 second' as interval) +-- !query schema +struct +-- !query output +-1 seconds + + +-- !query +select cast('- -1 second' as interval) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select cast('- +1 second' as interval) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select interval 13.123456789 seconds, interval -13.123456789 second +-- !query schema +struct +-- !query output +13.123456 seconds -13.123456 seconds + + +-- !query +select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond 9 microsecond +-- !query schema +struct +-- !query output +1 years 2 months 25 days 5 hours 6 minutes 7.008009 seconds + + +-- !query +select interval '30' year '25' month '-100' day '40' hour '80' minute '299.889987299' second +-- !query schema +struct +-- !query output +32 years 1 months -100 days 41 hours 24 minutes 59.889987 seconds + + +-- !query +select interval '0 0:0:0.1' day to second +-- !query schema +struct +-- !query output +0.1 seconds + + +-- !query +select interval '10-9' year to month +-- !query schema +struct +-- !query output +10 years 9 months + + +-- !query +select interval '20 15' day to hour +-- !query schema +struct +-- !query output +20 days 15 hours + + +-- !query +select interval '20 15:40' day to minute +-- !query schema +struct +-- !query output +20 days 
15 hours 40 minutes + + +-- !query +select interval '20 15:40:32.99899999' day to second +-- !query schema +struct +-- !query output +20 days 15 hours 40 minutes 32.998999 seconds + + +-- !query +select interval '15:40' hour to minute +-- !query schema +struct +-- !query output +15 hours 40 minutes + + +-- !query +select interval '15:40:32.99899999' hour to second +-- !query schema +struct +-- !query output +15 hours 40 minutes 32.998999 seconds + + +-- !query +select interval '40:32.99899999' minute to second +-- !query schema +struct +-- !query output +40 minutes 32.998999 seconds + + +-- !query +select interval '40:32' minute to second +-- !query schema +struct +-- !query output +40 minutes 32 seconds + + +-- !query +select interval 30 day day +-- !query schema +struct +-- !query output +30 days + + +-- !query +select interval '20 15:40:32.99899999' day to hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '20 15:40:32.99899999' day to hour +----------------^^^ + + +-- !query +select interval '20 15:40:32.99899999' day to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 20 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '20 15:40:32.99899999' day to minute +----------------^^^ + + +-- !query +select interval '15:40:32.99899999' hour to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval 
string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 15:40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '15:40:32.99899999' hour to minute +----------------^^^ + + +-- !query +select interval '15:40.99899999' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '15:40.99899999' hour to second +----------------^^^ + + +-- !query +select interval '15:40' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 15:40, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '15:40' hour to second +----------------^^^ + + +-- !query +select interval '20 40:32.99899999' minute to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 20 40:32.99899999, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +select interval '20 40:32.99899999' minute to second +----------------^^^ + + +-- !query +select interval 10 nanoseconds +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Error parsing ' 10 nanoseconds' to interval, invalid unit 
'nanoseconds'(line 1, pos 16) + +== SQL == +select interval 10 nanoseconds +----------------^^^ + + +-- !query +select map(1, interval 1 day, 2, interval 3 week) +-- !query schema +struct> +-- !query output +{1:1 days,2:21 days} + + +-- !query +select interval 'interval 3 year 1 hour' +-- !query schema +struct +-- !query output +3 years 1 hours + + +-- !query +select interval '3 year 1 hour' +-- !query schema +struct +-- !query output +3 years 1 hours + + +-- !query +select interval +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +at least one time unit should be given for interval literal(line 1, pos 7) + +== SQL == +select interval +-------^^^ + + +-- !query +select interval 1 fake_unit +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Error parsing ' 1 fake_unit' to interval, invalid unit 'fake_unit'(line 1, pos 16) + +== SQL == +select interval 1 fake_unit +----------------^^^ + + +-- !query +select interval 1 year to month +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +The value of from-to unit must be a string(line 1, pos 16) + +== SQL == +select interval 1 year to month +----------------^^^ + + +-- !query +select interval '1' year to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Intervals FROM year TO second are not supported.(line 1, pos 16) + +== SQL == +select interval '1' year to second +----------------^^^ + + +-- !query +select interval '10-9' year to month '2-1' year to month +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) + +== SQL == +select interval '10-9' year to month '2-1' year to month +-------------------------------------^^^ + + +-- !query +select interval '10-9' year to month '12:11:10' 
hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) + +== SQL == +select interval '10-9' year to month '12:11:10' hour to second +-------------------------------------^^^ + + +-- !query +select interval '1 15:11' day to minute '12:11:10' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 40) + +== SQL == +select interval '1 15:11' day to minute '12:11:10' hour to second +----------------------------------------^^^ + + +-- !query +select interval 1 year '2-1' year to month +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 23) + +== SQL == +select interval 1 year '2-1' year to month +-----------------------^^^ + + +-- !query +select interval 1 year '12:11:10' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 23) + +== SQL == +select interval 1 year '12:11:10' hour to second +-----------------------^^^ + + +-- !query +select interval '10-9' year to month '1' year +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 37) + +== SQL == +select interval '10-9' year to month '1' year +-------------------------------------^^^ + + +-- !query +select interval '12:11:10' hour to second '1' year +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Can only have a single from-to unit in the interval literal syntax(line 1, pos 42) + +== SQL == 
+select interval '12:11:10' hour to second '1' year +------------------------------------------^^^ + + +-- !query +select interval (-30) day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 + + +-- !query +select interval (a + 1) day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Undefined function: 'interval'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 + + +-- !query +select interval 30 day day day +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +extraneous input 'day' expecting (line 1, pos 27) + +== SQL == +select interval 30 day day day +---------------------------^^^ + + +-- !query +select sum(cast(null as interval)) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select sum(cast(v as interval)) from VALUES ('1 seconds') t(v) where 1=0 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select sum(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +3 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +1 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-3 seconds + + +-- !query +select sum(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-7 days 2 seconds + + +-- !query +select + i, + sum(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i +-- 
!query schema +struct +-- !query output +1 -2 days +2 2 seconds +3 NULL + + +-- !query +select + sum(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null +-- !query schema +struct +-- !query output +-2 days 2 seconds + + +-- !query +SELECT + i, + sum(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES(1, '1 seconds'), (1, '2 seconds'), (2, NULL), (2, NULL) t(i,v) +-- !query schema +struct +-- !query output +1 2 seconds +1 3 seconds +2 NULL +2 NULL + + +-- !query +select avg(cast(v as interval)) from VALUES (null) t(v) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) where 1=0 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select avg(cast(v as interval)) from VALUES ('1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +1.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +0.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 seconds'), ('-2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-1.5 seconds + + +-- !query +select avg(cast(v as interval)) from VALUES ('-1 weeks'), ('2 seconds'), (null) t(v) +-- !query schema +struct +-- !query output +-3 days -11 hours -59 minutes -59 seconds + + +-- !query +select + i, + avg(cast(v as interval)) +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +group by i +-- !query schema +struct +-- !query output +1 -1 days +2 2 seconds +3 NULL + + +-- !query +select + avg(cast(v as interval)) as sv +from VALUES (1, '-1 weeks'), (2, '2 seconds'), (3, null), (1, '5 days') t(i, v) +having sv is not null +-- !query schema +struct +-- !query output +-15 hours -59 minutes -59.333333 
seconds + + +-- !query +SELECT + i, + avg(cast(v as interval)) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) +FROM VALUES (1,'1 seconds'), (1,'2 seconds'), (2,NULL), (2,NULL) t(i,v) +-- !query schema +struct +-- !query output +1 1.5 seconds +1 2 seconds +2 NULL +2 NULL + + +-- !query +create temporary view interval_arithmetic as + select CAST(dateval AS date), CAST(tsval AS timestamp) from values + ('2012-01-01', '2012-01-01') + as interval_arithmetic(dateval, tsval) +-- !query schema +struct<> +-- !query output + + + +-- !query +select + dateval, + dateval - interval '2-2' year to month, + dateval - interval '-2-2' year to month, + dateval + interval '2-2' year to month, + dateval + interval '-2-2' year to month, + - interval '2-2' year to month + dateval, + interval '2-2' year to month + dateval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 2009-11-01 2014-03-01 2014-03-01 2009-11-01 2009-11-01 2014-03-01 + + +-- !query +select + tsval, + tsval - interval '2-2' year to month, + tsval - interval '-2-2' year to month, + tsval + interval '2-2' year to month, + tsval + interval '-2-2' year to month, + - interval '2-2' year to month + tsval, + interval '2-2' year to month + tsval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 2014-03-01 00:00:00 2009-11-01 00:00:00 2009-11-01 00:00:00 2014-03-01 00:00:00 + + +-- !query +select + interval '2-2' year to month + interval '3-3' year to month, + interval '2-2' year to month - interval '3-3' year to month +from interval_arithmetic +-- !query schema +struct<(INTERVAL '2 years 2 months' + INTERVAL '3 years 3 months'):interval,(INTERVAL '2 years 2 months' - INTERVAL '3 years 3 months'):interval> +-- !query output +5 years 5 months -1 years -1 months + + +-- !query +select + dateval, + dateval - interval '99 11:22:33.123456789' day to second, + dateval - interval '-99 11:22:33.123456789' 
day to second, + dateval + interval '99 11:22:33.123456789' day to second, + dateval + interval '-99 11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + dateval, + interval '99 11:22:33.123456789' day to second + dateval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 2011-09-23 2012-04-09 2012-04-09 2011-09-23 2011-09-23 2012-04-09 + + +-- !query +select + tsval, + tsval - interval '99 11:22:33.123456789' day to second, + tsval - interval '-99 11:22:33.123456789' day to second, + tsval + interval '99 11:22:33.123456789' day to second, + tsval + interval '-99 11:22:33.123456789' day to second, + -interval '99 11:22:33.123456789' day to second + tsval, + interval '99 11:22:33.123456789' day to second + tsval +from interval_arithmetic +-- !query schema +struct +-- !query output +2012-01-01 00:00:00 2011-09-23 12:37:26.876544 2012-04-09 11:22:33.123456 2012-04-09 11:22:33.123456 2011-09-23 12:37:26.876544 2011-09-23 12:37:26.876544 2012-04-09 11:22:33.123456 + + +-- !query +select + interval '99 11:22:33.123456789' day to second + interval '10 9:8:7.123456789' day to second, + interval '99 11:22:33.123456789' day to second - interval '10 9:8:7.123456789' day to second +from interval_arithmetic +-- !query schema +struct<(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' + INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval,(INTERVAL '99 days 11 hours 22 minutes 33.123456 seconds' - INTERVAL '10 days 9 hours 8 minutes 7.123456 seconds'):interval> +-- !query output +109 days 20 hours 30 minutes 40.246912 seconds 89 days 2 hours 14 minutes 26 seconds + + +-- !query +select interval '\t interval 1 day' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval 'interval \t 1\tday' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval 'interval\t1\tday' +-- !query schema +struct +-- !query output +1 days + + +-- !query +select 
interval '1\t' day +-- !query schema +struct +-- !query output +1 days + + +-- !query +select interval '1 ' day +-- !query schema +struct +-- !query output +1 days + + +-- !query +select -(a) from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<(- a):interval> +-- !query output +-178956970 years -8 months + + +-- !query +select a - b from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<(a - b):interval> +-- !query output +1 months + + +-- !query +select b + interval '1 month' from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<(b + INTERVAL '1 months'):interval> +-- !query output +-178956970 years -8 months + + +-- !query +select a * 1.1 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +select a / 0.5 from values (interval '-2147483648 months', interval '2147483647 months') t(a, b) +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow + + +-- !query +SELECT + from_csv('1, 1 day', 'a INT, b interval'), + to_csv(from_csv('1, 1 day', 'a INT, b interval')), + to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), + from_csv(to_csv(named_struct('a', interval 32 month, 'b', interval 70 minute)), 'a interval, b interval') +-- !query schema +struct,to_csv(from_csv(1, 1 day)):string,to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes')):string,from_csv(to_csv(named_struct(a, INTERVAL '2 years 8 months', b, INTERVAL '1 hours 10 minutes'))):struct> +-- !query output +{"a":1,"b":1 days} 1,1 days 2 years 8 months,1 hours 10 minutes {"a":2 years 8 months,"b":1 hours 10 minutes} + + +-- !query +SELECT + from_json('{"a":"1 days"}', 'a interval'), + to_json(from_json('{"a":"1 days"}', 'a 
interval')), + to_json(map('a', interval 25 month 100 day 130 minute)), + from_json(to_json(map('a', interval 25 month 100 day 130 minute)), 'a interval') +-- !query schema +struct,to_json(from_json({"a":"1 days"})):string,to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes')):string,from_json(to_json(map(a, INTERVAL '2 years 1 months 100 days 2 hours 10 minutes'))):struct> +-- !query output +{"a":1 days} {"a":"1 days"} {"a":"2 years 1 months 100 days 2 hours 10 minutes"} {"a":2 years 1 months 100 days 2 hours 10 minutes} diff --git a/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out b/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out index 857073a827f24..6b7edfb2356f4 100644 --- a/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out @@ -2,193 +2,193 @@ -- Number of queries: 24 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM t1 INNER JOIN empty_table --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT * FROM t1 CROSS JOIN empty_table --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT * FROM t1 LEFT OUTER JOIN empty_table --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 NULL --- !query 6 +-- !query SELECT * FROM t1 RIGHT OUTER JOIN 
empty_table --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output --- !query 7 +-- !query SELECT * FROM t1 FULL OUTER JOIN empty_table --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 NULL --- !query 8 +-- !query SELECT * FROM t1 LEFT SEMI JOIN empty_table --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output --- !query 9 +-- !query SELECT * FROM t1 LEFT ANTI JOIN empty_table --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 --- !query 10 +-- !query SELECT * FROM empty_table INNER JOIN t1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT * FROM empty_table CROSS JOIN t1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output --- !query 12 +-- !query SELECT * FROM empty_table LEFT OUTER JOIN t1 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output --- !query 13 +-- !query SELECT * FROM empty_table RIGHT OUTER JOIN t1 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output NULL 1 --- !query 14 +-- !query SELECT * FROM empty_table FULL OUTER JOIN t1 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output NULL 1 --- !query 15 +-- !query SELECT * FROM empty_table LEFT SEMI JOIN t1 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output --- !query 16 +-- !query SELECT * FROM empty_table LEFT ANTI JOIN t1 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output --- !query 17 +-- !query SELECT * FROM empty_table INNER JOIN empty_table --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output --- !query 18 +-- !query SELECT * FROM empty_table CROSS JOIN empty_table --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT * 
FROM empty_table LEFT OUTER JOIN empty_table --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT * FROM empty_table RIGHT OUTER JOIN empty_table --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output --- !query 21 +-- !query SELECT * FROM empty_table FULL OUTER JOIN empty_table --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output --- !query 22 +-- !query SELECT * FROM empty_table LEFT SEMI JOIN empty_table --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output --- !query 23 +-- !query SELECT * FROM empty_table LEFT ANTI JOIN empty_table --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index ca0cd90d94fa7..21a3531caf732 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -2,127 +2,127 @@ -- Number of queries: 42 --- !query 0 +-- !query select to_json(named_struct('a', 1, 'b', 2)) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output {"a":1,"b":2} --- !query 1 +-- !query select to_json(named_struct('time', to_timestamp('2015-08-26', 'yyyy-MM-dd')), map('timestampFormat', 'dd/MM/yyyy')) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output {"time":"26/08/2015"} --- !query 2 +-- !query select to_json(array(named_struct('a', 1, 'b', 2))) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output [{"a":1,"b":2}] --- !query 3 +-- !query select to_json(map(named_struct('a', 1, 'b', 2), named_struct('a', 1, 'b', 2))) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output {"[1,2]":{"a":1,"b":2}} --- !query 4 +-- !query 
select to_json(map('a', named_struct('a', 1, 'b', 2))) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output {"a":{"a":1,"b":2}} --- !query 5 +-- !query select to_json(map('a', 1)) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output {"a":1} --- !query 6 +-- !query select to_json(array(map('a',1))) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output [{"a":1}] --- !query 7 +-- !query select to_json(array(map('a',1), map('b',2))) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output [{"a":1},{"b":2}] --- !query 8 +-- !query select to_json(named_struct('a', 1, 'b', 2), named_struct('mode', 'PERMISSIVE')) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException Must use a map() function for options;; line 1 pos 7 --- !query 9 +-- !query select to_json(named_struct('a', 1, 'b', 2), map('mode', 1)) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException A type of keys and values in map() must be string, but got map;; line 1 pos 7 --- !query 10 +-- !query select to_json() --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException Invalid number of arguments for function to_json. 
Expected: one of 1 and 2; Found: 0; line 1 pos 7 --- !query 11 +-- !query select from_json('{"a":1}', 'a INT') --- !query 11 schema +-- !query schema struct> --- !query 11 output +-- !query output {"a":1} --- !query 12 +-- !query select from_json('{"time":"26/08/2015"}', 'time Timestamp', map('timestampFormat', 'dd/MM/yyyy')) --- !query 12 schema +-- !query schema struct> --- !query 12 output -{"time":2015-08-26 00:00:00.0} +-- !query output +{"time":2015-08-26 00:00:00} --- !query 13 +-- !query select from_json('{"a":1}', 1) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException Schema should be specified in DDL format as a string literal or output of the schema_of_json function instead of 1;; line 1 pos 7 --- !query 14 +-- !query select from_json('{"a":1}', 'a InvalidType') --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException DataType invalidtype is not supported.(line 1, pos 2) @@ -133,222 +133,222 @@ a InvalidType ; line 1 pos 7 --- !query 15 +-- !query select from_json('{"a":1}', 'a INT', named_struct('mode', 'PERMISSIVE')) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException Must use a map() function for options;; line 1 pos 7 --- !query 16 +-- !query select from_json('{"a":1}', 'a INT', map('mode', 1)) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException A type of keys and values in map() must be string, but got map;; line 1 pos 7 --- !query 17 +-- !query select from_json() --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException Invalid number of arguments for function from_json. 
Expected: one of 2 and 3; Found: 0; line 1 pos 7 --- !query 18 +-- !query SELECT json_tuple('{"a" : 1, "b" : 2}', CAST(NULL AS STRING), 'b', CAST(NULL AS STRING), 'a') --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output NULL 2 NULL 1 --- !query 19 +-- !query CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a') --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT json_tuple(jsonField, 'b', CAST(NULL AS STRING), a) FROM jsonTable --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 2 NULL 1 --- !query 21 +-- !query DROP VIEW IF EXISTS jsonTable --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query select from_json('{"a":1, "b":2}', 'map') --- !query 22 schema +-- !query schema struct> --- !query 22 output +-- !query output {"a":1,"b":2} --- !query 23 +-- !query select from_json('{"a":1, "b":"2"}', 'struct') --- !query 23 schema +-- !query schema struct> --- !query 23 output +-- !query output {"a":1,"b":"2"} --- !query 24 +-- !query select schema_of_json('{"c1":0, "c2":[1]}') --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output struct> --- !query 25 +-- !query select from_json('{"c1":[1, 2, 3]}', schema_of_json('{"c1":[0]}')) --- !query 25 schema +-- !query schema struct>> --- !query 25 output +-- !query output {"c1":[1,2,3]} --- !query 26 +-- !query select from_json('[1, 2, 3]', 'array') --- !query 26 schema +-- !query schema struct> --- !query 26 output +-- !query output [1,2,3] --- !query 27 +-- !query select from_json('[1, "2", 3]', 'array') --- !query 27 schema +-- !query schema struct> --- !query 27 output +-- !query output NULL --- !query 28 +-- !query select from_json('[1, 2, null]', 'array') --- !query 28 schema +-- !query schema struct> --- !query 28 output +-- !query output [1,2,null] --- !query 29 
+-- !query select from_json('[{"a": 1}, {"a":2}]', 'array>') --- !query 29 schema +-- !query schema struct>> --- !query 29 output +-- !query output [{"a":1},{"a":2}] --- !query 30 +-- !query select from_json('{"a": 1}', 'array>') --- !query 30 schema +-- !query schema struct>> --- !query 30 output +-- !query output [{"a":1}] --- !query 31 +-- !query select from_json('[null, {"a":2}]', 'array>') --- !query 31 schema +-- !query schema struct>> --- !query 31 output +-- !query output [null,{"a":2}] --- !query 32 +-- !query select from_json('[{"a": 1}, {"b":2}]', 'array>') --- !query 32 schema +-- !query schema struct>> --- !query 32 output +-- !query output [{"a":1},{"b":2}] --- !query 33 +-- !query select from_json('[{"a": 1}, 2]', 'array>') --- !query 33 schema +-- !query schema struct>> --- !query 33 output +-- !query output NULL --- !query 34 +-- !query select to_json(array('1', '2', '3')) --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output ["1","2","3"] --- !query 35 +-- !query select to_json(array(array(1, 2, 3), array(4))) --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output [[1,2,3],[4]] --- !query 36 +-- !query select schema_of_json('{"c1":1}', map('primitivesAsString', 'true')) --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output struct --- !query 37 +-- !query select schema_of_json('{"c1":01, "c2":0.1}', map('allowNumericLeadingZeros', 'true', 'prefersDecimal', 'true')) --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output struct --- !query 38 +-- !query select schema_of_json(null) --- !query 38 schema +-- !query schema struct<> --- !query 38 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'schema_of_json(NULL)' due to data type mismatch: The input json should be a string literal and not null; however, got NULL.; line 1 pos 7 --- !query 39 +-- !query CREATE TEMPORARY VIEW jsonTable(jsonField, a) AS 
SELECT * FROM VALUES ('{"a": 1, "b": 2}', 'a') --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output --- !query 40 +-- !query SELECT schema_of_json(jsonField) FROM jsonTable --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'schema_of_json(jsontable.`jsonField`)' due to data type mismatch: The input json should be a string literal and not null; however, got jsontable.`jsonField`.; line 1 pos 7 --- !query 41 +-- !query DROP VIEW IF EXISTS jsonTable --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/limit.sql.out b/sql/core/src/test/resources/sql-tests/results/limit.sql.out index 02fe1de84f753..281326e22a97a 100644 --- a/sql/core/src/test/resources/sql-tests/results/limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/limit.sql.out @@ -2,125 +2,125 @@ -- Number of queries: 14 --- !query 0 +-- !query SELECT * FROM testdata LIMIT 2 --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 1 2 2 --- !query 1 +-- !query SELECT * FROM arraydata LIMIT 2 --- !query 1 schema +-- !query schema struct,nestedarraycol:array>> --- !query 1 output +-- !query output [1,2,3] [[1,2,3]] [2,3,4] [[2,3,4]] --- !query 2 +-- !query SELECT * FROM mapdata LIMIT 2 --- !query 2 schema +-- !query schema struct> --- !query 2 output +-- !query output {1:"a1",2:"b1",3:"c1",4:"d1",5:"e1"} {1:"a2",2:"b2",3:"c2",4:"d2"} --- !query 3 +-- !query SELECT * FROM testdata LIMIT 2 + 1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 1 2 2 3 3 --- !query 4 +-- !query SELECT * FROM testdata LIMIT CAST(1 AS int) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 1 --- !query 5 +-- !query SELECT * FROM testdata LIMIT -1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output 
+-- !query output org.apache.spark.sql.AnalysisException The limit expression must be equal to or greater than 0, but got -1; --- !query 6 +-- !query SELECT * FROM testData TABLESAMPLE (-1 ROWS) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException The limit expression must be equal to or greater than 0, but got -1; --- !query 7 +-- !query SELECT * FROM testdata LIMIT CAST(1 AS INT) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 1 --- !query 8 +-- !query SELECT * FROM testdata LIMIT CAST(NULL AS INT) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException The evaluated limit expression must not be null, but got CAST(NULL AS INT); --- !query 9 +-- !query SELECT * FROM testdata LIMIT key > 3 --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException The limit expression must evaluate to a constant value, but got (testdata.`key` > 3); --- !query 10 +-- !query SELECT * FROM testdata LIMIT true --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException The limit expression must be integer type, but got boolean; --- !query 11 +-- !query SELECT * FROM testdata LIMIT 'a' --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException The limit expression must be integer type, but got string; --- !query 12 +-- !query SELECT * FROM (SELECT * FROM range(10) LIMIT 5) WHERE id > 3 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 4 --- !query 13 +-- !query SELECT * FROM testdata WHERE key < 3 LIMIT ALL --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 1 2 2 diff --git a/sql/core/src/test/resources/sql-tests/results/literals.sql.out 
b/sql/core/src/test/resources/sql-tests/results/literals.sql.out index e1e8d685e8787..f6720f6c5faa4 100644 --- a/sql/core/src/test/resources/sql-tests/results/literals.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/literals.sql.out @@ -1,44 +1,44 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 46 +-- Number of queries: 50 --- !query 0 +-- !query select null, Null, nUll --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output NULL NULL NULL --- !query 1 +-- !query select true, tRue, false, fALse --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output true true false false --- !query 2 +-- !query select 1Y --- !query 2 schema +-- !query schema struct<1:tinyint> --- !query 2 output +-- !query output 1 --- !query 3 +-- !query select 127Y, -128Y --- !query 3 schema +-- !query schema struct<127:tinyint,-128:tinyint> --- !query 3 output +-- !query output 127 -128 --- !query 4 +-- !query select 128Y --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Numeric literal 128 does not fit in range [-128, 127] for type tinyint(line 1, pos 7) @@ -48,27 +48,27 @@ select 128Y -------^^^ --- !query 5 +-- !query select 1S --- !query 5 schema +-- !query schema struct<1:smallint> --- !query 5 output +-- !query output 1 --- !query 6 +-- !query select 32767S, -32768S --- !query 6 schema +-- !query schema struct<32767:smallint,-32768:smallint> --- !query 6 output +-- !query output 32767 -32768 --- !query 7 +-- !query select 32768S --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Numeric literal 32768 does not fit in range [-32768, 32767] for type smallint(line 1, pos 7) @@ -78,27 +78,27 @@ select 32768S -------^^^ --- !query 8 +-- !query select 1L, 2147483648L --- !query 8 schema +-- !query schema 
struct<1:bigint,2147483648:bigint> --- !query 8 output +-- !query output 1 2147483648 --- !query 9 +-- !query select 9223372036854775807L, -9223372036854775808L --- !query 9 schema +-- !query schema struct<9223372036854775807:bigint,-9223372036854775808:bigint> --- !query 9 output +-- !query output 9223372036854775807 -9223372036854775808 --- !query 10 +-- !query select 9223372036854775808L --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Numeric literal 9223372036854775808 does not fit in range [-9223372036854775808, 9223372036854775807] for type bigint(line 1, pos 7) @@ -108,43 +108,43 @@ select 9223372036854775808L -------^^^ --- !query 11 +-- !query select 1, -1 --- !query 11 schema +-- !query schema struct<1:int,-1:int> --- !query 11 output +-- !query output 1 -1 --- !query 12 +-- !query select 2147483647, -2147483648 --- !query 12 schema +-- !query schema struct<2147483647:int,-2147483648:int> --- !query 12 output +-- !query output 2147483647 -2147483648 --- !query 13 +-- !query select 9223372036854775807, -9223372036854775808 --- !query 13 schema +-- !query schema struct<9223372036854775807:bigint,-9223372036854775808:bigint> --- !query 13 output +-- !query output 9223372036854775807 -9223372036854775808 --- !query 14 +-- !query select 9223372036854775808, -9223372036854775809 --- !query 14 schema +-- !query schema struct<9223372036854775808:decimal(19,0),-9223372036854775809:decimal(19,0)> --- !query 14 output +-- !query output 9223372036854775808 -9223372036854775809 --- !query 15 +-- !query select 1234567890123456789012345678901234567890 --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException decimal can only support precision up to 38 @@ -152,11 +152,11 @@ decimal can only support precision up to 38 select 1234567890123456789012345678901234567890 --- !query 16 +-- !query select 
1234567890123456789012345678901234567890.0 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException decimal can only support precision up to 38 @@ -164,27 +164,27 @@ decimal can only support precision up to 38 select 1234567890123456789012345678901234567890.0 --- !query 17 +-- !query select 1D, 1.2D, 1e10, 1.5e5, .10D, 0.10D, .1e5, .9e+2, 0.9e+2, 900e-1, 9.e+1 --- !query 17 schema -struct<1.0:double,1.2:double,1E+10:decimal(1,-10),1.5E+5:decimal(2,-4),0.1:double,0.1:double,1E+4:decimal(1,-4),9E+1:decimal(1,-1),9E+1:decimal(1,-1),90.0:decimal(3,1),9E+1:decimal(1,-1)> --- !query 17 output -1.0 1.2 10000000000 150000 0.1 0.1 10000 90 90 90 90 +-- !query schema +struct<1.0:double,1.2:double,1.0E10:double,150000.0:double,0.1:double,0.1:double,10000.0:double,90.0:double,90.0:double,90.0:double,90.0:double> +-- !query output +1.0 1.2 1.0E10 150000.0 0.1 0.1 10000.0 90.0 90.0 90.0 90.0 --- !query 18 +-- !query select -1D, -1.2D, -1e10, -1.5e5, -.10D, -0.10D, -.1e5 --- !query 18 schema -struct<-1.0:double,-1.2:double,-1E+10:decimal(1,-10),-1.5E+5:decimal(2,-4),-0.1:double,-0.1:double,-1E+4:decimal(1,-4)> --- !query 18 output --1.0 -1.2 -10000000000 -150000 -0.1 -0.1 -10000 +-- !query schema +struct<-1.0:double,-1.2:double,-1.0E10:double,-150000.0:double,-0.1:double,-0.1:double,-10000.0:double> +-- !query output +-1.0 -1.2 -1.0E10 -150000.0 -0.1 -0.1 -10000.0 --- !query 19 +-- !query select .e3 --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'select .'(line 1, pos 7) @@ -194,101 +194,107 @@ select .e3 -------^^^ --- !query 20 +-- !query select 1E309, -1E309 --- !query 20 schema -struct<1E+309:decimal(1,-309),-1E+309:decimal(1,-309)> --- !query 20 output 
-1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 -1000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Numeric literal 1E309 does not fit in range [-1.7976931348623157E+308, 1.7976931348623157E+308] for type double(line 1, pos 7) + +== SQL == +select 1E309, -1E309 +-------^^^ --- !query 21 +-- !query select 0.3, -0.8, .5, -.18, 0.1111, .1111 --- !query 21 schema +-- !query schema struct<0.3:decimal(1,1),-0.8:decimal(1,1),0.5:decimal(1,1),-0.18:decimal(2,2),0.1111:decimal(4,4),0.1111:decimal(4,4)> --- !query 21 output +-- !query output 0.3 -0.8 0.5 -0.18 0.1111 0.1111 --- !query 22 +-- !query select 123456789012345678901234567890123456789e10d, 123456789012345678901234567890123456789.1e10d --- !query 22 schema +-- !query schema struct<1.2345678901234568E48:double,1.2345678901234568E48:double> --- !query 22 output +-- !query output 1.2345678901234568E48 1.2345678901234568E48 --- !query 23 +-- !query select "Hello Peter!", 'hello lee!' --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output Hello Peter! hello lee! 
--- !query 24 +-- !query select 'hello' 'world', 'hello' " " 'lee' --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output helloworld hello lee --- !query 25 +-- !query select "hello 'peter'" --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output hello 'peter' --- !query 26 +-- !query select 'pattern%', 'no-pattern\%', 'pattern\\%', 'pattern\\\%' --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output pattern% no-pattern\% pattern\% pattern\\% --- !query 27 +-- !query select '\'', '"', '\n', '\r', '\t', 'Z' --- !query 27 schema +-- !query schema struct<':string,":string, :string, :string, :string,Z:string> --- !query 27 output +-- !query output ' " Z --- !query 28 +-- !query select '\110\145\154\154\157\041' --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output Hello! --- !query 29 +-- !query select '\u0057\u006F\u0072\u006C\u0064\u0020\u003A\u0029' --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output World :) --- !query 30 +-- !query select dAte '2016-03-12' --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 2016-03-12 --- !query 31 +-- !query select date 'mar 11 2016' --- !query 31 schema +-- !query schema struct<> --- !query 31 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Cannot parse the DATE value: mar 11 2016(line 1, pos 7) @@ -298,19 +304,19 @@ select date 'mar 11 2016' -------^^^ --- !query 32 +-- !query select tImEstAmp '2016-03-11 20:54:00.000' --- !query 32 schema -struct --- !query 32 output +-- !query schema +struct +-- !query output 2016-03-11 20:54:00 --- !query 33 +-- !query select timestamp '2016-33-11 20:54:00.000' --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Cannot parse the TIMESTAMP value: 2016-33-11 20:54:00.000(line 1, pos 7) @@ -320,41 
+326,11 @@ select timestamp '2016-33-11 20:54:00.000' -------^^^ --- !query 34 -select interval 13.123456789 seconds, interval -13.123456789 second --- !query 34 schema -struct --- !query 34 output -interval 13 seconds 123 milliseconds 456 microseconds interval -12 seconds -876 milliseconds -544 microseconds - - --- !query 35 -select interval 1 year 2 month 3 week 4 day 5 hour 6 minute 7 seconds 8 millisecond, 9 microsecond --- !query 35 schema -struct --- !query 35 output -interval 1 years 2 months 3 weeks 4 days 5 hours 6 minutes 7 seconds 8 milliseconds 9 - - --- !query 36 -select interval 10 nanoseconds --- !query 36 schema -struct<> --- !query 36 output -org.apache.spark.sql.catalyst.parser.ParseException - -no viable alternative at input 'interval 10 nanoseconds'(line 1, pos 19) - -== SQL == -select interval 10 nanoseconds --------------------^^^ - - --- !query 37 +-- !query select GEO '(10,-6)' --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Literals of type 'GEO' are currently not supported.(line 1, pos 7) @@ -364,19 +340,19 @@ select GEO '(10,-6)' -------^^^ --- !query 38 +-- !query select 90912830918230182310293801923652346786BD, 123.0E-28BD, 123.08BD --- !query 38 schema +-- !query schema struct<90912830918230182310293801923652346786:decimal(38,0),1.230E-26:decimal(29,29),123.08:decimal(5,2)> --- !query 38 output -90912830918230182310293801923652346786 0.0000000000000000000000000123 123.08 +-- !query output +90912830918230182310293801923652346786 0.00000000000000000000000001230 123.08 --- !query 39 +-- !query select 1.20E-38BD --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException decimal can only support precision up to 38(line 1, pos 7) @@ -386,19 +362,19 @@ select 1.20E-38BD -------^^^ --- !query 40 +-- !query select x'2379ACFe' --- !query 40 schema +-- !query schema struct --- 
!query 40 output +-- !query output #y�� --- !query 41 +-- !query select X'XuZ' --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException contains illegal character for hexBinary: 0XuZ(line 1, pos 7) @@ -408,33 +384,98 @@ select X'XuZ' -------^^^ --- !query 42 +-- !query SELECT 3.14, -3.14, 3.14e8, 3.14e-8, -3.14e8, -3.14e-8, 3.14e+8, 3.14E8, 3.14E-8 --- !query 42 schema -struct<3.14:decimal(3,2),-3.14:decimal(3,2),3.14E+8:decimal(3,-6),3.14E-8:decimal(10,10),-3.14E+8:decimal(3,-6),-3.14E-8:decimal(10,10),3.14E+8:decimal(3,-6),3.14E+8:decimal(3,-6),3.14E-8:decimal(10,10)> --- !query 42 output -3.14 -3.14 314000000 0.0000000314 -314000000 -0.0000000314 314000000 314000000 0.0000000314 - - --- !query 43 -select map(1, interval 1 day, 2, interval 3 week) --- !query 43 schema -struct> --- !query 43 output -{1:interval 1 days,2:interval 3 weeks} - - --- !query 44 -select interval 'interval 3 year 1 hour' --- !query 44 schema -struct --- !query 44 output -interval 3 years 1 hours - - --- !query 45 -select interval '3 year 1 hour' --- !query 45 schema -struct --- !query 45 output -NULL +-- !query schema +struct<3.14:decimal(3,2),-3.14:decimal(3,2),3.14E8:double,3.14E-8:double,-3.14E8:double,-3.14E-8:double,3.14E8:double,3.14E8:double,3.14E-8:double> +-- !query output +3.14 -3.14 3.14E8 3.14E-8 -3.14E8 -3.14E-8 3.14E8 3.14E8 3.14E-8 + + +-- !query +select +date '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ DATE '1999-01-01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'DATE '1999-01-01'' is of date type.; line 1 pos 7 + + +-- !query +select +timestamp '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ TIMESTAMP '1999-01-01 00:00:00')' due to data type mismatch: argument 1 requires (numeric or interval) type, 
however, 'TIMESTAMP '1999-01-01 00:00:00'' is of timestamp type.; line 1 pos 7 + + +-- !query +select +interval '1 day' +-- !query schema +struct<(+ INTERVAL '1 days'):interval> +-- !query output +1 days + + +-- !query +select +map(1, 2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ map(1, 2))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'map(1, 2)' is of map type.; line 1 pos 7 + + +-- !query +select +array(1,2) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ array(1, 2))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'array(1, 2)' is of array type.; line 1 pos 7 + + +-- !query +select +named_struct('a', 1, 'b', 'spark') +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ named_struct('a', 1, 'b', 'spark'))' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'named_struct('a', 1, 'b', 'spark')' is of struct type.; line 1 pos 7 + + +-- !query +select +X'1' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(+ X'01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'X'01'' is of binary type.; line 1 pos 7 + + +-- !query +select -date '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- DATE '1999-01-01')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'DATE '1999-01-01'' is of date type.; line 1 pos 7 + + +-- !query +select -timestamp '1999-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- TIMESTAMP '1999-01-01 00:00:00')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'TIMESTAMP '1999-01-01 00:00:00'' 
is of timestamp type.; line 1 pos 7 + + +-- !query +select -x'2379ACFe' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(- X'2379ACFE')' due to data type mismatch: argument 1 requires (numeric or interval) type, however, 'X'2379ACFE'' is of binary type.; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/misc-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/misc-functions.sql.out new file mode 100644 index 0000000000000..bd8ffb82ee129 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/misc-functions.sql.out @@ -0,0 +1,58 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 7 + + +-- !query +select typeof(null) +-- !query schema +struct +-- !query output +null + + +-- !query +select typeof(true) +-- !query schema +struct +-- !query output +boolean + + +-- !query +select typeof(1Y), typeof(1S), typeof(1), typeof(1L) +-- !query schema +struct +-- !query output +tinyint smallint int bigint + + +-- !query +select typeof(cast(1.0 as float)), typeof(1.0D), typeof(1.2) +-- !query schema +struct +-- !query output +float double decimal(2,1) + + +-- !query +select typeof(date '1986-05-23'), typeof(timestamp '1986-05-23'), typeof(interval '23 days') +-- !query schema +struct +-- !query output +date timestamp interval + + +-- !query +select typeof(x'ABCD'), typeof('SPARK') +-- !query schema +struct +-- !query output +binary string + + +-- !query +select typeof(array(1, 2)), typeof(map(1, 2)), typeof(named_struct('a', 1, 'b', 'spark')) +-- !query schema +struct +-- !query output +array map struct diff --git a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out index 43f2f9af61d9b..13f319700df3f 100644 --- a/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/natural-join.sql.out @@ -2,63 +2,63 @@ -- 
Number of queries: 6 --- !query 0 +-- !query create temporary view nt1 as select * from values ("one", 1), ("two", 2), ("three", 3) as nt1(k, v1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) as nt2(k, v2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM nt1 natural join nt2 where k = "one" --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 1 one 1 5 --- !query 3 +-- !query SELECT * FROM nt1 natural left join nt2 order by v1, v2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 1 one 1 5 two 2 22 three 3 NULL --- !query 4 +-- !query SELECT * FROM nt1 natural right join nt2 order by v1, v2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 1 one 1 5 two 2 22 --- !query 5 +-- !query SELECT count(*) FROM nt1 natural full outer join nt2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 4 diff --git a/sql/core/src/test/resources/sql-tests/results/null-handling.sql.out b/sql/core/src/test/resources/sql-tests/results/null-handling.sql.out index 5005dfeb6cd14..5e7eec56743b1 100644 --- a/sql/core/src/test/resources/sql-tests/results/null-handling.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/null-handling.sql.out @@ -2,75 +2,75 @@ -- Number of queries: 28 --- !query 0 +-- !query create table t1(a int, b int, c int) using parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query insert into t1 values(1,0,0) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query insert into t1 values(2,0,1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 
+-- !query insert into t1 values(3,1,0) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query insert into t1 values(4,1,1) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query insert into t1 values(5,null,0) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query insert into t1 values(6,null,1) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query insert into t1 values(7,null,null) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query select a, b+c from t1 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 0 2 1 3 1 @@ -80,11 +80,11 @@ struct 7 NULL --- !query 9 +-- !query select a+10, b*0 from t1 --- !query 9 schema +-- !query schema struct<(a + 10):int,(b * 0):int> --- !query 9 output +-- !query output 11 0 12 0 13 0 @@ -94,31 +94,31 @@ struct<(a + 10):int,(b * 0):int> 17 NULL --- !query 10 +-- !query select distinct b from t1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 0 1 NULL --- !query 11 +-- !query select b from t1 union select b from t1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 0 1 NULL --- !query 12 +-- !query select a+20, case b when c then 1 else 0 end from t1 --- !query 12 schema +-- !query schema struct<(a + 20):int,CASE WHEN (b = c) THEN 1 ELSE 0 END:int> --- !query 12 output +-- !query output 21 1 22 0 23 0 @@ -128,11 +128,11 @@ struct<(a + 20):int,CASE WHEN (b = c) THEN 1 ELSE 0 END:int> 27 0 --- !query 13 +-- !query select a+30, case c when b then 1 else 0 end from t1 --- !query 13 schema +-- !query schema struct<(a + 30):int,CASE WHEN (c = b) THEN 1 ELSE 0 END:int> --- !query 13 output +-- !query output 31 1 32 0 33 0 @@ -142,11 +142,11 @@ struct<(a + 
30):int,CASE WHEN (c = b) THEN 1 ELSE 0 END:int> 37 0 --- !query 14 +-- !query select a+40, case when b<>0 then 1 else 0 end from t1 --- !query 14 schema +-- !query schema struct<(a + 40):int,CASE WHEN (NOT (b = 0)) THEN 1 ELSE 0 END:int> --- !query 14 output +-- !query output 41 0 42 0 43 1 @@ -156,11 +156,11 @@ struct<(a + 40):int,CASE WHEN (NOT (b = 0)) THEN 1 ELSE 0 END:int> 47 0 --- !query 15 +-- !query select a+50, case when not b<>0 then 1 else 0 end from t1 --- !query 15 schema +-- !query schema struct<(a + 50):int,CASE WHEN (NOT (NOT (b = 0))) THEN 1 ELSE 0 END:int> --- !query 15 output +-- !query output 51 1 52 1 53 0 @@ -170,11 +170,11 @@ struct<(a + 50):int,CASE WHEN (NOT (NOT (b = 0))) THEN 1 ELSE 0 END:int> 57 0 --- !query 16 +-- !query select a+60, case when b<>0 and c<>0 then 1 else 0 end from t1 --- !query 16 schema +-- !query schema struct<(a + 60):int,CASE WHEN ((NOT (b = 0)) AND (NOT (c = 0))) THEN 1 ELSE 0 END:int> --- !query 16 output +-- !query output 61 0 62 0 63 0 @@ -184,11 +184,11 @@ struct<(a + 60):int,CASE WHEN ((NOT (b = 0)) AND (NOT (c = 0))) THEN 1 ELSE 0 EN 67 0 --- !query 17 +-- !query select a+70, case when not (b<>0 and c<>0) then 1 else 0 end from t1 --- !query 17 schema +-- !query schema struct<(a + 70):int,CASE WHEN (NOT ((NOT (b = 0)) AND (NOT (c = 0)))) THEN 1 ELSE 0 END:int> --- !query 17 output +-- !query output 71 1 72 1 73 1 @@ -198,11 +198,11 @@ struct<(a + 70):int,CASE WHEN (NOT ((NOT (b = 0)) AND (NOT (c = 0)))) THEN 1 ELS 77 0 --- !query 18 +-- !query select a+80, case when b<>0 or c<>0 then 1 else 0 end from t1 --- !query 18 schema +-- !query schema struct<(a + 80):int,CASE WHEN ((NOT (b = 0)) OR (NOT (c = 0))) THEN 1 ELSE 0 END:int> --- !query 18 output +-- !query output 81 0 82 1 83 1 @@ -212,11 +212,11 @@ struct<(a + 80):int,CASE WHEN ((NOT (b = 0)) OR (NOT (c = 0))) THEN 1 ELSE 0 END 87 0 --- !query 19 +-- !query select a+90, case when not (b<>0 or c<>0) then 1 else 0 end from t1 --- !query 19 schema +-- !query 
schema struct<(a + 90):int,CASE WHEN (NOT ((NOT (b = 0)) OR (NOT (c = 0)))) THEN 1 ELSE 0 END:int> --- !query 19 output +-- !query output 91 1 92 0 93 0 @@ -226,41 +226,41 @@ struct<(a + 90):int,CASE WHEN (NOT ((NOT (b = 0)) OR (NOT (c = 0)))) THEN 1 ELSE 97 0 --- !query 20 +-- !query select count(*), count(b), sum(b), avg(b), min(b), max(b) from t1 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 7 4 2 0.5 0 1 --- !query 21 +-- !query select a+100 from t1 where b<10 --- !query 21 schema +-- !query schema struct<(a + 100):int> --- !query 21 output +-- !query output 101 102 103 104 --- !query 22 +-- !query select a+110 from t1 where not b>10 --- !query 22 schema +-- !query schema struct<(a + 110):int> --- !query 22 output +-- !query output 111 112 113 114 --- !query 23 +-- !query select a+120 from t1 where b<10 OR c=1 --- !query 23 schema +-- !query schema struct<(a + 120):int> --- !query 23 output +-- !query output 121 122 123 @@ -268,38 +268,38 @@ struct<(a + 120):int> 126 --- !query 24 +-- !query select a+130 from t1 where b<10 AND c=1 --- !query 24 schema +-- !query schema struct<(a + 130):int> --- !query 24 output +-- !query output 132 134 --- !query 25 +-- !query select a+140 from t1 where not (b<10 AND c=1) --- !query 25 schema +-- !query schema struct<(a + 140):int> --- !query 25 output +-- !query output 141 143 145 --- !query 26 +-- !query select a+150 from t1 where not (c=1 AND b<10) --- !query 26 schema +-- !query schema struct<(a + 150):int> --- !query 26 output +-- !query output 151 153 155 --- !query 27 +-- !query drop table t1 --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/null-propagation.sql.out b/sql/core/src/test/resources/sql-tests/results/null-propagation.sql.out index ed3a651aa6614..76a41f9170388 100644 --- a/sql/core/src/test/resources/sql-tests/results/null-propagation.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/null-propagation.sql.out @@ -2,37 +2,37 @@ -- Number of queries: 4 --- !query 0 +-- !query SELECT COUNT(NULL) FROM VALUES 1, 2, 3 --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 0 --- !query 1 +-- !query SELECT COUNT(1 + NULL) FROM VALUES 1, 2, 3 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 0 --- !query 2 +-- !query SELECT COUNT(NULL) OVER () FROM VALUES 1, 2, 3 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 0 0 0 --- !query 3 +-- !query SELECT COUNT(1 + NULL) OVER () FROM VALUES 1, 2, 3 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 0 0 0 diff --git a/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out b/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out index 75736bee669b0..3f933f4c0e449 100644 --- a/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operator-div.sql.out @@ -2,145 +2,145 @@ -- Number of queries: 18 --- !query 0 +-- !query set spark.sql.legacy.integralDivide.returnBigint=true --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output spark.sql.legacy.integralDivide.returnBigint true --- !query 1 +-- !query select 5 div 2 --- !query 1 schema +-- !query schema struct<(5 div 2):bigint> --- !query 1 output +-- !query output 2 --- !query 2 +-- !query select 5 div 0 --- !query 2 schema +-- !query schema struct<(5 div 0):bigint> --- !query 2 output +-- !query output NULL --- !query 3 +-- !query select 5 div null --- !query 3 schema +-- !query schema struct<(5 div CAST(NULL AS INT)):bigint> --- !query 3 output +-- !query output NULL --- !query 4 +-- !query select null div 5 --- !query 4 schema +-- !query schema struct<(CAST(NULL AS INT) div 5):bigint> --- !query 4 output +-- !query output NULL --- !query 5 +-- !query select cast(51 as 
decimal(10, 0)) div cast(2 as decimal(2, 0)) --- !query 5 schema +-- !query schema struct<(CAST(CAST(51 AS DECIMAL(10,0)) AS DECIMAL(10,0)) div CAST(CAST(2 AS DECIMAL(2,0)) AS DECIMAL(10,0))):bigint> --- !query 5 output +-- !query output 25 --- !query 6 +-- !query select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)) --- !query 6 schema +-- !query schema struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(0 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query 6 output +-- !query output NULL --- !query 7 +-- !query select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)) --- !query 7 schema +-- !query schema struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(NULL AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query 7 output +-- !query output NULL --- !query 8 +-- !query select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)) --- !query 8 schema +-- !query schema struct<(CAST(CAST(NULL AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(5 AS DECIMAL(2,0)) AS DECIMAL(2,0))):bigint> --- !query 8 output +-- !query output NULL --- !query 9 +-- !query set spark.sql.legacy.integralDivide.returnBigint=false --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output spark.sql.legacy.integralDivide.returnBigint false --- !query 10 +-- !query select 5 div 2 --- !query 10 schema +-- !query schema struct<(5 div 2):int> --- !query 10 output +-- !query output 2 --- !query 11 +-- !query select 5 div 0 --- !query 11 schema +-- !query schema struct<(5 div 0):int> --- !query 11 output +-- !query output NULL --- !query 12 +-- !query select 5 div null --- !query 12 schema +-- !query schema struct<(5 div CAST(NULL AS INT)):int> --- !query 12 output +-- !query output NULL --- !query 13 +-- !query select null div 5 --- !query 13 schema +-- !query schema struct<(CAST(NULL AS INT) div 5):int> --- !query 13 output +-- !query output NULL --- !query 14 +-- !query select cast(51 as decimal(10, 0)) div cast(2 as 
decimal(2, 0)) --- !query 14 schema +-- !query schema struct<(CAST(CAST(51 AS DECIMAL(10,0)) AS DECIMAL(10,0)) div CAST(CAST(2 AS DECIMAL(2,0)) AS DECIMAL(10,0))):decimal(10,0)> --- !query 14 output +-- !query output 25 --- !query 15 +-- !query select cast(5 as decimal(1, 0)) div cast(0 as decimal(2, 0)) --- !query 15 schema +-- !query schema struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(0 AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query 15 output +-- !query output NULL --- !query 16 +-- !query select cast(5 as decimal(1, 0)) div cast(null as decimal(2, 0)) --- !query 16 schema +-- !query schema struct<(CAST(CAST(5 AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(NULL AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query 16 output +-- !query output NULL --- !query 17 +-- !query select cast(null as decimal(1, 0)) div cast(5 as decimal(2, 0)) --- !query 17 schema +-- !query schema struct<(CAST(CAST(NULL AS DECIMAL(1,0)) AS DECIMAL(2,0)) div CAST(CAST(5 AS DECIMAL(2,0)) AS DECIMAL(2,0))):decimal(1,0)> --- !query 17 output +-- !query output NULL diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index e0cbd575bc346..548281014afd7 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -2,393 +2,393 @@ -- Number of queries: 49 --- !query 0 +-- !query select -100 --- !query 0 schema +-- !query schema struct<-100:int> --- !query 0 output +-- !query output -100 --- !query 1 +-- !query select +230 --- !query 1 schema -struct<230:int> --- !query 1 output +-- !query schema +struct<(+ 230):int> +-- !query output 230 --- !query 2 +-- !query select -5.2 --- !query 2 schema +-- !query schema struct<-5.2:decimal(2,1)> --- !query 2 output +-- !query output -5.2 --- !query 3 +-- !query select +6.8e0 --- !query 3 schema -struct<6.8:decimal(2,1)> --- !query 3 
output +-- !query schema +struct<(+ 6.8):double> +-- !query output 6.8 --- !query 4 +-- !query select -key, +key from testdata where key = 2 --- !query 4 schema -struct<(- key):int,key:int> --- !query 4 output +-- !query schema +struct<(- key):int,(+ key):int> +-- !query output -2 2 --- !query 5 +-- !query select -(key + 1), - key + 1, +(key + 5) from testdata where key = 1 --- !query 5 schema -struct<(- (key + 1)):int,((- key) + 1):int,(key + 5):int> --- !query 5 output +-- !query schema +struct<(- (key + 1)):int,((- key) + 1):int,(+ (key + 5)):int> +-- !query output -2 0 6 --- !query 6 +-- !query select -max(key), +max(key) from testdata --- !query 6 schema -struct<(- max(key)):int,max(key):int> --- !query 6 output +-- !query schema +struct<(- max(key)):int,(+ max(key)):int> +-- !query output -100 100 --- !query 7 +-- !query select - (-10) --- !query 7 schema +-- !query schema struct<(- -10):int> --- !query 7 output +-- !query output 10 --- !query 8 +-- !query select + (-key) from testdata where key = 32 --- !query 8 schema -struct<(- key):int> --- !query 8 output +-- !query schema +struct<(+ (- key)):int> +-- !query output -32 --- !query 9 +-- !query select - (+max(key)) from testdata --- !query 9 schema -struct<(- max(key)):int> --- !query 9 output +-- !query schema +struct<(- (+ max(key))):int> +-- !query output -100 --- !query 10 +-- !query select - - 3 --- !query 10 schema +-- !query schema struct<(- -3):int> --- !query 10 output +-- !query output 3 --- !query 11 +-- !query select - + 20 --- !query 11 schema -struct<(- 20):int> --- !query 11 output +-- !query schema +struct<(- (+ 20)):int> +-- !query output -20 --- !query 12 +-- !query select + + 100 --- !query 12 schema -struct<100:int> --- !query 12 output +-- !query schema +struct<(+ (+ 100)):int> +-- !query output 100 --- !query 13 +-- !query select - - max(key) from testdata --- !query 13 schema +-- !query schema struct<(- (- max(key))):int> --- !query 13 output +-- !query output 100 --- !query 14 +-- 
!query select + - key from testdata where key = 33 --- !query 14 schema -struct<(- key):int> --- !query 14 output +-- !query schema +struct<(+ (- key)):int> +-- !query output -33 --- !query 15 +-- !query select 5 / 2 --- !query 15 schema +-- !query schema struct<(CAST(5 AS DOUBLE) / CAST(2 AS DOUBLE)):double> --- !query 15 output +-- !query output 2.5 --- !query 16 +-- !query select 5 / 0 --- !query 16 schema +-- !query schema struct<(CAST(5 AS DOUBLE) / CAST(0 AS DOUBLE)):double> --- !query 16 output +-- !query output NULL --- !query 17 +-- !query select 5 / null --- !query 17 schema +-- !query schema struct<(CAST(5 AS DOUBLE) / CAST(NULL AS DOUBLE)):double> --- !query 17 output +-- !query output NULL --- !query 18 +-- !query select null / 5 --- !query 18 schema +-- !query schema struct<(CAST(NULL AS DOUBLE) / CAST(5 AS DOUBLE)):double> --- !query 18 output +-- !query output NULL --- !query 19 +-- !query select 1 + 2 --- !query 19 schema +-- !query schema struct<(1 + 2):int> --- !query 19 output +-- !query output 3 --- !query 20 +-- !query select 1 - 2 --- !query 20 schema +-- !query schema struct<(1 - 2):int> --- !query 20 output +-- !query output -1 --- !query 21 +-- !query select 2 * 5 --- !query 21 schema +-- !query schema struct<(2 * 5):int> --- !query 21 output +-- !query output 10 --- !query 22 +-- !query select 5 % 3 --- !query 22 schema +-- !query schema struct<(5 % 3):int> --- !query 22 output +-- !query output 2 --- !query 23 +-- !query select pmod(-7, 3) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 2 --- !query 24 +-- !query select cot(1) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 0.6420926159343306 --- !query 25 +-- !query select cot(null) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output NULL --- !query 26 +-- !query select cot(0) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output Infinity --- !query 
27 +-- !query select cot(-1) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output -0.6420926159343306 --- !query 28 +-- !query select ceiling(0) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 0 --- !query 29 +-- !query select ceiling(1) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1 --- !query 30 +-- !query select ceil(1234567890123456) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1234567890123456 --- !query 31 +-- !query select ceiling(1234567890123456) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 1234567890123456 --- !query 32 +-- !query select ceil(0.01) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 1 --- !query 33 +-- !query select ceiling(-0.10) --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 0 --- !query 34 +-- !query select floor(0) --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 0 --- !query 35 +-- !query select floor(1) --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 1 --- !query 36 +-- !query select floor(1234567890123456) --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 1234567890123456 --- !query 37 +-- !query select floor(0.01) --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 0 --- !query 38 +-- !query select floor(-0.10) --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output -1 --- !query 39 +-- !query select 1 > 0.00001 --- !query 39 schema +-- !query schema struct<(CAST(1 AS BIGINT) > 0):boolean> --- !query 39 output +-- !query output true --- !query 40 +-- !query select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null) --- !query 40 schema +-- !query schema struct<(7 % 2):int,(7 % 
0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> --- !query 40 output +-- !query output 1 NULL 0 NULL NULL NULL --- !query 41 +-- !query select BIT_LENGTH('abc') --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 24 --- !query 42 +-- !query select CHAR_LENGTH('abc') --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output 3 --- !query 43 +-- !query select CHARACTER_LENGTH('abc') --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 3 --- !query 44 +-- !query select OCTET_LENGTH('abc') --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 3 --- !query 45 +-- !query select abs(-3.13), abs('-2.19') --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 3.13 2.19 --- !query 46 +-- !query select positive('-1.11'), positive(-1.11), negative('-1.11'), negative(-1.11) --- !query 46 schema +-- !query schema struct<(+ CAST(-1.11 AS DOUBLE)):double,(+ -1.11):decimal(3,2),(- CAST(-1.11 AS DOUBLE)):double,(- -1.11):decimal(3,2)> --- !query 46 output +-- !query output -1.11 -1.11 1.11 1.11 --- !query 47 +-- !query select pmod(-7, 2), pmod(0, 2), pmod(7, 0), pmod(7, null), pmod(null, 2), pmod(null, null) --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 1 0 NULL NULL NULL NULL --- !query 48 +-- !query select pmod(cast(3.13 as decimal), cast(0 as decimal)), pmod(cast(2 as smallint), cast(0 as smallint)) --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output NULL NULL diff --git a/sql/core/src/test/resources/sql-tests/results/order-by-nulls-ordering.sql.out b/sql/core/src/test/resources/sql-tests/results/order-by-nulls-ordering.sql.out index c1b63dfb8caef..67d271790eef0 100644 --- a/sql/core/src/test/resources/sql-tests/results/order-by-nulls-ordering.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/order-by-nulls-ordering.sql.out @@ -2,32 +2,32 @@ -- Number of queries: 17 --- !query 0 +-- !query create table spark_10747(col1 int, col2 int, col3 int) using parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO spark_10747 VALUES (6, 12, 10), (6, 11, 4), (6, 9, 10), (6, 15, 8), (6, 15, 8), (6, 7, 4), (6, 7, 8), (6, 13, null), (6, 10, null) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query select col1, col2, col3, sum(col2) over (partition by col1 order by col3 desc nulls last, col2 rows between 2 preceding and 2 following ) as sum_col2 from spark_10747 where col1 = 6 order by sum_col2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 6 9 10 28 6 13 NULL 34 6 10 NULL 41 @@ -39,15 +39,15 @@ struct 6 7 4 58 --- !query 3 +-- !query select col1, col2, col3, sum(col2) over (partition by col1 order by col3 desc nulls first, col2 rows between 2 preceding and 2 following ) as sum_col2 from spark_10747 where col1 = 6 order by sum_col2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 6 10 NULL 32 6 11 4 33 6 13 NULL 44 @@ -59,15 +59,15 @@ struct 6 7 8 58 --- !query 4 +-- !query select col1, col2, col3, sum(col2) over (partition by col1 order by col3 asc nulls last, col2 rows between 2 preceding and 2 following ) as sum_col2 from spark_10747 where col1 = 6 order by sum_col2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 6 7 4 25 6 13 NULL 35 6 11 4 40 @@ -79,15 +79,15 @@ struct 6 9 10 61 --- !query 5 +-- !query select col1, col2, col3, sum(col2) over (partition by col1 order by col3 asc nulls first, col2 rows between 2 preceding and 2 following ) as sum_col2 from spark_10747 where col1 = 6 order by sum_col2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 6 10 
NULL 30 6 12 10 36 6 13 NULL 41 @@ -99,11 +99,11 @@ struct 6 15 8 58 --- !query 6 +-- !query SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 ASC NULLS FIRST, COL2 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 6 10 NULL 6 13 NULL 6 7 4 @@ -115,11 +115,11 @@ struct 6 12 10 --- !query 7 +-- !query SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 NULLS LAST, COL2 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 6 7 4 6 11 4 6 7 8 @@ -131,11 +131,11 @@ struct 6 13 NULL --- !query 8 +-- !query SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 DESC NULLS FIRST, COL2 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 6 10 NULL 6 13 NULL 6 9 10 @@ -147,11 +147,11 @@ struct 6 11 4 --- !query 9 +-- !query SELECT COL1, COL2, COL3 FROM spark_10747 ORDER BY COL3 DESC NULLS LAST, COL2 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 6 9 10 6 12 10 6 7 8 @@ -163,15 +163,15 @@ struct 6 13 NULL --- !query 10 +-- !query drop table spark_10747 --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query create table spark_10747_mix( col1 string, col2 int, @@ -179,13 +179,13 @@ col3 double, col4 decimal(10,2), col5 decimal(20,1)) using parquet --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query INSERT INTO spark_10747_mix VALUES ('b', 2, 1.0, 1.00, 10.0), ('d', 3, 2.0, 3.00, 0.0), @@ -195,60 +195,60 @@ INSERT INTO spark_10747_mix VALUES ('d', 3, null, 4.00, 1.0), ('a', 1, 1.0, 1.00, null), ('c', 3, 2.0, 2.00, null) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query select * from spark_10747_mix order by col1 nulls last, col5 nulls last --- !query 13 schema +-- !query schema struct --- !query 13 output -a 1 1.0 1 NULL -b 2 1.0 1 10 -c 3 2.0 2 15.1 -c 3 2.0 2 NULL -d 
3 2.0 3 0 -d 3 0.0 3 1 -d 3 NULL 4 1 -NULL 3 0.0 3 1 - - --- !query 14 +-- !query output +a 1 1.0 1.00 NULL +b 2 1.0 1.00 10.0 +c 3 2.0 2.00 15.1 +c 3 2.0 2.00 NULL +d 3 2.0 3.00 0.0 +d 3 0.0 3.00 1.0 +d 3 NULL 4.00 1.0 +NULL 3 0.0 3.00 1.0 + + +-- !query select * from spark_10747_mix order by col1 desc nulls first, col5 desc nulls first --- !query 14 schema +-- !query schema struct --- !query 14 output -NULL 3 0.0 3 1 -d 3 0.0 3 1 -d 3 NULL 4 1 -d 3 2.0 3 0 -c 3 2.0 2 NULL -c 3 2.0 2 15.1 -b 2 1.0 1 10 -a 1 1.0 1 NULL - - --- !query 15 +-- !query output +NULL 3 0.0 3.00 1.0 +d 3 0.0 3.00 1.0 +d 3 NULL 4.00 1.0 +d 3 2.0 3.00 0.0 +c 3 2.0 2.00 NULL +c 3 2.0 2.00 15.1 +b 2 1.0 1.00 10.0 +a 1 1.0 1.00 NULL + + +-- !query select * from spark_10747_mix order by col5 desc nulls first, col3 desc nulls last --- !query 15 schema +-- !query schema struct --- !query 15 output -c 3 2.0 2 NULL -a 1 1.0 1 NULL -c 3 2.0 2 15.1 -b 2 1.0 1 10 -d 3 0.0 3 1 -NULL 3 0.0 3 1 -d 3 NULL 4 1 -d 3 2.0 3 0 - - --- !query 16 +-- !query output +c 3 2.0 2.00 NULL +a 1 1.0 1.00 NULL +c 3 2.0 2.00 15.1 +b 2 1.0 1.00 10.0 +d 3 0.0 3.00 1.0 +NULL 3 0.0 3.00 1.0 +d 3 NULL 4.00 1.0 +d 3 2.0 3.00 0.0 + + +-- !query drop table spark_10747_mix --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/order-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/order-by-ordinal.sql.out index cc47cc67c87c8..44c811a7439c0 100644 --- a/sql/core/src/test/resources/sql-tests/results/order-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/order-by-ordinal.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 12 --- !query 0 +-- !query create temporary view data as select * from values (1, 1), (1, 2), @@ -11,17 +11,17 @@ create temporary view data as select * from values (3, 1), (3, 2) as data(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- 
!query select * from data order by 1 desc --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 3 1 3 2 2 1 @@ -30,11 +30,11 @@ struct 1 2 --- !query 2 +-- !query select * from data order by 1 desc, b desc --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 3 2 3 1 2 2 @@ -43,11 +43,11 @@ struct 1 1 --- !query 3 +-- !query select * from data order by 1 desc, 2 desc --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 3 2 3 1 2 2 @@ -56,11 +56,11 @@ struct 1 1 --- !query 4 +-- !query select * from data order by 1 + 0 desc, b desc --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 2 2 2 3 2 @@ -69,38 +69,38 @@ struct 3 1 --- !query 5 +-- !query select * from data order by 0 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException ORDER BY position 0 is not in select list (valid range is [1, 2]); line 1 pos 28 --- !query 6 +-- !query select * from data order by -1 --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException ORDER BY position -1 is not in select list (valid range is [1, 2]); line 1 pos 28 --- !query 7 +-- !query select * from data order by 3 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException ORDER BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 28 --- !query 8 +-- !query select * from data sort by 1 desc --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 1 1 2 2 1 @@ -109,19 +109,19 @@ struct 3 2 --- !query 9 +-- !query set spark.sql.orderByOrdinal=false --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output spark.sql.orderByOrdinal false --- !query 10 +-- !query select * from data order by 0 --- !query 10 schema +-- !query schema struct --- 
!query 10 output +-- !query output 1 1 1 2 2 1 @@ -130,11 +130,11 @@ struct 3 2 --- !query 11 +-- !query select * from data sort by 0 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 1 1 2 2 1 diff --git a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out index 5db3bae5d0379..703ce231c53ff 100644 --- a/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/outer-join.sql.out @@ -1,28 +1,28 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 8 +-- Number of queries: 6 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (-234), (145), (367), (975), (298) as t1(int_col1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) as t2(int_col0, int_col1) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT (SUM(COALESCE(t1.int_col1, t2.int_col0))), ((COALESCE(t1.int_col1, t2.int_col0)) * 2) @@ -33,40 +33,32 @@ GROUP BY GREATEST(COALESCE(t2.int_col1, 109), COALESCE(t1.int_col1, -449)), COALESCE(t1.int_col1, t2.int_col0) HAVING (SUM(COALESCE(t1.int_col1, t2.int_col0))) > ((COALESCE(t1.int_col1, t2.int_col0)) * 2) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output -367 -734 -507 -1014 -769 -1538 -800 -1600 --- !query 3 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query 
output --- !query 5 -set spark.sql.crossJoin.enabled = true --- !query 5 schema -struct --- !query 5 output -spark.sql.crossJoin.enabled true - - --- !query 6 +-- !query SELECT * FROM ( SELECT @@ -74,15 +66,7 @@ SELECT FROM t1 LEFT JOIN t2 ON false ) t where (t.int_col) is not null --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 97 - - --- !query 7 -set spark.sql.crossJoin.enabled = false --- !query 7 schema -struct --- !query 7 output -spark.sql.crossJoin.enabled false diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out deleted file mode 100644 index 2b5371a657196..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part2.sql.out +++ /dev/null @@ -1,156 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 16 - - --- !query 0 -create temporary view int4_tbl as select * from values - (0), - (123456), - (-123456), - (2147483647), - (-2147483647) - as int4_tbl(f1) --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -SELECT - (NULL AND NULL) IS NULL AS `t`, - (TRUE AND NULL) IS NULL AS `t`, - (FALSE AND NULL) IS NULL AS `t`, - (NULL AND TRUE) IS NULL AS `t`, - (NULL AND FALSE) IS NULL AS `t`, - (TRUE AND TRUE) AS `t`, - NOT (TRUE AND FALSE) AS `t`, - NOT (FALSE AND TRUE) AS `t`, - NOT (FALSE AND FALSE) AS `t` --- !query 1 schema -struct --- !query 1 output -true true false true false true true true true - - --- !query 2 -SELECT - (NULL OR NULL) IS NULL AS `t`, - (TRUE OR NULL) IS NULL AS `t`, - (FALSE OR NULL) IS NULL AS `t`, - (NULL OR TRUE) IS NULL AS `t`, - (NULL OR FALSE) IS NULL AS `t`, - (TRUE OR TRUE) AS `t`, - (TRUE OR FALSE) AS `t`, - (FALSE OR TRUE) AS `t`, - NOT (FALSE OR FALSE) AS `t` --- !query 2 schema -struct --- !query 2 output -true false true false true true true true true - - --- !query 3 -select min(unique1) from tenk1 --- 
!query 3 schema -struct --- !query 3 output -0 - - --- !query 4 -select max(unique1) from tenk1 --- !query 4 schema -struct --- !query 4 output -9999 - - --- !query 5 -select max(unique1) from tenk1 where unique1 < 42 --- !query 5 schema -struct --- !query 5 output -41 - - --- !query 6 -select max(unique1) from tenk1 where unique1 > 42 --- !query 6 schema -struct --- !query 6 output -9999 - - --- !query 7 -select max(unique1) from tenk1 where unique1 > 42000 --- !query 7 schema -struct --- !query 7 output -NULL - - --- !query 8 -select max(tenthous) from tenk1 where thousand = 33 --- !query 8 schema -struct --- !query 8 output -9033 - - --- !query 9 -select min(tenthous) from tenk1 where thousand = 33 --- !query 9 schema -struct --- !query 9 output -33 - - --- !query 10 -select distinct max(unique2) from tenk1 --- !query 10 schema -struct --- !query 10 output -9999 - - --- !query 11 -select max(unique2) from tenk1 order by 1 --- !query 11 schema -struct --- !query 11 output -9999 - - --- !query 12 -select max(unique2) from tenk1 order by max(unique2) --- !query 12 schema -struct --- !query 12 output -9999 - - --- !query 13 -select max(unique2) from tenk1 order by max(unique2)+1 --- !query 13 schema -struct --- !query 13 output -9999 - - --- !query 14 -select t1.max_unique2, g from (select max(unique2) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc --- !query 14 schema -struct --- !query 14 output -9999 3 -9999 2 -9999 1 - - --- !query 15 -select max(100) from tenk1 --- !query 15 schema -struct --- !query 15 output -100 diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part3.sql.out deleted file mode 100644 index f102383cb4d8f..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part3.sql.out +++ /dev/null @@ -1,22 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of 
queries: 2 - - --- !query 0 -select max(min(unique1)) from tenk1 --- !query 0 schema -struct<> --- !query 0 output -org.apache.spark.sql.AnalysisException -It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; - - --- !query 1 -select (select count(*) - from (values (1)) t0(inner_c)) -from (values (2),(3)) t1(outer_c) --- !query 1 schema -struct --- !query 1 output -1 -1 diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out deleted file mode 100644 index cb2be6d1cd22d..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out +++ /dev/null @@ -1,853 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 91 - - --- !query 0 -CREATE TABLE DATE_TBL (f1 date) USING parquet --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -INSERT INTO DATE_TBL VALUES ('1957-04-09') --- !query 1 schema -struct<> --- !query 1 output - - - --- !query 2 -INSERT INTO DATE_TBL VALUES ('1957-06-13') --- !query 2 schema -struct<> --- !query 2 output - - - --- !query 3 -INSERT INTO DATE_TBL VALUES ('1996-02-28') --- !query 3 schema -struct<> --- !query 3 output - - - --- !query 4 -INSERT INTO DATE_TBL VALUES ('1996-02-29') --- !query 4 schema -struct<> --- !query 4 output - - - --- !query 5 -INSERT INTO DATE_TBL VALUES ('1996-03-01') --- !query 5 schema -struct<> --- !query 5 output - - - --- !query 6 -INSERT INTO DATE_TBL VALUES ('1996-03-02') --- !query 6 schema -struct<> --- !query 6 output - - - --- !query 7 -INSERT INTO DATE_TBL VALUES ('1997-02-28') --- !query 7 schema -struct<> --- !query 7 output - - - --- !query 8 -INSERT INTO DATE_TBL VALUES ('1997-03-01') --- !query 8 schema -struct<> --- !query 8 output - - - --- !query 9 -INSERT INTO DATE_TBL VALUES ('1997-03-02') --- !query 9 schema -struct<> --- !query 9 output - - - --- 
!query 10 -INSERT INTO DATE_TBL VALUES ('2000-04-01') --- !query 10 schema -struct<> --- !query 10 output - - - --- !query 11 -INSERT INTO DATE_TBL VALUES ('2000-04-02') --- !query 11 schema -struct<> --- !query 11 output - - - --- !query 12 -INSERT INTO DATE_TBL VALUES ('2000-04-03') --- !query 12 schema -struct<> --- !query 12 output - - - --- !query 13 -INSERT INTO DATE_TBL VALUES ('2038-04-08') --- !query 13 schema -struct<> --- !query 13 output - - - --- !query 14 -INSERT INTO DATE_TBL VALUES ('2039-04-09') --- !query 14 schema -struct<> --- !query 14 output - - - --- !query 15 -INSERT INTO DATE_TBL VALUES ('2040-04-10') --- !query 15 schema -struct<> --- !query 15 output - - - --- !query 16 -SELECT f1 AS `Fifteen` FROM DATE_TBL --- !query 16 schema -struct --- !query 16 output -1957-04-09 -1957-06-13 -1996-02-28 -1996-02-29 -1996-03-01 -1996-03-02 -1997-02-28 -1997-03-01 -1997-03-02 -2000-04-01 -2000-04-02 -2000-04-03 -2038-04-08 -2039-04-09 -2040-04-10 - - --- !query 17 -SELECT f1 AS `Nine` FROM DATE_TBL WHERE f1 < '2000-01-01' --- !query 17 schema -struct --- !query 17 output -1957-04-09 -1957-06-13 -1996-02-28 -1996-02-29 -1996-03-01 -1996-03-02 -1997-02-28 -1997-03-01 -1997-03-02 - - --- !query 18 -SELECT f1 AS `Three` FROM DATE_TBL - WHERE f1 BETWEEN '2000-01-01' AND '2001-01-01' --- !query 18 schema -struct --- !query 18 output -2000-04-01 -2000-04-02 -2000-04-03 - - --- !query 19 -SELECT date '1999-01-08' --- !query 19 schema -struct --- !query 19 output -1999-01-08 - - --- !query 20 -SELECT date '1999-01-18' --- !query 20 schema -struct --- !query 20 output -1999-01-18 - - --- !query 21 -SELECT date '1999 Jan 08' --- !query 21 schema -struct<> --- !query 21 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) - -== SQL == -SELECT date '1999 Jan 08' --------^^^ - - --- !query 22 -SELECT date '1999 08 Jan' --- !query 22 schema -struct<> --- !query 22 output 
-org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) - -== SQL == -SELECT date '1999 08 Jan' --------^^^ - - --- !query 23 -SELECT date '1999-01-08' --- !query 23 schema -struct --- !query 23 output -1999-01-08 - - --- !query 24 -SELECT date '1999-08-01' --- !query 24 schema -struct --- !query 24 output -1999-08-01 - - --- !query 25 -SELECT date '1999 01 08' --- !query 25 schema -struct<> --- !query 25 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 01 08(line 1, pos 7) - -== SQL == -SELECT date '1999 01 08' --------^^^ - - --- !query 26 -SELECT date '1999 08 01' --- !query 26 schema -struct<> --- !query 26 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 01(line 1, pos 7) - -== SQL == -SELECT date '1999 08 01' --------^^^ - - --- !query 27 -SELECT date '1999-01-08' --- !query 27 schema -struct --- !query 27 output -1999-01-08 - - --- !query 28 -SELECT date '1999 Jan 08' --- !query 28 schema -struct<> --- !query 28 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) - -== SQL == -SELECT date '1999 Jan 08' --------^^^ - - --- !query 29 -SELECT date '1999 08 Jan' --- !query 29 schema -struct<> --- !query 29 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) - -== SQL == -SELECT date '1999 08 Jan' --------^^^ - - --- !query 30 -SELECT date '1999-01-08' --- !query 30 schema -struct --- !query 30 output -1999-01-08 - - --- !query 31 -SELECT date '1999-08-01' --- !query 31 schema -struct --- !query 31 output -1999-08-01 - - --- !query 32 -SELECT date '1999 01 08' --- !query 32 schema -struct<> --- !query 32 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 01 08(line 1, pos 7) - -== SQL == -SELECT date '1999 01 08' --------^^^ - - --- 
!query 33 -SELECT date '1999 08 01' --- !query 33 schema -struct<> --- !query 33 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 01(line 1, pos 7) - -== SQL == -SELECT date '1999 08 01' --------^^^ - - --- !query 34 -SELECT date '1999-01-08' --- !query 34 schema -struct --- !query 34 output -1999-01-08 - - --- !query 35 -SELECT date '1999-01-18' --- !query 35 schema -struct --- !query 35 output -1999-01-18 - - --- !query 36 -SELECT date '1999 Jan 08' --- !query 36 schema -struct<> --- !query 36 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) - -== SQL == -SELECT date '1999 Jan 08' --------^^^ - - --- !query 37 -SELECT date '1999 08 Jan' --- !query 37 schema -struct<> --- !query 37 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) - -== SQL == -SELECT date '1999 08 Jan' --------^^^ - - --- !query 38 -SELECT date '1999-01-08' --- !query 38 schema -struct --- !query 38 output -1999-01-08 - - --- !query 39 -SELECT date '1999-08-01' --- !query 39 schema -struct --- !query 39 output -1999-08-01 - - --- !query 40 -SELECT date '1999 01 08' --- !query 40 schema -struct<> --- !query 40 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 01 08(line 1, pos 7) - -== SQL == -SELECT date '1999 01 08' --------^^^ - - --- !query 41 -SELECT date '1999 08 01' --- !query 41 schema -struct<> --- !query 41 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 1999 08 01(line 1, pos 7) - -== SQL == -SELECT date '1999 08 01' --------^^^ - - --- !query 42 -SELECT date '4714-11-24 BC' --- !query 42 schema -struct --- !query 42 output -4714-11-24 - - --- !query 43 -SELECT date '4714-11-23 BC' --- !query 43 schema -struct --- !query 43 output -4714-11-23 - - --- !query 44 -SELECT date '5874897-12-31' --- !query 44 schema 
-struct<> --- !query 44 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 5874897-12-31(line 1, pos 7) - -== SQL == -SELECT date '5874897-12-31' --------^^^ - - --- !query 45 -SELECT date '5874898-01-01' --- !query 45 schema -struct<> --- !query 45 output -org.apache.spark.sql.catalyst.parser.ParseException - -Cannot parse the DATE value: 5874898-01-01(line 1, pos 7) - -== SQL == -SELECT date '5874898-01-01' --------^^^ - - --- !query 46 -SELECT f1 - date '2000-01-01' AS `Days From 2K` FROM DATE_TBL --- !query 46 schema -struct --- !query 46 output --1035 --1036 --1037 --1400 --1401 --1402 --1403 --15542 --15607 -13977 -14343 -14710 -91 -92 -93 - - --- !query 47 -SELECT EXTRACT(EPOCH FROM DATE '1970-01-01') --- !query 47 schema -struct --- !query 47 output -0 - - --- !query 48 -SELECT EXTRACT(EPOCH FROM TIMESTAMP '1970-01-01') --- !query 48 schema -struct --- !query 48 output -0 - - --- !query 49 -SELECT EXTRACT(CENTURY FROM TO_DATE('0101-12-31 BC', 'yyyy-MM-dd G')) --- !query 49 schema -struct --- !query 49 output --2 - - --- !query 50 -SELECT EXTRACT(CENTURY FROM TO_DATE('0100-12-31 BC', 'yyyy-MM-dd G')) --- !query 50 schema -struct --- !query 50 output --1 - - --- !query 51 -SELECT EXTRACT(CENTURY FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) --- !query 51 schema -struct --- !query 51 output --1 - - --- !query 52 -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01') --- !query 52 schema -struct --- !query 52 output -1 - - --- !query 53 -SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD') --- !query 53 schema -struct --- !query 53 output -1 - - --- !query 54 -SELECT EXTRACT(CENTURY FROM DATE '1900-12-31') --- !query 54 schema -struct --- !query 54 output -19 - - --- !query 55 -SELECT EXTRACT(CENTURY FROM DATE '1901-01-01') --- !query 55 schema -struct --- !query 55 output -20 - - --- !query 56 -SELECT EXTRACT(CENTURY FROM DATE '2000-12-31') --- !query 56 schema -struct --- !query 56 output -20 - - --- !query 57 -SELECT 
EXTRACT(CENTURY FROM DATE '2001-01-01') --- !query 57 schema -struct --- !query 57 output -21 - - --- !query 58 -SELECT EXTRACT(CENTURY FROM CURRENT_DATE)>=21 AS True --- !query 58 schema -struct --- !query 58 output -true - - --- !query 59 -SELECT EXTRACT(MILLENNIUM FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) --- !query 59 schema -struct --- !query 59 output --1 - - --- !query 60 -SELECT EXTRACT(MILLENNIUM FROM DATE '0001-01-01 AD') --- !query 60 schema -struct --- !query 60 output -1 - - --- !query 61 -SELECT EXTRACT(MILLENNIUM FROM DATE '1000-12-31') --- !query 61 schema -struct --- !query 61 output -1 - - --- !query 62 -SELECT EXTRACT(MILLENNIUM FROM DATE '1001-01-01') --- !query 62 schema -struct --- !query 62 output -2 - - --- !query 63 -SELECT EXTRACT(MILLENNIUM FROM DATE '2000-12-31') --- !query 63 schema -struct --- !query 63 output -2 - - --- !query 64 -SELECT EXTRACT(MILLENNIUM FROM DATE '2001-01-01') --- !query 64 schema -struct --- !query 64 output -3 - - --- !query 65 -SELECT EXTRACT(MILLENNIUM FROM CURRENT_DATE) --- !query 65 schema -struct --- !query 65 output -3 - - --- !query 66 -SELECT EXTRACT(DECADE FROM DATE '1994-12-25') --- !query 66 schema -struct --- !query 66 output -199 - - --- !query 67 -SELECT EXTRACT(DECADE FROM DATE '0010-01-01') --- !query 67 schema -struct --- !query 67 output -1 - - --- !query 68 -SELECT EXTRACT(DECADE FROM DATE '0009-12-31') --- !query 68 schema -struct --- !query 68 output -0 - - --- !query 69 -SELECT EXTRACT(DECADE FROM TO_DATE('0001-01-01 BC', 'yyyy-MM-dd G')) --- !query 69 schema -struct --- !query 69 output -0 - - --- !query 70 -SELECT EXTRACT(DECADE FROM TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) --- !query 70 schema -struct --- !query 70 output --1 - - --- !query 71 -SELECT EXTRACT(DECADE FROM TO_DATE('0011-01-01 BC', 'yyyy-MM-dd G')) --- !query 71 schema -struct --- !query 71 output --1 - - --- !query 72 -SELECT EXTRACT(DECADE FROM TO_DATE('0012-12-31 BC', 'yyyy-MM-dd G')) --- !query 72 schema -struct 
--- !query 72 output --2 - - --- !query 73 -SELECT EXTRACT(CENTURY FROM NOW())>=21 AS True --- !query 73 schema -struct --- !query 73 output -true - - --- !query 74 -SELECT EXTRACT(CENTURY FROM TIMESTAMP '1970-03-20 04:30:00.00000') --- !query 74 schema -struct --- !query 74 output -20 - - --- !query 75 -SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000') --- !query 75 schema -struct --- !query 75 output -1001-01-01 00:07:02 - - --- !query 76 -SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20') --- !query 76 schema -struct --- !query 76 output -1001-01-01 00:07:02 - - --- !query 77 -SELECT DATE_TRUNC('CENTURY', TIMESTAMP '1970-03-20 04:30:00.00000') --- !query 77 schema -struct --- !query 77 output -1901-01-01 00:00:00 - - --- !query 78 -SELECT DATE_TRUNC('CENTURY', DATE '1970-03-20') --- !query 78 schema -struct --- !query 78 output -1901-01-01 00:00:00 - - --- !query 79 -SELECT DATE_TRUNC('CENTURY', DATE '2004-08-10') --- !query 79 schema -struct --- !query 79 output -2001-01-01 00:00:00 - - --- !query 80 -SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04') --- !query 80 schema -struct --- !query 80 output -0001-01-01 00:07:02 - - --- !query 81 -SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')) --- !query 81 schema -struct --- !query 81 output --0099-01-01 00:07:02 - - --- !query 82 -SELECT DATE_TRUNC('DECADE', DATE '1993-12-25') --- !query 82 schema -struct --- !query 82 output -1990-01-01 00:00:00 - - --- !query 83 -SELECT DATE_TRUNC('DECADE', DATE '0004-12-25') --- !query 83 schema -struct --- !query 83 output -0000-01-01 00:07:02 - - --- !query 84 -SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) --- !query 84 schema -struct --- !query 84 output --0010-01-01 00:07:02 - - --- !query 85 -select make_date(2013, 7, 15) --- !query 85 schema -struct --- !query 85 output -2013-07-15 - - --- !query 86 -select make_date(-44, 3, 15) --- !query 86 schema -struct --- !query 86 output --0044-03-15 - - --- !query 87 
-select make_date(2013, 2, 30) --- !query 87 schema -struct --- !query 87 output -NULL - - --- !query 88 -select make_date(2013, 13, 1) --- !query 88 schema -struct --- !query 88 output -NULL - - --- !query 89 -select make_date(2013, 11, -1) --- !query 89 schema -struct --- !query 89 output -NULL - - --- !query 90 -DROP TABLE DATE_TBL --- !query 90 schema -struct<> --- !query 90 output - diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/numeric.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/numeric.sql.out deleted file mode 100644 index ed649feaaebb2..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/numeric.sql.out +++ /dev/null @@ -1,4864 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 577 - - --- !query 0 -CREATE TABLE num_data (id int, val decimal(38,10)) USING parquet --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -CREATE TABLE num_exp_add (id1 int, id2 int, expected decimal(38,10)) USING parquet --- !query 1 schema -struct<> --- !query 1 output - - - --- !query 2 -CREATE TABLE num_exp_sub (id1 int, id2 int, expected decimal(38,10)) USING parquet --- !query 2 schema -struct<> --- !query 2 output - - - --- !query 3 -CREATE TABLE num_exp_div (id1 int, id2 int, expected decimal(38,10)) USING parquet --- !query 3 schema -struct<> --- !query 3 output - - - --- !query 4 -CREATE TABLE num_exp_mul (id1 int, id2 int, expected decimal(38,10)) USING parquet --- !query 4 schema -struct<> --- !query 4 output - - - --- !query 5 -CREATE TABLE num_exp_sqrt (id int, expected decimal(38,10)) USING parquet --- !query 5 schema -struct<> --- !query 5 output - - - --- !query 6 -CREATE TABLE num_exp_ln (id int, expected decimal(38,10)) USING parquet --- !query 6 schema -struct<> --- !query 6 output - - - --- !query 7 -CREATE TABLE num_exp_log10 (id int, expected decimal(38,10)) USING parquet --- !query 7 schema -struct<> --- !query 7 output - - - --- !query 8 -CREATE TABLE 
num_exp_power_10_ln (id int, expected decimal(38,10)) USING parquet --- !query 8 schema -struct<> --- !query 8 output - - - --- !query 9 -CREATE TABLE num_result (id1 int, id2 int, result decimal(38,10)) USING parquet --- !query 9 schema -struct<> --- !query 9 output - - - --- !query 10 -INSERT INTO num_exp_add VALUES (0,0,'0') --- !query 10 schema -struct<> --- !query 10 output - - - --- !query 11 -INSERT INTO num_exp_sub VALUES (0,0,'0') --- !query 11 schema -struct<> --- !query 11 output - - - --- !query 12 -INSERT INTO num_exp_mul VALUES (0,0,'0') --- !query 12 schema -struct<> --- !query 12 output - - - --- !query 13 -INSERT INTO num_exp_div VALUES (0,0,'NaN') --- !query 13 schema -struct<> --- !query 13 output - - - --- !query 14 -INSERT INTO num_exp_add VALUES (0,1,'0') --- !query 14 schema -struct<> --- !query 14 output - - - --- !query 15 -INSERT INTO num_exp_sub VALUES (0,1,'0') --- !query 15 schema -struct<> --- !query 15 output - - - --- !query 16 -INSERT INTO num_exp_mul VALUES (0,1,'0') --- !query 16 schema -struct<> --- !query 16 output - - - --- !query 17 -INSERT INTO num_exp_div VALUES (0,1,'NaN') --- !query 17 schema -struct<> --- !query 17 output - - - --- !query 18 -INSERT INTO num_exp_add VALUES (0,2,'-34338492.215397047') --- !query 18 schema -struct<> --- !query 18 output - - - --- !query 19 -INSERT INTO num_exp_sub VALUES (0,2,'34338492.215397047') --- !query 19 schema -struct<> --- !query 19 output - - - --- !query 20 -INSERT INTO num_exp_mul VALUES (0,2,'0') --- !query 20 schema -struct<> --- !query 20 output - - - --- !query 21 -INSERT INTO num_exp_div VALUES (0,2,'0') --- !query 21 schema -struct<> --- !query 21 output - - - --- !query 22 -INSERT INTO num_exp_add VALUES (0,3,'4.31') --- !query 22 schema -struct<> --- !query 22 output - - - --- !query 23 -INSERT INTO num_exp_sub VALUES (0,3,'-4.31') --- !query 23 schema -struct<> --- !query 23 output - - - --- !query 24 -INSERT INTO num_exp_mul VALUES (0,3,'0') --- !query 24 schema 
-struct<> --- !query 24 output - - - --- !query 25 -INSERT INTO num_exp_div VALUES (0,3,'0') --- !query 25 schema -struct<> --- !query 25 output - - - --- !query 26 -INSERT INTO num_exp_add VALUES (0,4,'7799461.4119') --- !query 26 schema -struct<> --- !query 26 output - - - --- !query 27 -INSERT INTO num_exp_sub VALUES (0,4,'-7799461.4119') --- !query 27 schema -struct<> --- !query 27 output - - - --- !query 28 -INSERT INTO num_exp_mul VALUES (0,4,'0') --- !query 28 schema -struct<> --- !query 28 output - - - --- !query 29 -INSERT INTO num_exp_div VALUES (0,4,'0') --- !query 29 schema -struct<> --- !query 29 output - - - --- !query 30 -INSERT INTO num_exp_add VALUES (0,5,'16397.038491') --- !query 30 schema -struct<> --- !query 30 output - - - --- !query 31 -INSERT INTO num_exp_sub VALUES (0,5,'-16397.038491') --- !query 31 schema -struct<> --- !query 31 output - - - --- !query 32 -INSERT INTO num_exp_mul VALUES (0,5,'0') --- !query 32 schema -struct<> --- !query 32 output - - - --- !query 33 -INSERT INTO num_exp_div VALUES (0,5,'0') --- !query 33 schema -struct<> --- !query 33 output - - - --- !query 34 -INSERT INTO num_exp_add VALUES (0,6,'93901.57763026') --- !query 34 schema -struct<> --- !query 34 output - - - --- !query 35 -INSERT INTO num_exp_sub VALUES (0,6,'-93901.57763026') --- !query 35 schema -struct<> --- !query 35 output - - - --- !query 36 -INSERT INTO num_exp_mul VALUES (0,6,'0') --- !query 36 schema -struct<> --- !query 36 output - - - --- !query 37 -INSERT INTO num_exp_div VALUES (0,6,'0') --- !query 37 schema -struct<> --- !query 37 output - - - --- !query 38 -INSERT INTO num_exp_add VALUES (0,7,'-83028485') --- !query 38 schema -struct<> --- !query 38 output - - - --- !query 39 -INSERT INTO num_exp_sub VALUES (0,7,'83028485') --- !query 39 schema -struct<> --- !query 39 output - - - --- !query 40 -INSERT INTO num_exp_mul VALUES (0,7,'0') --- !query 40 schema -struct<> --- !query 40 output - - - --- !query 41 -INSERT INTO num_exp_div VALUES 
(0,7,'0') --- !query 41 schema -struct<> --- !query 41 output - - - --- !query 42 -INSERT INTO num_exp_add VALUES (0,8,'74881') --- !query 42 schema -struct<> --- !query 42 output - - - --- !query 43 -INSERT INTO num_exp_sub VALUES (0,8,'-74881') --- !query 43 schema -struct<> --- !query 43 output - - - --- !query 44 -INSERT INTO num_exp_mul VALUES (0,8,'0') --- !query 44 schema -struct<> --- !query 44 output - - - --- !query 45 -INSERT INTO num_exp_div VALUES (0,8,'0') --- !query 45 schema -struct<> --- !query 45 output - - - --- !query 46 -INSERT INTO num_exp_add VALUES (0,9,'-24926804.045047420') --- !query 46 schema -struct<> --- !query 46 output - - - --- !query 47 -INSERT INTO num_exp_sub VALUES (0,9,'24926804.045047420') --- !query 47 schema -struct<> --- !query 47 output - - - --- !query 48 -INSERT INTO num_exp_mul VALUES (0,9,'0') --- !query 48 schema -struct<> --- !query 48 output - - - --- !query 49 -INSERT INTO num_exp_div VALUES (0,9,'0') --- !query 49 schema -struct<> --- !query 49 output - - - --- !query 50 -INSERT INTO num_exp_add VALUES (1,0,'0') --- !query 50 schema -struct<> --- !query 50 output - - - --- !query 51 -INSERT INTO num_exp_sub VALUES (1,0,'0') --- !query 51 schema -struct<> --- !query 51 output - - - --- !query 52 -INSERT INTO num_exp_mul VALUES (1,0,'0') --- !query 52 schema -struct<> --- !query 52 output - - - --- !query 53 -INSERT INTO num_exp_div VALUES (1,0,'NaN') --- !query 53 schema -struct<> --- !query 53 output - - - --- !query 54 -INSERT INTO num_exp_add VALUES (1,1,'0') --- !query 54 schema -struct<> --- !query 54 output - - - --- !query 55 -INSERT INTO num_exp_sub VALUES (1,1,'0') --- !query 55 schema -struct<> --- !query 55 output - - - --- !query 56 -INSERT INTO num_exp_mul VALUES (1,1,'0') --- !query 56 schema -struct<> --- !query 56 output - - - --- !query 57 -INSERT INTO num_exp_div VALUES (1,1,'NaN') --- !query 57 schema -struct<> --- !query 57 output - - - --- !query 58 -INSERT INTO num_exp_add VALUES 
(1,2,'-34338492.215397047') --- !query 58 schema -struct<> --- !query 58 output - - - --- !query 59 -INSERT INTO num_exp_sub VALUES (1,2,'34338492.215397047') --- !query 59 schema -struct<> --- !query 59 output - - - --- !query 60 -INSERT INTO num_exp_mul VALUES (1,2,'0') --- !query 60 schema -struct<> --- !query 60 output - - - --- !query 61 -INSERT INTO num_exp_div VALUES (1,2,'0') --- !query 61 schema -struct<> --- !query 61 output - - - --- !query 62 -INSERT INTO num_exp_add VALUES (1,3,'4.31') --- !query 62 schema -struct<> --- !query 62 output - - - --- !query 63 -INSERT INTO num_exp_sub VALUES (1,3,'-4.31') --- !query 63 schema -struct<> --- !query 63 output - - - --- !query 64 -INSERT INTO num_exp_mul VALUES (1,3,'0') --- !query 64 schema -struct<> --- !query 64 output - - - --- !query 65 -INSERT INTO num_exp_div VALUES (1,3,'0') --- !query 65 schema -struct<> --- !query 65 output - - - --- !query 66 -INSERT INTO num_exp_add VALUES (1,4,'7799461.4119') --- !query 66 schema -struct<> --- !query 66 output - - - --- !query 67 -INSERT INTO num_exp_sub VALUES (1,4,'-7799461.4119') --- !query 67 schema -struct<> --- !query 67 output - - - --- !query 68 -INSERT INTO num_exp_mul VALUES (1,4,'0') --- !query 68 schema -struct<> --- !query 68 output - - - --- !query 69 -INSERT INTO num_exp_div VALUES (1,4,'0') --- !query 69 schema -struct<> --- !query 69 output - - - --- !query 70 -INSERT INTO num_exp_add VALUES (1,5,'16397.038491') --- !query 70 schema -struct<> --- !query 70 output - - - --- !query 71 -INSERT INTO num_exp_sub VALUES (1,5,'-16397.038491') --- !query 71 schema -struct<> --- !query 71 output - - - --- !query 72 -INSERT INTO num_exp_mul VALUES (1,5,'0') --- !query 72 schema -struct<> --- !query 72 output - - - --- !query 73 -INSERT INTO num_exp_div VALUES (1,5,'0') --- !query 73 schema -struct<> --- !query 73 output - - - --- !query 74 -INSERT INTO num_exp_add VALUES (1,6,'93901.57763026') --- !query 74 schema -struct<> --- !query 74 output - - - --- 
!query 75 -INSERT INTO num_exp_sub VALUES (1,6,'-93901.57763026') --- !query 75 schema -struct<> --- !query 75 output - - - --- !query 76 -INSERT INTO num_exp_mul VALUES (1,6,'0') --- !query 76 schema -struct<> --- !query 76 output - - - --- !query 77 -INSERT INTO num_exp_div VALUES (1,6,'0') --- !query 77 schema -struct<> --- !query 77 output - - - --- !query 78 -INSERT INTO num_exp_add VALUES (1,7,'-83028485') --- !query 78 schema -struct<> --- !query 78 output - - - --- !query 79 -INSERT INTO num_exp_sub VALUES (1,7,'83028485') --- !query 79 schema -struct<> --- !query 79 output - - - --- !query 80 -INSERT INTO num_exp_mul VALUES (1,7,'0') --- !query 80 schema -struct<> --- !query 80 output - - - --- !query 81 -INSERT INTO num_exp_div VALUES (1,7,'0') --- !query 81 schema -struct<> --- !query 81 output - - - --- !query 82 -INSERT INTO num_exp_add VALUES (1,8,'74881') --- !query 82 schema -struct<> --- !query 82 output - - - --- !query 83 -INSERT INTO num_exp_sub VALUES (1,8,'-74881') --- !query 83 schema -struct<> --- !query 83 output - - - --- !query 84 -INSERT INTO num_exp_mul VALUES (1,8,'0') --- !query 84 schema -struct<> --- !query 84 output - - - --- !query 85 -INSERT INTO num_exp_div VALUES (1,8,'0') --- !query 85 schema -struct<> --- !query 85 output - - - --- !query 86 -INSERT INTO num_exp_add VALUES (1,9,'-24926804.045047420') --- !query 86 schema -struct<> --- !query 86 output - - - --- !query 87 -INSERT INTO num_exp_sub VALUES (1,9,'24926804.045047420') --- !query 87 schema -struct<> --- !query 87 output - - - --- !query 88 -INSERT INTO num_exp_mul VALUES (1,9,'0') --- !query 88 schema -struct<> --- !query 88 output - - - --- !query 89 -INSERT INTO num_exp_div VALUES (1,9,'0') --- !query 89 schema -struct<> --- !query 89 output - - - --- !query 90 -INSERT INTO num_exp_add VALUES (2,0,'-34338492.215397047') --- !query 90 schema -struct<> --- !query 90 output - - - --- !query 91 -INSERT INTO num_exp_sub VALUES (2,0,'-34338492.215397047') --- !query 91 
schema -struct<> --- !query 91 output - - - --- !query 92 -INSERT INTO num_exp_mul VALUES (2,0,'0') --- !query 92 schema -struct<> --- !query 92 output - - - --- !query 93 -INSERT INTO num_exp_div VALUES (2,0,'NaN') --- !query 93 schema -struct<> --- !query 93 output - - - --- !query 94 -INSERT INTO num_exp_add VALUES (2,1,'-34338492.215397047') --- !query 94 schema -struct<> --- !query 94 output - - - --- !query 95 -INSERT INTO num_exp_sub VALUES (2,1,'-34338492.215397047') --- !query 95 schema -struct<> --- !query 95 output - - - --- !query 96 -INSERT INTO num_exp_mul VALUES (2,1,'0') --- !query 96 schema -struct<> --- !query 96 output - - - --- !query 97 -INSERT INTO num_exp_div VALUES (2,1,'NaN') --- !query 97 schema -struct<> --- !query 97 output - - - --- !query 98 -INSERT INTO num_exp_add VALUES (2,2,'-68676984.430794094') --- !query 98 schema -struct<> --- !query 98 output - - - --- !query 99 -INSERT INTO num_exp_sub VALUES (2,2,'0') --- !query 99 schema -struct<> --- !query 99 output - - - --- !query 100 -INSERT INTO num_exp_mul VALUES (2,2,'1179132047626883.596862135856320209') --- !query 100 schema -struct<> --- !query 100 output - - - --- !query 101 -INSERT INTO num_exp_div VALUES (2,2,'1.00000000000000000000') --- !query 101 schema -struct<> --- !query 101 output - - - --- !query 102 -INSERT INTO num_exp_add VALUES (2,3,'-34338487.905397047') --- !query 102 schema -struct<> --- !query 102 output - - - --- !query 103 -INSERT INTO num_exp_sub VALUES (2,3,'-34338496.525397047') --- !query 103 schema -struct<> --- !query 103 output - - - --- !query 104 -INSERT INTO num_exp_mul VALUES (2,3,'-147998901.44836127257') --- !query 104 schema -struct<> --- !query 104 output - - - --- !query 105 -INSERT INTO num_exp_div VALUES (2,3,'-7967167.56737750510440835266') --- !query 105 schema -struct<> --- !query 105 output - - - --- !query 106 -INSERT INTO num_exp_add VALUES (2,4,'-26539030.803497047') --- !query 106 schema -struct<> --- !query 106 output - - - --- 
!query 107 -INSERT INTO num_exp_sub VALUES (2,4,'-42137953.627297047') --- !query 107 schema -struct<> --- !query 107 output - - - --- !query 108 -INSERT INTO num_exp_mul VALUES (2,4,'-267821744976817.8111137106593') --- !query 108 schema -struct<> --- !query 108 output - - - --- !query 109 -INSERT INTO num_exp_div VALUES (2,4,'-4.40267480046830116685') --- !query 109 schema -struct<> --- !query 109 output - - - --- !query 110 -INSERT INTO num_exp_add VALUES (2,5,'-34322095.176906047') --- !query 110 schema -struct<> --- !query 110 output - - - --- !query 111 -INSERT INTO num_exp_sub VALUES (2,5,'-34354889.253888047') --- !query 111 schema -struct<> --- !query 111 output - - - --- !query 112 -INSERT INTO num_exp_mul VALUES (2,5,'-563049578578.769242506736077') --- !query 112 schema -struct<> --- !query 112 output - - - --- !query 113 -INSERT INTO num_exp_div VALUES (2,5,'-2094.18866914563535496429') --- !query 113 schema -struct<> --- !query 113 output - - - --- !query 114 -INSERT INTO num_exp_add VALUES (2,6,'-34244590.637766787') --- !query 114 schema -struct<> --- !query 114 output - - - --- !query 115 -INSERT INTO num_exp_sub VALUES (2,6,'-34432393.793027307') --- !query 115 schema -struct<> --- !query 115 output - - - --- !query 116 -INSERT INTO num_exp_mul VALUES (2,6,'-3224438592470.18449811926184222') --- !query 116 schema -struct<> --- !query 116 output - - - --- !query 117 -INSERT INTO num_exp_div VALUES (2,6,'-365.68599891479766440940') --- !query 117 schema -struct<> --- !query 117 output - - - --- !query 118 -INSERT INTO num_exp_add VALUES (2,7,'-117366977.215397047') --- !query 118 schema -struct<> --- !query 118 output - - - --- !query 119 -INSERT INTO num_exp_sub VALUES (2,7,'48689992.784602953') --- !query 119 schema -struct<> --- !query 119 output - - - --- !query 120 -INSERT INTO num_exp_mul VALUES (2,7,'2851072985828710.485883795') --- !query 120 schema -struct<> --- !query 120 output - - - --- !query 121 -INSERT INTO num_exp_div VALUES 
(2,7,'.41357483778485235518') --- !query 121 schema -struct<> --- !query 121 output - - - --- !query 122 -INSERT INTO num_exp_add VALUES (2,8,'-34263611.215397047') --- !query 122 schema -struct<> --- !query 122 output - - - --- !query 123 -INSERT INTO num_exp_sub VALUES (2,8,'-34413373.215397047') --- !query 123 schema -struct<> --- !query 123 output - - - --- !query 124 -INSERT INTO num_exp_mul VALUES (2,8,'-2571300635581.146276407') --- !query 124 schema -struct<> --- !query 124 output - - - --- !query 125 -INSERT INTO num_exp_div VALUES (2,8,'-458.57416721727870888476') --- !query 125 schema -struct<> --- !query 125 output - - - --- !query 126 -INSERT INTO num_exp_add VALUES (2,9,'-59265296.260444467') --- !query 126 schema -struct<> --- !query 126 output - - - --- !query 127 -INSERT INTO num_exp_sub VALUES (2,9,'-9411688.170349627') --- !query 127 schema -struct<> --- !query 127 output - - - --- !query 128 -INSERT INTO num_exp_mul VALUES (2,9,'855948866655588.453741509242968740') --- !query 128 schema -struct<> --- !query 128 output - - - --- !query 129 -INSERT INTO num_exp_div VALUES (2,9,'1.37757299946438931811') --- !query 129 schema -struct<> --- !query 129 output - - - --- !query 130 -INSERT INTO num_exp_add VALUES (3,0,'4.31') --- !query 130 schema -struct<> --- !query 130 output - - - --- !query 131 -INSERT INTO num_exp_sub VALUES (3,0,'4.31') --- !query 131 schema -struct<> --- !query 131 output - - - --- !query 132 -INSERT INTO num_exp_mul VALUES (3,0,'0') --- !query 132 schema -struct<> --- !query 132 output - - - --- !query 133 -INSERT INTO num_exp_div VALUES (3,0,'NaN') --- !query 133 schema -struct<> --- !query 133 output - - - --- !query 134 -INSERT INTO num_exp_add VALUES (3,1,'4.31') --- !query 134 schema -struct<> --- !query 134 output - - - --- !query 135 -INSERT INTO num_exp_sub VALUES (3,1,'4.31') --- !query 135 schema -struct<> --- !query 135 output - - - --- !query 136 -INSERT INTO num_exp_mul VALUES (3,1,'0') --- !query 136 schema 
-struct<> --- !query 136 output - - - --- !query 137 -INSERT INTO num_exp_div VALUES (3,1,'NaN') --- !query 137 schema -struct<> --- !query 137 output - - - --- !query 138 -INSERT INTO num_exp_add VALUES (3,2,'-34338487.905397047') --- !query 138 schema -struct<> --- !query 138 output - - - --- !query 139 -INSERT INTO num_exp_sub VALUES (3,2,'34338496.525397047') --- !query 139 schema -struct<> --- !query 139 output - - - --- !query 140 -INSERT INTO num_exp_mul VALUES (3,2,'-147998901.44836127257') --- !query 140 schema -struct<> --- !query 140 output - - - --- !query 141 -INSERT INTO num_exp_div VALUES (3,2,'-.00000012551512084352') --- !query 141 schema -struct<> --- !query 141 output - - - --- !query 142 -INSERT INTO num_exp_add VALUES (3,3,'8.62') --- !query 142 schema -struct<> --- !query 142 output - - - --- !query 143 -INSERT INTO num_exp_sub VALUES (3,3,'0') --- !query 143 schema -struct<> --- !query 143 output - - - --- !query 144 -INSERT INTO num_exp_mul VALUES (3,3,'18.5761') --- !query 144 schema -struct<> --- !query 144 output - - - --- !query 145 -INSERT INTO num_exp_div VALUES (3,3,'1.00000000000000000000') --- !query 145 schema -struct<> --- !query 145 output - - - --- !query 146 -INSERT INTO num_exp_add VALUES (3,4,'7799465.7219') --- !query 146 schema -struct<> --- !query 146 output - - - --- !query 147 -INSERT INTO num_exp_sub VALUES (3,4,'-7799457.1019') --- !query 147 schema -struct<> --- !query 147 output - - - --- !query 148 -INSERT INTO num_exp_mul VALUES (3,4,'33615678.685289') --- !query 148 schema -struct<> --- !query 148 output - - - --- !query 149 -INSERT INTO num_exp_div VALUES (3,4,'.00000055260225961552') --- !query 149 schema -struct<> --- !query 149 output - - - --- !query 150 -INSERT INTO num_exp_add VALUES (3,5,'16401.348491') --- !query 150 schema -struct<> --- !query 150 output - - - --- !query 151 -INSERT INTO num_exp_sub VALUES (3,5,'-16392.728491') --- !query 151 schema -struct<> --- !query 151 output - - - --- !query 152 
-INSERT INTO num_exp_mul VALUES (3,5,'70671.23589621') --- !query 152 schema -struct<> --- !query 152 output - - - --- !query 153 -INSERT INTO num_exp_div VALUES (3,5,'.00026285234387695504') --- !query 153 schema -struct<> --- !query 153 output - - - --- !query 154 -INSERT INTO num_exp_add VALUES (3,6,'93905.88763026') --- !query 154 schema -struct<> --- !query 154 output - - - --- !query 155 -INSERT INTO num_exp_sub VALUES (3,6,'-93897.26763026') --- !query 155 schema -struct<> --- !query 155 output - - - --- !query 156 -INSERT INTO num_exp_mul VALUES (3,6,'404715.7995864206') --- !query 156 schema -struct<> --- !query 156 output - - - --- !query 157 -INSERT INTO num_exp_div VALUES (3,6,'.00004589912234457595') --- !query 157 schema -struct<> --- !query 157 output - - - --- !query 158 -INSERT INTO num_exp_add VALUES (3,7,'-83028480.69') --- !query 158 schema -struct<> --- !query 158 output - - - --- !query 159 -INSERT INTO num_exp_sub VALUES (3,7,'83028489.31') --- !query 159 schema -struct<> --- !query 159 output - - - --- !query 160 -INSERT INTO num_exp_mul VALUES (3,7,'-357852770.35') --- !query 160 schema -struct<> --- !query 160 output - - - --- !query 161 -INSERT INTO num_exp_div VALUES (3,7,'-.00000005190989574240') --- !query 161 schema -struct<> --- !query 161 output - - - --- !query 162 -INSERT INTO num_exp_add VALUES (3,8,'74885.31') --- !query 162 schema -struct<> --- !query 162 output - - - --- !query 163 -INSERT INTO num_exp_sub VALUES (3,8,'-74876.69') --- !query 163 schema -struct<> --- !query 163 output - - - --- !query 164 -INSERT INTO num_exp_mul VALUES (3,8,'322737.11') --- !query 164 schema -struct<> --- !query 164 output - - - --- !query 165 -INSERT INTO num_exp_div VALUES (3,8,'.00005755799201399553') --- !query 165 schema -struct<> --- !query 165 output - - - --- !query 166 -INSERT INTO num_exp_add VALUES (3,9,'-24926799.735047420') --- !query 166 schema -struct<> --- !query 166 output - - - --- !query 167 -INSERT INTO num_exp_sub VALUES 
(3,9,'24926808.355047420') --- !query 167 schema -struct<> --- !query 167 output - - - --- !query 168 -INSERT INTO num_exp_mul VALUES (3,9,'-107434525.43415438020') --- !query 168 schema -struct<> --- !query 168 output - - - --- !query 169 -INSERT INTO num_exp_div VALUES (3,9,'-.00000017290624149854') --- !query 169 schema -struct<> --- !query 169 output - - - --- !query 170 -INSERT INTO num_exp_add VALUES (4,0,'7799461.4119') --- !query 170 schema -struct<> --- !query 170 output - - - --- !query 171 -INSERT INTO num_exp_sub VALUES (4,0,'7799461.4119') --- !query 171 schema -struct<> --- !query 171 output - - - --- !query 172 -INSERT INTO num_exp_mul VALUES (4,0,'0') --- !query 172 schema -struct<> --- !query 172 output - - - --- !query 173 -INSERT INTO num_exp_div VALUES (4,0,'NaN') --- !query 173 schema -struct<> --- !query 173 output - - - --- !query 174 -INSERT INTO num_exp_add VALUES (4,1,'7799461.4119') --- !query 174 schema -struct<> --- !query 174 output - - - --- !query 175 -INSERT INTO num_exp_sub VALUES (4,1,'7799461.4119') --- !query 175 schema -struct<> --- !query 175 output - - - --- !query 176 -INSERT INTO num_exp_mul VALUES (4,1,'0') --- !query 176 schema -struct<> --- !query 176 output - - - --- !query 177 -INSERT INTO num_exp_div VALUES (4,1,'NaN') --- !query 177 schema -struct<> --- !query 177 output - - - --- !query 178 -INSERT INTO num_exp_add VALUES (4,2,'-26539030.803497047') --- !query 178 schema -struct<> --- !query 178 output - - - --- !query 179 -INSERT INTO num_exp_sub VALUES (4,2,'42137953.627297047') --- !query 179 schema -struct<> --- !query 179 output - - - --- !query 180 -INSERT INTO num_exp_mul VALUES (4,2,'-267821744976817.8111137106593') --- !query 180 schema -struct<> --- !query 180 output - - - --- !query 181 -INSERT INTO num_exp_div VALUES (4,2,'-.22713465002993920385') --- !query 181 schema -struct<> --- !query 181 output - - - --- !query 182 -INSERT INTO num_exp_add VALUES (4,3,'7799465.7219') --- !query 182 schema -struct<> 
--- !query 182 output - - - --- !query 183 -INSERT INTO num_exp_sub VALUES (4,3,'7799457.1019') --- !query 183 schema -struct<> --- !query 183 output - - - --- !query 184 -INSERT INTO num_exp_mul VALUES (4,3,'33615678.685289') --- !query 184 schema -struct<> --- !query 184 output - - - --- !query 185 -INSERT INTO num_exp_div VALUES (4,3,'1809619.81714617169373549883') --- !query 185 schema -struct<> --- !query 185 output - - - --- !query 186 -INSERT INTO num_exp_add VALUES (4,4,'15598922.8238') --- !query 186 schema -struct<> --- !query 186 output - - - --- !query 187 -INSERT INTO num_exp_sub VALUES (4,4,'0') --- !query 187 schema -struct<> --- !query 187 output - - - --- !query 188 -INSERT INTO num_exp_mul VALUES (4,4,'60831598315717.14146161') --- !query 188 schema -struct<> --- !query 188 output - - - --- !query 189 -INSERT INTO num_exp_div VALUES (4,4,'1.00000000000000000000') --- !query 189 schema -struct<> --- !query 189 output - - - --- !query 190 -INSERT INTO num_exp_add VALUES (4,5,'7815858.450391') --- !query 190 schema -struct<> --- !query 190 output - - - --- !query 191 -INSERT INTO num_exp_sub VALUES (4,5,'7783064.373409') --- !query 191 schema -struct<> --- !query 191 output - - - --- !query 192 -INSERT INTO num_exp_mul VALUES (4,5,'127888068979.9935054429') --- !query 192 schema -struct<> --- !query 192 output - - - --- !query 193 -INSERT INTO num_exp_div VALUES (4,5,'475.66281046305802686061') --- !query 193 schema -struct<> --- !query 193 output - - - --- !query 194 -INSERT INTO num_exp_add VALUES (4,6,'7893362.98953026') --- !query 194 schema -struct<> --- !query 194 output - - - --- !query 195 -INSERT INTO num_exp_sub VALUES (4,6,'7705559.83426974') --- !query 195 schema -struct<> --- !query 195 output - - - --- !query 196 -INSERT INTO num_exp_mul VALUES (4,6,'732381731243.745115764094') --- !query 196 schema -struct<> --- !query 196 output - - - --- !query 197 -INSERT INTO num_exp_div VALUES (4,6,'83.05996138436129499606') --- !query 197 schema 
-struct<> --- !query 197 output - - - --- !query 198 -INSERT INTO num_exp_add VALUES (4,7,'-75229023.5881') --- !query 198 schema -struct<> --- !query 198 output - - - --- !query 199 -INSERT INTO num_exp_sub VALUES (4,7,'90827946.4119') --- !query 199 schema -struct<> --- !query 199 output - - - --- !query 200 -INSERT INTO num_exp_mul VALUES (4,7,'-647577464846017.9715') --- !query 200 schema -struct<> --- !query 200 output - - - --- !query 201 -INSERT INTO num_exp_div VALUES (4,7,'-.09393717604145131637') --- !query 201 schema -struct<> --- !query 201 output - - - --- !query 202 -INSERT INTO num_exp_add VALUES (4,8,'7874342.4119') --- !query 202 schema -struct<> --- !query 202 output - - - --- !query 203 -INSERT INTO num_exp_sub VALUES (4,8,'7724580.4119') --- !query 203 schema -struct<> --- !query 203 output - - - --- !query 204 -INSERT INTO num_exp_mul VALUES (4,8,'584031469984.4839') --- !query 204 schema -struct<> --- !query 204 output - - - --- !query 205 -INSERT INTO num_exp_div VALUES (4,8,'104.15808298366741897143') --- !query 205 schema -struct<> --- !query 205 output - - - --- !query 206 -INSERT INTO num_exp_add VALUES (4,9,'-17127342.633147420') --- !query 206 schema -struct<> --- !query 206 output - - - --- !query 207 -INSERT INTO num_exp_sub VALUES (4,9,'32726265.456947420') --- !query 207 schema -struct<> --- !query 207 output - - - --- !query 208 -INSERT INTO num_exp_mul VALUES (4,9,'-194415646271340.1815956522980') --- !query 208 schema -struct<> --- !query 208 output - - - --- !query 209 -INSERT INTO num_exp_div VALUES (4,9,'-.31289456112403769409') --- !query 209 schema -struct<> --- !query 209 output - - - --- !query 210 -INSERT INTO num_exp_add VALUES (5,0,'16397.038491') --- !query 210 schema -struct<> --- !query 210 output - - - --- !query 211 -INSERT INTO num_exp_sub VALUES (5,0,'16397.038491') --- !query 211 schema -struct<> --- !query 211 output - - - --- !query 212 -INSERT INTO num_exp_mul VALUES (5,0,'0') --- !query 212 schema -struct<> 
--- !query 212 output - - - --- !query 213 -INSERT INTO num_exp_div VALUES (5,0,'NaN') --- !query 213 schema -struct<> --- !query 213 output - - - --- !query 214 -INSERT INTO num_exp_add VALUES (5,1,'16397.038491') --- !query 214 schema -struct<> --- !query 214 output - - - --- !query 215 -INSERT INTO num_exp_sub VALUES (5,1,'16397.038491') --- !query 215 schema -struct<> --- !query 215 output - - - --- !query 216 -INSERT INTO num_exp_mul VALUES (5,1,'0') --- !query 216 schema -struct<> --- !query 216 output - - - --- !query 217 -INSERT INTO num_exp_div VALUES (5,1,'NaN') --- !query 217 schema -struct<> --- !query 217 output - - - --- !query 218 -INSERT INTO num_exp_add VALUES (5,2,'-34322095.176906047') --- !query 218 schema -struct<> --- !query 218 output - - - --- !query 219 -INSERT INTO num_exp_sub VALUES (5,2,'34354889.253888047') --- !query 219 schema -struct<> --- !query 219 output - - - --- !query 220 -INSERT INTO num_exp_mul VALUES (5,2,'-563049578578.769242506736077') --- !query 220 schema -struct<> --- !query 220 output - - - --- !query 221 -INSERT INTO num_exp_div VALUES (5,2,'-.00047751189505192446') --- !query 221 schema -struct<> --- !query 221 output - - - --- !query 222 -INSERT INTO num_exp_add VALUES (5,3,'16401.348491') --- !query 222 schema -struct<> --- !query 222 output - - - --- !query 223 -INSERT INTO num_exp_sub VALUES (5,3,'16392.728491') --- !query 223 schema -struct<> --- !query 223 output - - - --- !query 224 -INSERT INTO num_exp_mul VALUES (5,3,'70671.23589621') --- !query 224 schema -struct<> --- !query 224 output - - - --- !query 225 -INSERT INTO num_exp_div VALUES (5,3,'3804.41728329466357308584') --- !query 225 schema -struct<> --- !query 225 output - - - --- !query 226 -INSERT INTO num_exp_add VALUES (5,4,'7815858.450391') --- !query 226 schema -struct<> --- !query 226 output - - - --- !query 227 -INSERT INTO num_exp_sub VALUES (5,4,'-7783064.373409') --- !query 227 schema -struct<> --- !query 227 output - - - --- !query 228 
-INSERT INTO num_exp_mul VALUES (5,4,'127888068979.9935054429') --- !query 228 schema -struct<> --- !query 228 output - - - --- !query 229 -INSERT INTO num_exp_div VALUES (5,4,'.00210232958726897192') --- !query 229 schema -struct<> --- !query 229 output - - - --- !query 230 -INSERT INTO num_exp_add VALUES (5,5,'32794.076982') --- !query 230 schema -struct<> --- !query 230 output - - - --- !query 231 -INSERT INTO num_exp_sub VALUES (5,5,'0') --- !query 231 schema -struct<> --- !query 231 output - - - --- !query 232 -INSERT INTO num_exp_mul VALUES (5,5,'268862871.275335557081') --- !query 232 schema -struct<> --- !query 232 output - - - --- !query 233 -INSERT INTO num_exp_div VALUES (5,5,'1.00000000000000000000') --- !query 233 schema -struct<> --- !query 233 output - - - --- !query 234 -INSERT INTO num_exp_add VALUES (5,6,'110298.61612126') --- !query 234 schema -struct<> --- !query 234 output - - - --- !query 235 -INSERT INTO num_exp_sub VALUES (5,6,'-77504.53913926') --- !query 235 schema -struct<> --- !query 235 output - - - --- !query 236 -INSERT INTO num_exp_mul VALUES (5,6,'1539707782.76899778633766') --- !query 236 schema -struct<> --- !query 236 output - - - --- !query 237 -INSERT INTO num_exp_div VALUES (5,6,'.17461941433576102689') --- !query 237 schema -struct<> --- !query 237 output - - - --- !query 238 -INSERT INTO num_exp_add VALUES (5,7,'-83012087.961509') --- !query 238 schema -struct<> --- !query 238 output - - - --- !query 239 -INSERT INTO num_exp_sub VALUES (5,7,'83044882.038491') --- !query 239 schema -struct<> --- !query 239 output - - - --- !query 240 -INSERT INTO num_exp_mul VALUES (5,7,'-1361421264394.416135') --- !query 240 schema -struct<> --- !query 240 output - - - --- !query 241 -INSERT INTO num_exp_div VALUES (5,7,'-.00019748690453643710') --- !query 241 schema -struct<> --- !query 241 output - - - --- !query 242 -INSERT INTO num_exp_add VALUES (5,8,'91278.038491') --- !query 242 schema -struct<> --- !query 242 output - - - --- !query 
243 -INSERT INTO num_exp_sub VALUES (5,8,'-58483.961509') --- !query 243 schema -struct<> --- !query 243 output - - - --- !query 244 -INSERT INTO num_exp_mul VALUES (5,8,'1227826639.244571') --- !query 244 schema -struct<> --- !query 244 output - - - --- !query 245 -INSERT INTO num_exp_div VALUES (5,8,'.21897461960978085228') --- !query 245 schema -struct<> --- !query 245 output - - - --- !query 246 -INSERT INTO num_exp_add VALUES (5,9,'-24910407.006556420') --- !query 246 schema -struct<> --- !query 246 output - - - --- !query 247 -INSERT INTO num_exp_sub VALUES (5,9,'24943201.083538420') --- !query 247 schema -struct<> --- !query 247 output - - - --- !query 248 -INSERT INTO num_exp_mul VALUES (5,9,'-408725765384.257043660243220') --- !query 248 schema -struct<> --- !query 248 output - - - --- !query 249 -INSERT INTO num_exp_div VALUES (5,9,'-.00065780749354660427') --- !query 249 schema -struct<> --- !query 249 output - - - --- !query 250 -INSERT INTO num_exp_add VALUES (6,0,'93901.57763026') --- !query 250 schema -struct<> --- !query 250 output - - - --- !query 251 -INSERT INTO num_exp_sub VALUES (6,0,'93901.57763026') --- !query 251 schema -struct<> --- !query 251 output - - - --- !query 252 -INSERT INTO num_exp_mul VALUES (6,0,'0') --- !query 252 schema -struct<> --- !query 252 output - - - --- !query 253 -INSERT INTO num_exp_div VALUES (6,0,'NaN') --- !query 253 schema -struct<> --- !query 253 output - - - --- !query 254 -INSERT INTO num_exp_add VALUES (6,1,'93901.57763026') --- !query 254 schema -struct<> --- !query 254 output - - - --- !query 255 -INSERT INTO num_exp_sub VALUES (6,1,'93901.57763026') --- !query 255 schema -struct<> --- !query 255 output - - - --- !query 256 -INSERT INTO num_exp_mul VALUES (6,1,'0') --- !query 256 schema -struct<> --- !query 256 output - - - --- !query 257 -INSERT INTO num_exp_div VALUES (6,1,'NaN') --- !query 257 schema -struct<> --- !query 257 output - - - --- !query 258 -INSERT INTO num_exp_add VALUES 
(6,2,'-34244590.637766787') --- !query 258 schema -struct<> --- !query 258 output - - - --- !query 259 -INSERT INTO num_exp_sub VALUES (6,2,'34432393.793027307') --- !query 259 schema -struct<> --- !query 259 output - - - --- !query 260 -INSERT INTO num_exp_mul VALUES (6,2,'-3224438592470.18449811926184222') --- !query 260 schema -struct<> --- !query 260 output - - - --- !query 261 -INSERT INTO num_exp_div VALUES (6,2,'-.00273458651128995823') --- !query 261 schema -struct<> --- !query 261 output - - - --- !query 262 -INSERT INTO num_exp_add VALUES (6,3,'93905.88763026') --- !query 262 schema -struct<> --- !query 262 output - - - --- !query 263 -INSERT INTO num_exp_sub VALUES (6,3,'93897.26763026') --- !query 263 schema -struct<> --- !query 263 output - - - --- !query 264 -INSERT INTO num_exp_mul VALUES (6,3,'404715.7995864206') --- !query 264 schema -struct<> --- !query 264 output - - - --- !query 265 -INSERT INTO num_exp_div VALUES (6,3,'21786.90896293735498839907') --- !query 265 schema -struct<> --- !query 265 output - - - --- !query 266 -INSERT INTO num_exp_add VALUES (6,4,'7893362.98953026') --- !query 266 schema -struct<> --- !query 266 output - - - --- !query 267 -INSERT INTO num_exp_sub VALUES (6,4,'-7705559.83426974') --- !query 267 schema -struct<> --- !query 267 output - - - --- !query 268 -INSERT INTO num_exp_mul VALUES (6,4,'732381731243.745115764094') --- !query 268 schema -struct<> --- !query 268 output - - - --- !query 269 -INSERT INTO num_exp_div VALUES (6,4,'.01203949512295682469') --- !query 269 schema -struct<> --- !query 269 output - - - --- !query 270 -INSERT INTO num_exp_add VALUES (6,5,'110298.61612126') --- !query 270 schema -struct<> --- !query 270 output - - - --- !query 271 -INSERT INTO num_exp_sub VALUES (6,5,'77504.53913926') --- !query 271 schema -struct<> --- !query 271 output - - - --- !query 272 -INSERT INTO num_exp_mul VALUES (6,5,'1539707782.76899778633766') --- !query 272 schema -struct<> --- !query 272 output - - - --- !query 
273 -INSERT INTO num_exp_div VALUES (6,5,'5.72674008674192359679') --- !query 273 schema -struct<> --- !query 273 output - - - --- !query 274 -INSERT INTO num_exp_add VALUES (6,6,'187803.15526052') --- !query 274 schema -struct<> --- !query 274 output - - - --- !query 275 -INSERT INTO num_exp_sub VALUES (6,6,'0') --- !query 275 schema -struct<> --- !query 275 output - - - --- !query 276 -INSERT INTO num_exp_mul VALUES (6,6,'8817506281.4517452372676676') --- !query 276 schema -struct<> --- !query 276 output - - - --- !query 277 -INSERT INTO num_exp_div VALUES (6,6,'1.00000000000000000000') --- !query 277 schema -struct<> --- !query 277 output - - - --- !query 278 -INSERT INTO num_exp_add VALUES (6,7,'-82934583.42236974') --- !query 278 schema -struct<> --- !query 278 output - - - --- !query 279 -INSERT INTO num_exp_sub VALUES (6,7,'83122386.57763026') --- !query 279 schema -struct<> --- !query 279 output - - - --- !query 280 -INSERT INTO num_exp_mul VALUES (6,7,'-7796505729750.37795610') --- !query 280 schema -struct<> --- !query 280 output - - - --- !query 281 -INSERT INTO num_exp_div VALUES (6,7,'-.00113095617281538980') --- !query 281 schema -struct<> --- !query 281 output - - - --- !query 282 -INSERT INTO num_exp_add VALUES (6,8,'168782.57763026') --- !query 282 schema -struct<> --- !query 282 output - - - --- !query 283 -INSERT INTO num_exp_sub VALUES (6,8,'19020.57763026') --- !query 283 schema -struct<> --- !query 283 output - - - --- !query 284 -INSERT INTO num_exp_mul VALUES (6,8,'7031444034.53149906') --- !query 284 schema -struct<> --- !query 284 output - - - --- !query 285 -INSERT INTO num_exp_div VALUES (6,8,'1.25401073209839612184') --- !query 285 schema -struct<> --- !query 285 output - - - --- !query 286 -INSERT INTO num_exp_add VALUES (6,9,'-24832902.467417160') --- !query 286 schema -struct<> --- !query 286 output - - - --- !query 287 -INSERT INTO num_exp_sub VALUES (6,9,'25020705.622677680') --- !query 287 schema -struct<> --- !query 287 output - 
- - --- !query 288 -INSERT INTO num_exp_mul VALUES (6,9,'-2340666225110.29929521292692920') --- !query 288 schema -struct<> --- !query 288 output - - - --- !query 289 -INSERT INTO num_exp_div VALUES (6,9,'-.00376709254265256789') --- !query 289 schema -struct<> --- !query 289 output - - - --- !query 290 -INSERT INTO num_exp_add VALUES (7,0,'-83028485') --- !query 290 schema -struct<> --- !query 290 output - - - --- !query 291 -INSERT INTO num_exp_sub VALUES (7,0,'-83028485') --- !query 291 schema -struct<> --- !query 291 output - - - --- !query 292 -INSERT INTO num_exp_mul VALUES (7,0,'0') --- !query 292 schema -struct<> --- !query 292 output - - - --- !query 293 -INSERT INTO num_exp_div VALUES (7,0,'NaN') --- !query 293 schema -struct<> --- !query 293 output - - - --- !query 294 -INSERT INTO num_exp_add VALUES (7,1,'-83028485') --- !query 294 schema -struct<> --- !query 294 output - - - --- !query 295 -INSERT INTO num_exp_sub VALUES (7,1,'-83028485') --- !query 295 schema -struct<> --- !query 295 output - - - --- !query 296 -INSERT INTO num_exp_mul VALUES (7,1,'0') --- !query 296 schema -struct<> --- !query 296 output - - - --- !query 297 -INSERT INTO num_exp_div VALUES (7,1,'NaN') --- !query 297 schema -struct<> --- !query 297 output - - - --- !query 298 -INSERT INTO num_exp_add VALUES (7,2,'-117366977.215397047') --- !query 298 schema -struct<> --- !query 298 output - - - --- !query 299 -INSERT INTO num_exp_sub VALUES (7,2,'-48689992.784602953') --- !query 299 schema -struct<> --- !query 299 output - - - --- !query 300 -INSERT INTO num_exp_mul VALUES (7,2,'2851072985828710.485883795') --- !query 300 schema -struct<> --- !query 300 output - - - --- !query 301 -INSERT INTO num_exp_div VALUES (7,2,'2.41794207151503385700') --- !query 301 schema -struct<> --- !query 301 output - - - --- !query 302 -INSERT INTO num_exp_add VALUES (7,3,'-83028480.69') --- !query 302 schema -struct<> --- !query 302 output - - - --- !query 303 -INSERT INTO num_exp_sub VALUES 
(7,3,'-83028489.31') --- !query 303 schema -struct<> --- !query 303 output - - - --- !query 304 -INSERT INTO num_exp_mul VALUES (7,3,'-357852770.35') --- !query 304 schema -struct<> --- !query 304 output - - - --- !query 305 -INSERT INTO num_exp_div VALUES (7,3,'-19264149.65197215777262180974') --- !query 305 schema -struct<> --- !query 305 output - - - --- !query 306 -INSERT INTO num_exp_add VALUES (7,4,'-75229023.5881') --- !query 306 schema -struct<> --- !query 306 output - - - --- !query 307 -INSERT INTO num_exp_sub VALUES (7,4,'-90827946.4119') --- !query 307 schema -struct<> --- !query 307 output - - - --- !query 308 -INSERT INTO num_exp_mul VALUES (7,4,'-647577464846017.9715') --- !query 308 schema -struct<> --- !query 308 output - - - --- !query 309 -INSERT INTO num_exp_div VALUES (7,4,'-10.64541262725136247686') --- !query 309 schema -struct<> --- !query 309 output - - - --- !query 310 -INSERT INTO num_exp_add VALUES (7,5,'-83012087.961509') --- !query 310 schema -struct<> --- !query 310 output - - - --- !query 311 -INSERT INTO num_exp_sub VALUES (7,5,'-83044882.038491') --- !query 311 schema -struct<> --- !query 311 output - - - --- !query 312 -INSERT INTO num_exp_mul VALUES (7,5,'-1361421264394.416135') --- !query 312 schema -struct<> --- !query 312 output - - - --- !query 313 -INSERT INTO num_exp_div VALUES (7,5,'-5063.62688881730941836574') --- !query 313 schema -struct<> --- !query 313 output - - - --- !query 314 -INSERT INTO num_exp_add VALUES (7,6,'-82934583.42236974') --- !query 314 schema -struct<> --- !query 314 output - - - --- !query 315 -INSERT INTO num_exp_sub VALUES (7,6,'-83122386.57763026') --- !query 315 schema -struct<> --- !query 315 output - - - --- !query 316 -INSERT INTO num_exp_mul VALUES (7,6,'-7796505729750.37795610') --- !query 316 schema -struct<> --- !query 316 output - - - --- !query 317 -INSERT INTO num_exp_div VALUES (7,6,'-884.20756174009028770294') --- !query 317 schema -struct<> --- !query 317 output - - - --- !query 318 
-INSERT INTO num_exp_add VALUES (7,7,'-166056970') --- !query 318 schema -struct<> --- !query 318 output - - - --- !query 319 -INSERT INTO num_exp_sub VALUES (7,7,'0') --- !query 319 schema -struct<> --- !query 319 output - - - --- !query 320 -INSERT INTO num_exp_mul VALUES (7,7,'6893729321395225') --- !query 320 schema -struct<> --- !query 320 output - - - --- !query 321 -INSERT INTO num_exp_div VALUES (7,7,'1.00000000000000000000') --- !query 321 schema -struct<> --- !query 321 output - - - --- !query 322 -INSERT INTO num_exp_add VALUES (7,8,'-82953604') --- !query 322 schema -struct<> --- !query 322 output - - - --- !query 323 -INSERT INTO num_exp_sub VALUES (7,8,'-83103366') --- !query 323 schema -struct<> --- !query 323 output - - - --- !query 324 -INSERT INTO num_exp_mul VALUES (7,8,'-6217255985285') --- !query 324 schema -struct<> --- !query 324 output - - - --- !query 325 -INSERT INTO num_exp_div VALUES (7,8,'-1108.80577182462841041118') --- !query 325 schema -struct<> --- !query 325 output - - - --- !query 326 -INSERT INTO num_exp_add VALUES (7,9,'-107955289.045047420') --- !query 326 schema -struct<> --- !query 326 output - - - --- !query 327 -INSERT INTO num_exp_sub VALUES (7,9,'-58101680.954952580') --- !query 327 schema -struct<> --- !query 327 output - - - --- !query 328 -INSERT INTO num_exp_mul VALUES (7,9,'2069634775752159.035758700') --- !query 328 schema -struct<> --- !query 328 output - - - --- !query 329 -INSERT INTO num_exp_div VALUES (7,9,'3.33089171198810413382') --- !query 329 schema -struct<> --- !query 329 output - - - --- !query 330 -INSERT INTO num_exp_add VALUES (8,0,'74881') --- !query 330 schema -struct<> --- !query 330 output - - - --- !query 331 -INSERT INTO num_exp_sub VALUES (8,0,'74881') --- !query 331 schema -struct<> --- !query 331 output - - - --- !query 332 -INSERT INTO num_exp_mul VALUES (8,0,'0') --- !query 332 schema -struct<> --- !query 332 output - - - --- !query 333 -INSERT INTO num_exp_div VALUES (8,0,'NaN') --- !query 
333 schema -struct<> --- !query 333 output - - - --- !query 334 -INSERT INTO num_exp_add VALUES (8,1,'74881') --- !query 334 schema -struct<> --- !query 334 output - - - --- !query 335 -INSERT INTO num_exp_sub VALUES (8,1,'74881') --- !query 335 schema -struct<> --- !query 335 output - - - --- !query 336 -INSERT INTO num_exp_mul VALUES (8,1,'0') --- !query 336 schema -struct<> --- !query 336 output - - - --- !query 337 -INSERT INTO num_exp_div VALUES (8,1,'NaN') --- !query 337 schema -struct<> --- !query 337 output - - - --- !query 338 -INSERT INTO num_exp_add VALUES (8,2,'-34263611.215397047') --- !query 338 schema -struct<> --- !query 338 output - - - --- !query 339 -INSERT INTO num_exp_sub VALUES (8,2,'34413373.215397047') --- !query 339 schema -struct<> --- !query 339 output - - - --- !query 340 -INSERT INTO num_exp_mul VALUES (8,2,'-2571300635581.146276407') --- !query 340 schema -struct<> --- !query 340 output - - - --- !query 341 -INSERT INTO num_exp_div VALUES (8,2,'-.00218067233500788615') --- !query 341 schema -struct<> --- !query 341 output - - - --- !query 342 -INSERT INTO num_exp_add VALUES (8,3,'74885.31') --- !query 342 schema -struct<> --- !query 342 output - - - --- !query 343 -INSERT INTO num_exp_sub VALUES (8,3,'74876.69') --- !query 343 schema -struct<> --- !query 343 output - - - --- !query 344 -INSERT INTO num_exp_mul VALUES (8,3,'322737.11') --- !query 344 schema -struct<> --- !query 344 output - - - --- !query 345 -INSERT INTO num_exp_div VALUES (8,3,'17373.78190255220417633410') --- !query 345 schema -struct<> --- !query 345 output - - - --- !query 346 -INSERT INTO num_exp_add VALUES (8,4,'7874342.4119') --- !query 346 schema -struct<> --- !query 346 output - - - --- !query 347 -INSERT INTO num_exp_sub VALUES (8,4,'-7724580.4119') --- !query 347 schema -struct<> --- !query 347 output - - - --- !query 348 -INSERT INTO num_exp_mul VALUES (8,4,'584031469984.4839') --- !query 348 schema -struct<> --- !query 348 output - - - --- !query 349 
-INSERT INTO num_exp_div VALUES (8,4,'.00960079113741758956') --- !query 349 schema -struct<> --- !query 349 output - - - --- !query 350 -INSERT INTO num_exp_add VALUES (8,5,'91278.038491') --- !query 350 schema -struct<> --- !query 350 output - - - --- !query 351 -INSERT INTO num_exp_sub VALUES (8,5,'58483.961509') --- !query 351 schema -struct<> --- !query 351 output - - - --- !query 352 -INSERT INTO num_exp_mul VALUES (8,5,'1227826639.244571') --- !query 352 schema -struct<> --- !query 352 output - - - --- !query 353 -INSERT INTO num_exp_div VALUES (8,5,'4.56673929509287019456') --- !query 353 schema -struct<> --- !query 353 output - - - --- !query 354 -INSERT INTO num_exp_add VALUES (8,6,'168782.57763026') --- !query 354 schema -struct<> --- !query 354 output - - - --- !query 355 -INSERT INTO num_exp_sub VALUES (8,6,'-19020.57763026') --- !query 355 schema -struct<> --- !query 355 output - - - --- !query 356 -INSERT INTO num_exp_mul VALUES (8,6,'7031444034.53149906') --- !query 356 schema -struct<> --- !query 356 output - - - --- !query 357 -INSERT INTO num_exp_div VALUES (8,6,'.79744134113322314424') --- !query 357 schema -struct<> --- !query 357 output - - - --- !query 358 -INSERT INTO num_exp_add VALUES (8,7,'-82953604') --- !query 358 schema -struct<> --- !query 358 output - - - --- !query 359 -INSERT INTO num_exp_sub VALUES (8,7,'83103366') --- !query 359 schema -struct<> --- !query 359 output - - - --- !query 360 -INSERT INTO num_exp_mul VALUES (8,7,'-6217255985285') --- !query 360 schema -struct<> --- !query 360 output - - - --- !query 361 -INSERT INTO num_exp_div VALUES (8,7,'-.00090187120721280172') --- !query 361 schema -struct<> --- !query 361 output - - - --- !query 362 -INSERT INTO num_exp_add VALUES (8,8,'149762') --- !query 362 schema -struct<> --- !query 362 output - - - --- !query 363 -INSERT INTO num_exp_sub VALUES (8,8,'0') --- !query 363 schema -struct<> --- !query 363 output - - - --- !query 364 -INSERT INTO num_exp_mul VALUES 
(8,8,'5607164161') --- !query 364 schema -struct<> --- !query 364 output - - - --- !query 365 -INSERT INTO num_exp_div VALUES (8,8,'1.00000000000000000000') --- !query 365 schema -struct<> --- !query 365 output - - - --- !query 366 -INSERT INTO num_exp_add VALUES (8,9,'-24851923.045047420') --- !query 366 schema -struct<> --- !query 366 output - - - --- !query 367 -INSERT INTO num_exp_sub VALUES (8,9,'25001685.045047420') --- !query 367 schema -struct<> --- !query 367 output - - - --- !query 368 -INSERT INTO num_exp_mul VALUES (8,9,'-1866544013697.195857020') --- !query 368 schema -struct<> --- !query 368 output - - - --- !query 369 -INSERT INTO num_exp_div VALUES (8,9,'-.00300403532938582735') --- !query 369 schema -struct<> --- !query 369 output - - - --- !query 370 -INSERT INTO num_exp_add VALUES (9,0,'-24926804.045047420') --- !query 370 schema -struct<> --- !query 370 output - - - --- !query 371 -INSERT INTO num_exp_sub VALUES (9,0,'-24926804.045047420') --- !query 371 schema -struct<> --- !query 371 output - - - --- !query 372 -INSERT INTO num_exp_mul VALUES (9,0,'0') --- !query 372 schema -struct<> --- !query 372 output - - - --- !query 373 -INSERT INTO num_exp_div VALUES (9,0,'NaN') --- !query 373 schema -struct<> --- !query 373 output - - - --- !query 374 -INSERT INTO num_exp_add VALUES (9,1,'-24926804.045047420') --- !query 374 schema -struct<> --- !query 374 output - - - --- !query 375 -INSERT INTO num_exp_sub VALUES (9,1,'-24926804.045047420') --- !query 375 schema -struct<> --- !query 375 output - - - --- !query 376 -INSERT INTO num_exp_mul VALUES (9,1,'0') --- !query 376 schema -struct<> --- !query 376 output - - - --- !query 377 -INSERT INTO num_exp_div VALUES (9,1,'NaN') --- !query 377 schema -struct<> --- !query 377 output - - - --- !query 378 -INSERT INTO num_exp_add VALUES (9,2,'-59265296.260444467') --- !query 378 schema -struct<> --- !query 378 output - - - --- !query 379 -INSERT INTO num_exp_sub VALUES (9,2,'9411688.170349627') --- !query 379 
schema -struct<> --- !query 379 output - - - --- !query 380 -INSERT INTO num_exp_mul VALUES (9,2,'855948866655588.453741509242968740') --- !query 380 schema -struct<> --- !query 380 output - - - --- !query 381 -INSERT INTO num_exp_div VALUES (9,2,'.72591434384152961526') --- !query 381 schema -struct<> --- !query 381 output - - - --- !query 382 -INSERT INTO num_exp_add VALUES (9,3,'-24926799.735047420') --- !query 382 schema -struct<> --- !query 382 output - - - --- !query 383 -INSERT INTO num_exp_sub VALUES (9,3,'-24926808.355047420') --- !query 383 schema -struct<> --- !query 383 output - - - --- !query 384 -INSERT INTO num_exp_mul VALUES (9,3,'-107434525.43415438020') --- !query 384 schema -struct<> --- !query 384 output - - - --- !query 385 -INSERT INTO num_exp_div VALUES (9,3,'-5783481.21694835730858468677') --- !query 385 schema -struct<> --- !query 385 output - - - --- !query 386 -INSERT INTO num_exp_add VALUES (9,4,'-17127342.633147420') --- !query 386 schema -struct<> --- !query 386 output - - - --- !query 387 -INSERT INTO num_exp_sub VALUES (9,4,'-32726265.456947420') --- !query 387 schema -struct<> --- !query 387 output - - - --- !query 388 -INSERT INTO num_exp_mul VALUES (9,4,'-194415646271340.1815956522980') --- !query 388 schema -struct<> --- !query 388 output - - - --- !query 389 -INSERT INTO num_exp_div VALUES (9,4,'-3.19596478892958416484') --- !query 389 schema -struct<> --- !query 389 output - - - --- !query 390 -INSERT INTO num_exp_add VALUES (9,5,'-24910407.006556420') --- !query 390 schema -struct<> --- !query 390 output - - - --- !query 391 -INSERT INTO num_exp_sub VALUES (9,5,'-24943201.083538420') --- !query 391 schema -struct<> --- !query 391 output - - - --- !query 392 -INSERT INTO num_exp_mul VALUES (9,5,'-408725765384.257043660243220') --- !query 392 schema -struct<> --- !query 392 output - - - --- !query 393 -INSERT INTO num_exp_div VALUES (9,5,'-1520.20159364322004505807') --- !query 393 schema -struct<> --- !query 393 output - - - 
--- !query 394 -INSERT INTO num_exp_add VALUES (9,6,'-24832902.467417160') --- !query 394 schema -struct<> --- !query 394 output - - - --- !query 395 -INSERT INTO num_exp_sub VALUES (9,6,'-25020705.622677680') --- !query 395 schema -struct<> --- !query 395 output - - - --- !query 396 -INSERT INTO num_exp_mul VALUES (9,6,'-2340666225110.29929521292692920') --- !query 396 schema -struct<> --- !query 396 output - - - --- !query 397 -INSERT INTO num_exp_div VALUES (9,6,'-265.45671195426965751280') --- !query 397 schema -struct<> --- !query 397 output - - - --- !query 398 -INSERT INTO num_exp_add VALUES (9,7,'-107955289.045047420') --- !query 398 schema -struct<> --- !query 398 output - - - --- !query 399 -INSERT INTO num_exp_sub VALUES (9,7,'58101680.954952580') --- !query 399 schema -struct<> --- !query 399 output - - - --- !query 400 -INSERT INTO num_exp_mul VALUES (9,7,'2069634775752159.035758700') --- !query 400 schema -struct<> --- !query 400 output - - - --- !query 401 -INSERT INTO num_exp_div VALUES (9,7,'.30021990699995814689') --- !query 401 schema -struct<> --- !query 401 output - - - --- !query 402 -INSERT INTO num_exp_add VALUES (9,8,'-24851923.045047420') --- !query 402 schema -struct<> --- !query 402 output - - - --- !query 403 -INSERT INTO num_exp_sub VALUES (9,8,'-25001685.045047420') --- !query 403 schema -struct<> --- !query 403 output - - - --- !query 404 -INSERT INTO num_exp_mul VALUES (9,8,'-1866544013697.195857020') --- !query 404 schema -struct<> --- !query 404 output - - - --- !query 405 -INSERT INTO num_exp_div VALUES (9,8,'-332.88556569820675471748') --- !query 405 schema -struct<> --- !query 405 output - - - --- !query 406 -INSERT INTO num_exp_add VALUES (9,9,'-49853608.090094840') --- !query 406 schema -struct<> --- !query 406 output - - - --- !query 407 -INSERT INTO num_exp_sub VALUES (9,9,'0') --- !query 407 schema -struct<> --- !query 407 output - - - --- !query 408 -INSERT INTO num_exp_mul VALUES 
(9,9,'621345559900192.420120630048656400') --- !query 408 schema -struct<> --- !query 408 output - - - --- !query 409 -INSERT INTO num_exp_div VALUES (9,9,'1.00000000000000000000') --- !query 409 schema -struct<> --- !query 409 output - - - --- !query 410 -INSERT INTO num_exp_sqrt VALUES (0,'0') --- !query 410 schema -struct<> --- !query 410 output - - - --- !query 411 -INSERT INTO num_exp_sqrt VALUES (1,'0') --- !query 411 schema -struct<> --- !query 411 output - - - --- !query 412 -INSERT INTO num_exp_sqrt VALUES (2,'5859.90547836712524903505') --- !query 412 schema -struct<> --- !query 412 output - - - --- !query 413 -INSERT INTO num_exp_sqrt VALUES (3,'2.07605394920266944396') --- !query 413 schema -struct<> --- !query 413 output - - - --- !query 414 -INSERT INTO num_exp_sqrt VALUES (4,'2792.75158435189147418923') --- !query 414 schema -struct<> --- !query 414 output - - - --- !query 415 -INSERT INTO num_exp_sqrt VALUES (5,'128.05092147657509145473') --- !query 415 schema -struct<> --- !query 415 output - - - --- !query 416 -INSERT INTO num_exp_sqrt VALUES (6,'306.43364311096782703406') --- !query 416 schema -struct<> --- !query 416 output - - - --- !query 417 -INSERT INTO num_exp_sqrt VALUES (7,'9111.99676251039939975230') --- !query 417 schema -struct<> --- !query 417 output - - - --- !query 418 -INSERT INTO num_exp_sqrt VALUES (8,'273.64392922189960397542') --- !query 418 schema -struct<> --- !query 418 output - - - --- !query 419 -INSERT INTO num_exp_sqrt VALUES (9,'4992.67503899937593364766') --- !query 419 schema -struct<> --- !query 419 output - - - --- !query 420 -INSERT INTO num_exp_ln VALUES (0,'NaN') --- !query 420 schema -struct<> --- !query 420 output - - - --- !query 421 -INSERT INTO num_exp_ln VALUES (1,'NaN') --- !query 421 schema -struct<> --- !query 421 output - - - --- !query 422 -INSERT INTO num_exp_ln VALUES (2,'17.35177750493897715514') --- !query 422 schema -struct<> --- !query 422 output - - - --- !query 423 -INSERT INTO num_exp_ln 
VALUES (3,'1.46093790411565641971') --- !query 423 schema -struct<> --- !query 423 output - - - --- !query 424 -INSERT INTO num_exp_ln VALUES (4,'15.86956523951936572464') --- !query 424 schema -struct<> --- !query 424 output - - - --- !query 425 -INSERT INTO num_exp_ln VALUES (5,'9.70485601768871834038') --- !query 425 schema -struct<> --- !query 425 output - - - --- !query 426 -INSERT INTO num_exp_ln VALUES (6,'11.45000246622944403127') --- !query 426 schema -struct<> --- !query 426 output - - - --- !query 427 -INSERT INTO num_exp_ln VALUES (7,'18.23469429965478772991') --- !query 427 schema -struct<> --- !query 427 output - - - --- !query 428 -INSERT INTO num_exp_ln VALUES (8,'11.22365546576315513668') --- !query 428 schema -struct<> --- !query 428 output - - - --- !query 429 -INSERT INTO num_exp_ln VALUES (9,'17.03145425013166006962') --- !query 429 schema -struct<> --- !query 429 output - - - --- !query 430 -INSERT INTO num_exp_log10 VALUES (0,'NaN') --- !query 430 schema -struct<> --- !query 430 output - - - --- !query 431 -INSERT INTO num_exp_log10 VALUES (1,'NaN') --- !query 431 schema -struct<> --- !query 431 output - - - --- !query 432 -INSERT INTO num_exp_log10 VALUES (2,'7.53578122160797276459') --- !query 432 schema -struct<> --- !query 432 output - - - --- !query 433 -INSERT INTO num_exp_log10 VALUES (3,'.63447727016073160075') --- !query 433 schema -struct<> --- !query 433 output - - - --- !query 434 -INSERT INTO num_exp_log10 VALUES (4,'6.89206461372691743345') --- !query 434 schema -struct<> --- !query 434 output - - - --- !query 435 -INSERT INTO num_exp_log10 VALUES (5,'4.21476541614777768626') --- !query 435 schema -struct<> --- !query 435 output - - - --- !query 436 -INSERT INTO num_exp_log10 VALUES (6,'4.97267288886207207671') --- !query 436 schema -struct<> --- !query 436 output - - - --- !query 437 -INSERT INTO num_exp_log10 VALUES (7,'7.91922711353275546914') --- !query 437 schema -struct<> --- !query 437 output - - - --- !query 438 -INSERT 
INTO num_exp_log10 VALUES (8,'4.87437163556421004138') --- !query 438 schema -struct<> --- !query 438 output - - - --- !query 439 -INSERT INTO num_exp_log10 VALUES (9,'7.39666659961986567059') --- !query 439 schema -struct<> --- !query 439 output - - - --- !query 440 -INSERT INTO num_exp_power_10_ln VALUES (0,'NaN') --- !query 440 schema -struct<> --- !query 440 output - - - --- !query 441 -INSERT INTO num_exp_power_10_ln VALUES (1,'NaN') --- !query 441 schema -struct<> --- !query 441 output - - - --- !query 442 -INSERT INTO num_exp_power_10_ln VALUES (2,'224790267919917955.13261618583642653184') --- !query 442 schema -struct<> --- !query 442 output - - - --- !query 443 -INSERT INTO num_exp_power_10_ln VALUES (3,'28.90266599445155957393') --- !query 443 schema -struct<> --- !query 443 output - - - --- !query 444 -INSERT INTO num_exp_power_10_ln VALUES (4,'7405685069594999.07733999469386277636') --- !query 444 schema -struct<> --- !query 444 output - - - --- !query 445 -INSERT INTO num_exp_power_10_ln VALUES (5,'5068226527.32127265408584640098') --- !query 445 schema -struct<> --- !query 445 output - - - --- !query 446 -INSERT INTO num_exp_power_10_ln VALUES (6,'281839893606.99372343357047819067') --- !query 446 schema -struct<> --- !query 446 output - - - --- !query 447 -INSERT INTO num_exp_power_10_ln VALUES (7,'1716699575118597095.42330819910640247627') --- !query 447 schema -struct<> --- !query 447 output - - - --- !query 448 -INSERT INTO num_exp_power_10_ln VALUES (8,'167361463828.07491320069016125952') --- !query 448 schema -struct<> --- !query 448 output - - - --- !query 449 -INSERT INTO num_exp_power_10_ln VALUES (9,'107511333880052007.04141124673540337457') --- !query 449 schema -struct<> --- !query 449 output - - - --- !query 450 -INSERT INTO num_data VALUES (0, '0') --- !query 450 schema -struct<> --- !query 450 output - - - --- !query 451 -INSERT INTO num_data VALUES (1, '0') --- !query 451 schema -struct<> --- !query 451 output - - - --- !query 452 
-INSERT INTO num_data VALUES (2, '-34338492.215397047') --- !query 452 schema -struct<> --- !query 452 output - - - --- !query 453 -INSERT INTO num_data VALUES (3, '4.31') --- !query 453 schema -struct<> --- !query 453 output - - - --- !query 454 -INSERT INTO num_data VALUES (4, '7799461.4119') --- !query 454 schema -struct<> --- !query 454 output - - - --- !query 455 -INSERT INTO num_data VALUES (5, '16397.038491') --- !query 455 schema -struct<> --- !query 455 output - - - --- !query 456 -INSERT INTO num_data VALUES (6, '93901.57763026') --- !query 456 schema -struct<> --- !query 456 output - - - --- !query 457 -INSERT INTO num_data VALUES (7, '-83028485') --- !query 457 schema -struct<> --- !query 457 output - - - --- !query 458 -INSERT INTO num_data VALUES (8, '74881') --- !query 458 schema -struct<> --- !query 458 output - - - --- !query 459 -INSERT INTO num_data VALUES (9, '-24926804.045047420') --- !query 459 schema -struct<> --- !query 459 output - - - --- !query 460 -SELECT * FROM num_data --- !query 460 schema -struct --- !query 460 output -0 0 -1 0 -2 -34338492.215397047 -3 4.31 -4 7799461.4119 -5 16397.038491 -6 93901.57763026 -7 -83028485 -8 74881 -9 -24926804.04504742 - - --- !query 461 -TRUNCATE TABLE num_result --- !query 461 schema -struct<> --- !query 461 output - - - --- !query 462 -INSERT INTO num_result SELECT t1.id, t2.id, t1.val + t2.val - FROM num_data t1, num_data t2 --- !query 462 schema -struct<> --- !query 462 output - - - --- !query 463 -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_add t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected --- !query 463 schema -struct --- !query 463 output - - - --- !query 464 -TRUNCATE TABLE num_result --- !query 464 schema -struct<> --- !query 464 output - - - --- !query 465 -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val + t2.val, 10) - FROM num_data t1, num_data t2 --- !query 465 schema -struct<> --- !query 465 output - - - --- !query 
466 -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 10) as expected - FROM num_result t1, num_exp_add t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 10) --- !query 466 schema -struct --- !query 466 output - - - --- !query 467 -TRUNCATE TABLE num_result --- !query 467 schema -struct<> --- !query 467 output - - - --- !query 468 -INSERT INTO num_result SELECT t1.id, t2.id, t1.val - t2.val - FROM num_data t1, num_data t2 --- !query 468 schema -struct<> --- !query 468 output - - - --- !query 469 -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_sub t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected --- !query 469 schema -struct --- !query 469 output - - - --- !query 470 -TRUNCATE TABLE num_result --- !query 470 schema -struct<> --- !query 470 output - - - --- !query 471 -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val - t2.val, 40) - FROM num_data t1, num_data t2 --- !query 471 schema -struct<> --- !query 471 output - - - --- !query 472 -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 40) - FROM num_result t1, num_exp_sub t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 40) --- !query 472 schema -struct --- !query 472 output - - - --- !query 473 -TRUNCATE TABLE num_result --- !query 473 schema -struct<> --- !query 473 output - - - --- !query 474 -INSERT INTO num_result SELECT t1.id, t2.id, t1.val, t2.val, t1.val * t2.val - FROM num_data t1, num_data t2 --- !query 474 schema -struct<> --- !query 474 output -org.apache.spark.sql.AnalysisException -`default`.`num_result` requires that the data to be inserted have the same number of columns as the target table: target table has 3 column(s) but the inserted data has 5 column(s), including 0 partition column(s) having constant value(s).; - - --- !query 475 -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_mul t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = 
t2.id2 - AND t1.result != t2.expected --- !query 475 schema -struct --- !query 475 output - - - --- !query 476 -TRUNCATE TABLE num_result --- !query 476 schema -struct<> --- !query 476 output - - - --- !query 477 -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val * t2.val, 30) - FROM num_data t1, num_data t2 --- !query 477 schema -struct<> --- !query 477 output - - - --- !query 478 -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 30) as expected - FROM num_result t1, num_exp_mul t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 30) --- !query 478 schema -struct --- !query 478 output -2 2 1179132047626883.596862 1179132047626883.5968621359 -2 3 -147998901.448361 -147998901.4483612726 -2 4 -267821744976817.811114 -267821744976817.8111137107 -2 5 -563049578578.769243 -563049578578.7692425067 -2 6 -3224438592470.184498 -3224438592470.1844981193 -2 7 2851072985828710.485884 2851072985828710.485883795 -2 8 -2571300635581.146276 -2571300635581.146276407 -2 9 855948866655588.453742 855948866655588.4537415092 -3 2 -147998901.448361 -147998901.4483612726 -3 5 70671.235896 70671.23589621 -3 6 404715.799586 404715.7995864206 -3 9 -107434525.434154 -107434525.4341543802 -4 2 -267821744976817.811114 -267821744976817.8111137107 -4 4 60831598315717.141462 60831598315717.14146161 -4 5 127888068979.993505 127888068979.9935054429 -4 6 732381731243.745116 732381731243.7451157641 -4 9 -194415646271340.181596 -194415646271340.1815956523 -5 2 -563049578578.769243 -563049578578.7692425067 -5 3 70671.235896 70671.23589621 -5 4 127888068979.993505 127888068979.9935054429 -5 5 268862871.275336 268862871.2753355571 -5 6 1539707782.768998 1539707782.7689977863 -5 9 -408725765384.257044 -408725765384.2570436602 -6 2 -3224438592470.184498 -3224438592470.1844981193 -6 3 404715.799586 404715.7995864206 -6 4 732381731243.745116 732381731243.7451157641 -6 5 1539707782.768998 1539707782.7689977863 -6 6 8817506281.451745 8817506281.4517452373 -6 7 
-7796505729750.377956 -7796505729750.3779561 -6 8 7031444034.531499 7031444034.53149906 -6 9 -2340666225110.299295 -2340666225110.2992952129 -7 2 2851072985828710.485884 2851072985828710.485883795 -7 6 -7796505729750.377956 -7796505729750.3779561 -7 9 2069634775752159.035759 2069634775752159.0357587 -8 2 -2571300635581.146276 -2571300635581.146276407 -8 6 7031444034.531499 7031444034.53149906 -8 9 -1866544013697.195857 -1866544013697.19585702 -9 2 855948866655588.453742 855948866655588.4537415092 -9 3 -107434525.434154 -107434525.4341543802 -9 4 -194415646271340.181596 -194415646271340.1815956523 -9 5 -408725765384.257044 -408725765384.2570436602 -9 6 -2340666225110.299295 -2340666225110.2992952129 -9 7 2069634775752159.035759 2069634775752159.0357587 -9 8 -1866544013697.195857 -1866544013697.19585702 -9 9 621345559900192.420121 621345559900192.42012063 - - --- !query 479 -TRUNCATE TABLE num_result --- !query 479 schema -struct<> --- !query 479 output - - - --- !query 480 -INSERT INTO num_result SELECT t1.id, t2.id, t1.val / t2.val - FROM num_data t1, num_data t2 - WHERE t2.val != '0.0' --- !query 480 schema -struct<> --- !query 480 output - - - --- !query 481 -SELECT t1.id1, t1.id2, t1.result, t2.expected - FROM num_result t1, num_exp_div t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != t2.expected --- !query 481 schema -struct --- !query 481 output -2 3 -7967167.567378 -7967167.5673775051 -2 4 -4.402675 -4.4026748005 -2 5 -2094.188669 -2094.1886691456 -2 6 -365.685999 -365.6859989148 -2 7 0.413575 0.4135748378 -2 8 -458.574167 -458.5741672173 -2 9 1.377573 1.3775729995 -3 2 0 -0.0000001255 -3 4 0.000001 0.0000005526 -3 5 0.000263 0.0002628523 -3 6 0.000046 0.0000458991 -3 7 0 -0.0000000519 -3 8 0.000058 0.000057558 -3 9 0 -0.0000001729 -4 2 -0.227135 -0.22713465 -4 3 1809619.817146 1809619.8171461717 -4 5 475.66281 475.6628104631 -4 6 83.059961 83.0599613844 -4 7 -0.093937 -0.093937176 -4 8 104.158083 104.1580829837 -4 9 -0.312895 -0.3128945611 
-5 2 -0.000478 -0.0004775119 -5 3 3804.417283 3804.4172832947 -5 4 0.002102 0.0021023296 -5 6 0.174619 0.1746194143 -5 7 -0.000197 -0.0001974869 -5 8 0.218975 0.2189746196 -5 9 -0.000658 -0.0006578075 -6 2 -0.002735 -0.0027345865 -6 3 21786.908963 21786.9089629374 -6 4 0.012039 0.0120394951 -6 5 5.72674 5.7267400867 -6 7 -0.001131 -0.0011309562 -6 8 1.254011 1.2540107321 -6 9 -0.003767 -0.0037670925 -7 2 2.417942 2.4179420715 -7 3 -19264149.651972 -19264149.6519721578 -7 4 -10.645413 -10.6454126273 -7 5 -5063.626889 -5063.6268888173 -7 6 -884.207562 -884.2075617401 -7 8 -1108.805772 -1108.8057718246 -7 9 3.330892 3.330891712 -8 2 -0.002181 -0.0021806723 -8 3 17373.781903 17373.7819025522 -8 4 0.009601 0.0096007911 -8 5 4.566739 4.5667392951 -8 6 0.797441 0.7974413411 -8 7 -0.000902 -0.0009018712 -8 9 -0.003004 -0.0030040353 -9 2 0.725914 0.7259143438 -9 3 -5783481.216948 -5783481.2169483573 -9 4 -3.195965 -3.1959647889 -9 5 -1520.201594 -1520.2015936432 -9 6 -265.456712 -265.4567119543 -9 7 0.30022 0.300219907 -9 8 -332.885566 -332.8855656982 - - --- !query 482 -TRUNCATE TABLE num_result --- !query 482 schema -struct<> --- !query 482 output - - - --- !query 483 -INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val / t2.val, 80) - FROM num_data t1, num_data t2 - WHERE t2.val != '0.0' --- !query 483 schema -struct<> --- !query 483 output - - - --- !query 484 -SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 80) as expected - FROM num_result t1, num_exp_div t2 - WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 - AND t1.result != round(t2.expected, 80) --- !query 484 schema -struct --- !query 484 output -2 3 -7967167.567378 -7967167.5673775051 -2 4 -4.402675 -4.4026748005 -2 5 -2094.188669 -2094.1886691456 -2 6 -365.685999 -365.6859989148 -2 7 0.413575 0.4135748378 -2 8 -458.574167 -458.5741672173 -2 9 1.377573 1.3775729995 -3 2 0 -0.0000001255 -3 4 0.000001 0.0000005526 -3 5 0.000263 0.0002628523 -3 6 0.000046 0.0000458991 -3 7 0 -0.0000000519 -3 8 0.000058 
0.000057558 -3 9 0 -0.0000001729 -4 2 -0.227135 -0.22713465 -4 3 1809619.817146 1809619.8171461717 -4 5 475.66281 475.6628104631 -4 6 83.059961 83.0599613844 -4 7 -0.093937 -0.093937176 -4 8 104.158083 104.1580829837 -4 9 -0.312895 -0.3128945611 -5 2 -0.000478 -0.0004775119 -5 3 3804.417283 3804.4172832947 -5 4 0.002102 0.0021023296 -5 6 0.174619 0.1746194143 -5 7 -0.000197 -0.0001974869 -5 8 0.218975 0.2189746196 -5 9 -0.000658 -0.0006578075 -6 2 -0.002735 -0.0027345865 -6 3 21786.908963 21786.9089629374 -6 4 0.012039 0.0120394951 -6 5 5.72674 5.7267400867 -6 7 -0.001131 -0.0011309562 -6 8 1.254011 1.2540107321 -6 9 -0.003767 -0.0037670925 -7 2 2.417942 2.4179420715 -7 3 -19264149.651972 -19264149.6519721578 -7 4 -10.645413 -10.6454126273 -7 5 -5063.626889 -5063.6268888173 -7 6 -884.207562 -884.2075617401 -7 8 -1108.805772 -1108.8057718246 -7 9 3.330892 3.330891712 -8 2 -0.002181 -0.0021806723 -8 3 17373.781903 17373.7819025522 -8 4 0.009601 0.0096007911 -8 5 4.566739 4.5667392951 -8 6 0.797441 0.7974413411 -8 7 -0.000902 -0.0009018712 -8 9 -0.003004 -0.0030040353 -9 2 0.725914 0.7259143438 -9 3 -5783481.216948 -5783481.2169483573 -9 4 -3.195965 -3.1959647889 -9 5 -1520.201594 -1520.2015936432 -9 6 -265.456712 -265.4567119543 -9 7 0.30022 0.300219907 -9 8 -332.885566 -332.8855656982 - - --- !query 485 -TRUNCATE TABLE num_result --- !query 485 schema -struct<> --- !query 485 output - - - --- !query 486 -INSERT INTO num_result SELECT id, 0, SQRT(ABS(val)) - FROM num_data --- !query 486 schema -struct<> --- !query 486 output - - - --- !query 487 -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_sqrt t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected --- !query 487 schema -struct --- !query 487 output - - - --- !query 488 -TRUNCATE TABLE num_result --- !query 488 schema -struct<> --- !query 488 output - - - --- !query 489 -INSERT INTO num_result SELECT id, 0, LN(ABS(val)) - FROM num_data - WHERE val != '0.0' --- !query 489 schema -struct<> --- 
!query 489 output - - - --- !query 490 -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_ln t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected --- !query 490 schema -struct --- !query 490 output - - - --- !query 491 -TRUNCATE TABLE num_result --- !query 491 schema -struct<> --- !query 491 output - - - --- !query 492 -INSERT INTO num_result SELECT id, 0, LOG(cast('10' as decimal(38, 18)), ABS(val)) - FROM num_data - WHERE val != '0.0' --- !query 492 schema -struct<> --- !query 492 output - - - --- !query 493 -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_log10 t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected --- !query 493 schema -struct --- !query 493 output - - - --- !query 494 -TRUNCATE TABLE num_result --- !query 494 schema -struct<> --- !query 494 output - - - --- !query 495 -INSERT INTO num_result SELECT id, 0, POWER(cast('10' as decimal(38, 18)), LN(ABS(round(val,200)))) - FROM num_data - WHERE val != '0.0' --- !query 495 schema -struct<> --- !query 495 output - - - --- !query 496 -SELECT t1.id1, t1.result, t2.expected - FROM num_result t1, num_exp_power_10_ln t2 - WHERE t1.id1 = t2.id - AND t1.result != t2.expected --- !query 496 schema -struct --- !query 496 output -2 224790267919917440 224790267919917955.1326161858 -4 7405685069595001 7405685069594999.0773399947 -5 5068226527.321263 5068226527.3212726541 -6 281839893606.99365 281839893606.9937234336 -7 1716699575118595580 1716699575118597095.4233081991 -8 167361463828.0749 167361463828.0749132007 -9 107511333880051872 107511333880052007.0414112467 - - --- !query 497 -SELECT AVG(val) FROM num_data --- !query 497 schema -struct --- !query 497 output --13430913.5922423207 - - --- !query 498 -CREATE TABLE fract_only (id int, val decimal(4,4)) USING parquet --- !query 498 schema -struct<> --- !query 498 output - - - --- !query 499 -INSERT INTO fract_only VALUES (1, '0.0') --- !query 499 schema -struct<> --- !query 499 output - - - --- !query 500 -INSERT 
INTO fract_only VALUES (2, '0.1') --- !query 500 schema -struct<> --- !query 500 output - - - --- !query 501 -INSERT INTO fract_only VALUES (4, '-0.9999') --- !query 501 schema -struct<> --- !query 501 output - - - --- !query 502 -INSERT INTO fract_only VALUES (5, '0.99994') --- !query 502 schema -struct<> --- !query 502 output - - - --- !query 503 -INSERT INTO fract_only VALUES (7, '0.00001') --- !query 503 schema -struct<> --- !query 503 output - - - --- !query 504 -INSERT INTO fract_only VALUES (8, '0.00017') --- !query 504 schema -struct<> --- !query 504 output - - - --- !query 505 -SELECT * FROM fract_only --- !query 505 schema -struct --- !query 505 output -1 0 -2 0.1 -4 -0.9999 -5 0.9999 -7 0 -8 0.0002 - - --- !query 506 -DROP TABLE fract_only --- !query 506 schema -struct<> --- !query 506 output - - - --- !query 507 -SELECT decimal(double('NaN')) --- !query 507 schema -struct --- !query 507 output -NULL - - --- !query 508 -SELECT decimal(double('Infinity')) --- !query 508 schema -struct --- !query 508 output -NULL - - --- !query 509 -SELECT decimal(double('-Infinity')) --- !query 509 schema -struct --- !query 509 output -NULL - - --- !query 510 -SELECT decimal(float('NaN')) --- !query 510 schema -struct --- !query 510 output -NULL - - --- !query 511 -SELECT decimal(float('Infinity')) --- !query 511 schema -struct --- !query 511 output -NULL - - --- !query 512 -SELECT decimal(float('-Infinity')) --- !query 512 schema -struct --- !query 512 output -NULL - - --- !query 513 -CREATE TABLE ceil_floor_round (a decimal(38, 18)) USING parquet --- !query 513 schema -struct<> --- !query 513 output - - - --- !query 514 -INSERT INTO ceil_floor_round VALUES ('-5.5') --- !query 514 schema -struct<> --- !query 514 output - - - --- !query 515 -INSERT INTO ceil_floor_round VALUES ('-5.499999') --- !query 515 schema -struct<> --- !query 515 output - - - --- !query 516 -INSERT INTO ceil_floor_round VALUES ('9.5') --- !query 516 schema -struct<> --- !query 516 output - - - --- 
!query 517 -INSERT INTO ceil_floor_round VALUES ('9.4999999') --- !query 517 schema -struct<> --- !query 517 output - - - --- !query 518 -INSERT INTO ceil_floor_round VALUES ('0.0') --- !query 518 schema -struct<> --- !query 518 output - - - --- !query 519 -INSERT INTO ceil_floor_round VALUES ('0.0000001') --- !query 519 schema -struct<> --- !query 519 output - - - --- !query 520 -INSERT INTO ceil_floor_round VALUES ('-0.000001') --- !query 520 schema -struct<> --- !query 520 output - - - --- !query 521 -SELECT a, ceil(a), ceiling(a), floor(a), round(a) FROM ceil_floor_round --- !query 521 schema -struct --- !query 521 output --0.000001 0 0 -1 0 --5.499999 -5 -5 -6 -5 --5.5 -5 -5 -6 -6 -0 0 0 0 0 -0.0000001 1 1 0 0 -9.4999999 10 10 9 9 -9.5 10 10 9 10 - - --- !query 522 -DROP TABLE ceil_floor_round --- !query 522 schema -struct<> --- !query 522 output - - - --- !query 523 -CREATE TABLE num_input_test (n1 decimal(38, 18)) USING parquet --- !query 523 schema -struct<> --- !query 523 output - - - --- !query 524 -INSERT INTO num_input_test VALUES (trim(' 123')) --- !query 524 schema -struct<> --- !query 524 output - - - --- !query 525 -INSERT INTO num_input_test VALUES (trim(' 3245874 ')) --- !query 525 schema -struct<> --- !query 525 output - - - --- !query 526 -INSERT INTO num_input_test VALUES (trim(' -93853')) --- !query 526 schema -struct<> --- !query 526 output - - - --- !query 527 -INSERT INTO num_input_test VALUES ('555.50') --- !query 527 schema -struct<> --- !query 527 output - - - --- !query 528 -INSERT INTO num_input_test VALUES ('-555.50') --- !query 528 schema -struct<> --- !query 528 output - - - --- !query 529 -SELECT * FROM num_input_test --- !query 529 schema -struct --- !query 529 output --555.5 --93853 -123 -3245874 -555.5 - - --- !query 530 -select cast(999999999999999999999 as decimal(38, 0))/1000000000000000000000 --- !query 530 schema -struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) / CAST(1000000000000000000000 AS 
DECIMAL(38,0))):decimal(38,6)> --- !query 530 output -1 - - --- !query 531 -select div(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000) --- !query 531 schema -struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> --- !query 531 output -0 - - --- !query 532 -select mod(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000) --- !query 532 schema -struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(22,0)> --- !query 532 output -999999999999999999999 - - --- !query 533 -select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) --- !query 533 schema -struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> --- !query 533 output --9 - - --- !query 534 -select mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) --- !query 534 schema -struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(22,0)> --- !query 534 output --999999999999999999999 - - --- !query 535 -select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000)*1000000000000000000000 + mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) --- !query 535 schema -struct<(CAST((CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) * CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) + CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0))):decimal(38,0)> --- !query 535 output --9999999999999999999999 - - --- !query 536 -select mod (70.0,70) --- !query 536 schema 
-struct<(CAST(70.0 AS DECIMAL(3,1)) % CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(3,1)> --- !query 536 output -0 - - --- !query 537 -select div (70.0,70) --- !query 537 schema -struct<(CAST(70.0 AS DECIMAL(3,1)) div CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(2,0)> --- !query 537 output -1 - - --- !query 538 -select 70.0 / 70 --- !query 538 schema -struct<(CAST(70.0 AS DECIMAL(3,1)) / CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(8,6)> --- !query 538 output -1 - - --- !query 539 -select 12345678901234567890 % 123 --- !query 539 schema -struct<(CAST(12345678901234567890 AS DECIMAL(20,0)) % CAST(CAST(123 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> --- !query 539 output -78 - - --- !query 540 -select exp(0.0) --- !query 540 schema -struct --- !query 540 output -1.0 - - --- !query 541 -select exp(1.0) --- !query 541 schema -struct --- !query 541 output -2.7182818284590455 - - --- !query 542 -select exp(32.999) --- !query 542 schema -struct --- !query 542 output -2.1442904349215556E14 - - --- !query 543 -select exp(-32.999) --- !query 543 schema -struct --- !query 543 output -4.663547361468238E-15 - - --- !query 544 -select exp(123.456) --- !query 544 schema -struct --- !query 544 output -4.132944352778106E53 - - --- !query 545 -select exp(-123.456) --- !query 545 schema -struct --- !query 545 output -2.4195825412645934E-54 - - --- !query 546 -select exp(1234.5678) --- !query 546 schema -struct --- !query 546 output -Infinity - - --- !query 547 -select * from range(cast(0.0 as decimal(38, 18)), cast(4.0 as decimal(38, 18))) --- !query 547 schema -struct --- !query 547 output -0 -1 -2 -3 - - --- !query 548 -select * from range(cast(0.1 as decimal(38, 18)), cast(4.0 as decimal(38, 18)), cast(1.3 as decimal(38, 18))) --- !query 548 schema -struct --- !query 548 output -0 -1 -2 -3 - - --- !query 549 -select * from range(cast(4.0 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), cast(-2.2 as decimal(38, 18))) --- !query 549 schema 
-struct --- !query 549 output -0 -2 -4 - - --- !query 550 -select ln(1.2345678e-28) --- !query 550 schema -struct --- !query 550 output --64.26166165451762 - - --- !query 551 -select ln(0.0456789) --- !query 551 schema -struct --- !query 551 output --3.0861187944847437 - - --- !query 552 -select ln(0.99949452) --- !query 552 schema -struct --- !query 552 output --5.056077980832118E-4 - - --- !query 553 -select ln(1.00049687395) --- !query 553 schema -struct --- !query 553 output -4.967505490136803E-4 - - --- !query 554 -select ln(1234.567890123456789) --- !query 554 schema -struct --- !query 554 output -7.11847630129779 - - --- !query 555 -select ln(5.80397490724e5) --- !query 555 schema -struct --- !query 555 output -13.271468476626518 - - --- !query 556 -select ln(9.342536355e34) --- !query 556 schema -struct --- !query 556 output -80.52247093552418 - - --- !query 557 -select log(3.4634998359873254962349856073435545) --- !query 557 schema -struct --- !query 557 output -1.2422795911259166 - - --- !query 558 -select log(9.999999999999999999) --- !query 558 schema -struct --- !query 558 output -2.302585092994046 - - --- !query 559 -select log(10.00000000000000000) --- !query 559 schema -struct --- !query 559 output -2.302585092994046 - - --- !query 560 -select log(10.00000000000000001) --- !query 560 schema -struct --- !query 560 output -2.302585092994046 - - --- !query 561 -select log(590489.45235237) --- !query 561 schema -struct --- !query 561 output -13.288707052228641 - - --- !query 562 -select log(0.99923, 4.58934e34) --- !query 562 schema -struct --- !query 562 output --103611.55579543479 - - --- !query 563 -select log(1.000016, 8.452010e18) --- !query 563 schema -struct --- !query 563 output -2723830.287707013 - - --- !query 564 -SELECT SUM(decimal(9999)) FROM range(1, 100001) --- !query 564 schema -struct --- !query 564 output -999900000 - - --- !query 565 -SELECT SUM(decimal(-9999)) FROM range(1, 100001) --- !query 565 schema -struct --- !query 565 output 
--999900000 - - --- !query 566 -DROP TABLE num_data --- !query 566 schema -struct<> --- !query 566 output - - - --- !query 567 -DROP TABLE num_exp_add --- !query 567 schema -struct<> --- !query 567 output - - - --- !query 568 -DROP TABLE num_exp_sub --- !query 568 schema -struct<> --- !query 568 output - - - --- !query 569 -DROP TABLE num_exp_div --- !query 569 schema -struct<> --- !query 569 output - - - --- !query 570 -DROP TABLE num_exp_mul --- !query 570 schema -struct<> --- !query 570 output - - - --- !query 571 -DROP TABLE num_exp_sqrt --- !query 571 schema -struct<> --- !query 571 output - - - --- !query 572 -DROP TABLE num_exp_ln --- !query 572 schema -struct<> --- !query 572 output - - - --- !query 573 -DROP TABLE num_exp_log10 --- !query 573 schema -struct<> --- !query 573 output - - - --- !query 574 -DROP TABLE num_exp_power_10_ln --- !query 574 schema -struct<> --- !query 574 output - - - --- !query 575 -DROP TABLE num_result --- !query 575 schema -struct<> --- !query 575 output - - - --- !query 576 -DROP TABLE num_input_test --- !query 576 schema -struct<> --- !query 576 output - diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/strings.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/strings.sql.out deleted file mode 100644 index 1e4e6e5021de8..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/strings.sql.out +++ /dev/null @@ -1,750 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 92 - - --- !query 0 -SELECT 'first line' -' - next line' - ' - third line' - AS `Three lines to one` --- !query 0 schema -struct --- !query 0 output -first line - next line - third line - - --- !query 1 -SELECT 'first line' -' - next line' /* this comment is not allowed here */ -' - third line' - AS `Illegal comment within continuation` --- !query 1 schema -struct --- !query 1 output -first line - next line - third line - - --- !query 2 -SELECT binary('\\xDeAdBeEf') --- !query 2 schema -struct 
--- !query 2 output -\xDeAdBeEf - - --- !query 3 -SELECT binary('\\x De Ad Be Ef ') --- !query 3 schema -struct --- !query 3 output -\x De Ad Be Ef - - --- !query 4 -SELECT binary('\\xDe00BeEf') --- !query 4 schema -struct --- !query 4 output -\xDe00BeEf - - --- !query 5 -SELECT binary('DeAdBeEf') --- !query 5 schema -struct --- !query 5 output -DeAdBeEf - - --- !query 6 -SELECT binary('De\\000dBeEf') --- !query 6 schema -struct --- !query 6 output -De\000dBeEf - - --- !query 7 -SELECT binary('De\\123dBeEf') --- !query 7 schema -struct --- !query 7 output -De\123dBeEf - - --- !query 8 -SELECT TRIM(BOTH FROM ' bunch o blanks ') = 'bunch o blanks' AS `bunch o blanks` --- !query 8 schema -struct --- !query 8 output -true - - --- !query 9 -SELECT TRIM(LEADING FROM ' bunch o blanks ') = 'bunch o blanks ' AS `bunch o blanks ` --- !query 9 schema -struct --- !query 9 output -true - - --- !query 10 -SELECT TRIM(TRAILING FROM ' bunch o blanks ') = ' bunch o blanks' AS ` bunch o blanks` --- !query 10 schema -struct< bunch o blanks:boolean> --- !query 10 output -true - - --- !query 11 -SELECT TRIM(BOTH 'x' FROM 'xxxxxsome Xsxxxxx') = 'some Xs' AS `some Xs` --- !query 11 schema -struct --- !query 11 output -true - - --- !query 12 -SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS `34567890` --- !query 12 schema -struct<34567890:boolean> --- !query 12 output -true - - --- !query 13 -SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS `456` --- !query 13 schema -struct<456:boolean> --- !query 13 output -true - - --- !query 14 -SELECT POSITION('4' IN '1234567890') = '4' AS `4` --- !query 14 schema -struct<4:boolean> --- !query 14 output -true - - --- !query 15 -SELECT POSITION('5' IN '1234567890') = '5' AS `5` --- !query 15 schema -struct<5:boolean> --- !query 15 output -true - - --- !query 16 -SELECT OVERLAY('abcdef' PLACING '45' FROM 4) AS `abc45f` --- !query 16 schema -struct --- !query 16 output -abc45f - - --- !query 17 -SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5) 
AS `yabadaba` --- !query 17 schema -struct --- !query 17 output -yabadaba - - --- !query 18 -SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5 FOR 0) AS `yabadabadoo` --- !query 18 schema -struct --- !query 18 output -yabadabadoo - - --- !query 19 -SELECT OVERLAY('babosa' PLACING 'ubb' FROM 2 FOR 4) AS `bubba` --- !query 19 schema -struct --- !query 19 output -bubba - - --- !query 20 -SELECT 'hawkeye' LIKE 'h%' AS `true` --- !query 20 schema -struct --- !query 20 output -true - - --- !query 21 -SELECT 'hawkeye' NOT LIKE 'h%' AS `false` --- !query 21 schema -struct --- !query 21 output -false - - --- !query 22 -SELECT 'hawkeye' LIKE 'H%' AS `false` --- !query 22 schema -struct --- !query 22 output -false - - --- !query 23 -SELECT 'hawkeye' NOT LIKE 'H%' AS `true` --- !query 23 schema -struct --- !query 23 output -true - - --- !query 24 -SELECT 'hawkeye' LIKE 'indio%' AS `false` --- !query 24 schema -struct --- !query 24 output -false - - --- !query 25 -SELECT 'hawkeye' NOT LIKE 'indio%' AS `true` --- !query 25 schema -struct --- !query 25 output -true - - --- !query 26 -SELECT 'hawkeye' LIKE 'h%eye' AS `true` --- !query 26 schema -struct --- !query 26 output -true - - --- !query 27 -SELECT 'hawkeye' NOT LIKE 'h%eye' AS `false` --- !query 27 schema -struct --- !query 27 output -false - - --- !query 28 -SELECT 'indio' LIKE '_ndio' AS `true` --- !query 28 schema -struct --- !query 28 output -true - - --- !query 29 -SELECT 'indio' NOT LIKE '_ndio' AS `false` --- !query 29 schema -struct --- !query 29 output -false - - --- !query 30 -SELECT 'indio' LIKE 'in__o' AS `true` --- !query 30 schema -struct --- !query 30 output -true - - --- !query 31 -SELECT 'indio' NOT LIKE 'in__o' AS `false` --- !query 31 schema -struct --- !query 31 output -false - - --- !query 32 -SELECT 'indio' LIKE 'in_o' AS `false` --- !query 32 schema -struct --- !query 32 output -false - - --- !query 33 -SELECT 'indio' NOT LIKE 'in_o' AS `true` --- !query 33 schema -struct --- !query 33 output -true - - 
--- !query 34 -SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f --- !query 34 schema -struct --- !query 34 output -true true false - - --- !query 35 -SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f --- !query 35 schema -struct --- !query 35 output -true true false - - --- !query 36 -SELECT 'foo' LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f --- !query 36 schema -struct --- !query 36 output -true true false - - --- !query 37 -SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f --- !query 37 schema -struct --- !query 37 output -true true false - - --- !query 38 -SELECT 'jack' LIKE '%____%' AS t --- !query 38 schema -struct --- !query 38 output -true - - --- !query 39 -SELECT 'unknown' || ' and unknown' AS `Concat unknown types` --- !query 39 schema -struct --- !query 39 output -unknown and unknown - - --- !query 40 -SELECT string('text') || ' and unknown' AS `Concat text to unknown type` --- !query 40 schema -struct --- !query 40 output -text and unknown - - --- !query 41 -CREATE TABLE toasttest(f1 string) USING parquet --- !query 41 schema -struct<> --- !query 41 output - - - --- !query 42 -insert into toasttest values(repeat('1234567890',10000)) --- !query 42 schema -struct<> --- !query 42 output - - - --- !query 43 -insert into toasttest values(repeat('1234567890',10000)) --- !query 43 schema -struct<> --- !query 43 output - - - --- !query 44 -insert into toasttest values(repeat('1234567890',10000)) --- !query 44 schema -struct<> --- !query 44 output - - - --- !query 45 -insert into toasttest values(repeat('1234567890',10000)) --- !query 45 schema -struct<> --- !query 45 output - - - --- !query 46 -SELECT substr(f1, 99995) from toasttest --- !query 46 schema -struct --- !query 46 output -567890 -567890 -567890 -567890 - - --- !query 47 -SELECT substr(f1, 99995, 10) from toasttest --- !query 47 schema -struct --- !query 47 output -567890 -567890 -567890 -567890 - - --- !query 48 
-SELECT length('abcdef') AS `length_6` --- !query 48 schema -struct --- !query 48 output -6 - - --- !query 49 -SELECT position('cd', 'abcdef') AS `pos_3` --- !query 49 schema -struct --- !query 49 output -3 - - --- !query 50 -SELECT position('xy', 'abcdef') AS `pos_0` --- !query 50 schema -struct --- !query 50 output -0 - - --- !query 51 -SELECT replace('abcdef', 'de', '45') AS `abc45f` --- !query 51 schema -struct --- !query 51 output -abc45f - - --- !query 52 -SELECT replace('yabadabadoo', 'ba', '123') AS `ya123da123doo` --- !query 52 schema -struct --- !query 52 output -ya123da123doo - - --- !query 53 -SELECT replace('yabadoo', 'bad', '') AS `yaoo` --- !query 53 schema -struct --- !query 53 output -yaoo - - --- !query 54 -select hex(256*256*256 - 1) AS `ffffff` --- !query 54 schema -struct --- !query 54 output -FFFFFF - - --- !query 55 -select hex(bigint(bigint(bigint(bigint(256)*256)*256)*256) - 1) AS `ffffffff` --- !query 55 schema -struct --- !query 55 output -FFFFFFFF - - --- !query 56 -select md5('') = 'd41d8cd98f00b204e9800998ecf8427e' AS `TRUE` --- !query 56 schema -struct --- !query 56 output -true - - --- !query 57 -select md5('a') = '0cc175b9c0f1b6a831c399e269772661' AS `TRUE` --- !query 57 schema -struct --- !query 57 output -true - - --- !query 58 -select md5('abc') = '900150983cd24fb0d6963f7d28e17f72' AS `TRUE` --- !query 58 schema -struct --- !query 58 output -true - - --- !query 59 -select md5('message digest') = 'f96b697d7cb7938d525a2f31aaf161d0' AS `TRUE` --- !query 59 schema -struct --- !query 59 output -true - - --- !query 60 -select md5('abcdefghijklmnopqrstuvwxyz') = 'c3fcd3d76192e4007dfb496cca67e13b' AS `TRUE` --- !query 60 schema -struct --- !query 60 output -true - - --- !query 61 -select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') = 'd174ab98d277d9f5a5611c2c9f419d9f' AS `TRUE` --- !query 61 schema -struct --- !query 61 output -true - - --- !query 62 -select 
md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890') = '57edf4a22be3c955ac49da2e2107b67a' AS `TRUE` --- !query 62 schema -struct --- !query 62 output -true - - --- !query 63 -select md5(binary('')) = 'd41d8cd98f00b204e9800998ecf8427e' AS `TRUE` --- !query 63 schema -struct --- !query 63 output -true - - --- !query 64 -select md5(binary('a')) = '0cc175b9c0f1b6a831c399e269772661' AS `TRUE` --- !query 64 schema -struct --- !query 64 output -true - - --- !query 65 -select md5(binary('abc')) = '900150983cd24fb0d6963f7d28e17f72' AS `TRUE` --- !query 65 schema -struct --- !query 65 output -true - - --- !query 66 -select md5(binary('message digest')) = 'f96b697d7cb7938d525a2f31aaf161d0' AS `TRUE` --- !query 66 schema -struct --- !query 66 output -true - - --- !query 67 -select md5(binary('abcdefghijklmnopqrstuvwxyz')) = 'c3fcd3d76192e4007dfb496cca67e13b' AS `TRUE` --- !query 67 schema -struct --- !query 67 output -true - - --- !query 68 -select md5(binary('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')) = 'd174ab98d277d9f5a5611c2c9f419d9f' AS `TRUE` --- !query 68 schema -struct --- !query 68 output -true - - --- !query 69 -select md5(binary('12345678901234567890123456789012345678901234567890123456789012345678901234567890')) = '57edf4a22be3c955ac49da2e2107b67a' AS `TRUE` --- !query 69 schema -struct --- !query 69 output -true - - --- !query 70 -SELECT initcap('hi THOMAS') --- !query 70 schema -struct --- !query 70 output -Hi Thomas - - --- !query 71 -SELECT lpad('hi', 5, 'xy') --- !query 71 schema -struct --- !query 71 output -xyxhi - - --- !query 72 -SELECT lpad('hi', 5) --- !query 72 schema -struct --- !query 72 output - hi - - --- !query 73 -SELECT lpad('hi', -5, 'xy') --- !query 73 schema -struct --- !query 73 output - - - --- !query 74 -SELECT lpad('hello', 2) --- !query 74 schema -struct --- !query 74 output -he - - --- !query 75 -SELECT lpad('hi', 5, '') --- !query 75 schema -struct --- !query 75 output -hi - - --- 
!query 76 -SELECT rpad('hi', 5, 'xy') --- !query 76 schema -struct --- !query 76 output -hixyx - - --- !query 77 -SELECT rpad('hi', 5) --- !query 77 schema -struct --- !query 77 output -hi - - --- !query 78 -SELECT rpad('hi', -5, 'xy') --- !query 78 schema -struct --- !query 78 output - - - --- !query 79 -SELECT rpad('hello', 2) --- !query 79 schema -struct --- !query 79 output -he - - --- !query 80 -SELECT rpad('hi', 5, '') --- !query 80 schema -struct --- !query 80 output -hi - - --- !query 81 -SELECT ltrim('zzzytrim', 'xyz') --- !query 81 schema -struct --- !query 81 output -trim - - --- !query 82 -SELECT translate('', '14', 'ax') --- !query 82 schema -struct --- !query 82 output - - - --- !query 83 -SELECT translate('12345', '14', 'ax') --- !query 83 schema -struct --- !query 83 output -a23x5 - - --- !query 84 -SELECT ascii('x') --- !query 84 schema -struct --- !query 84 output -120 - - --- !query 85 -SELECT ascii('') --- !query 85 schema -struct --- !query 85 output -0 - - --- !query 86 -SELECT chr(65) --- !query 86 schema -struct --- !query 86 output -A - - --- !query 87 -SELECT chr(0) --- !query 87 schema -struct --- !query 87 output - - - --- !query 88 -SELECT repeat('Pg', 4) --- !query 88 schema -struct --- !query 88 output -PgPgPgPg - - --- !query 89 -SELECT repeat('Pg', -4) --- !query 89 schema -struct --- !query 89 output - - - --- !query 90 -SELECT trim(binary('\\000') from binary('\\000Tom\\000')) --- !query 90 schema -struct --- !query 90 output -Tom - - --- !query 91 -DROP TABLE toasttest --- !query 91 schema -struct<> --- !query 91 output - diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/timestamp.sql.out deleted file mode 100644 index 13a1d09b71b76..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/timestamp.sql.out +++ /dev/null @@ -1,138 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 15 - - --- !query 0 
-CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -INSERT INTO TIMESTAMP_TBL VALUES ('1997-01-02') --- !query 1 schema -struct<> --- !query 1 output - - - --- !query 2 -INSERT INTO TIMESTAMP_TBL VALUES ('1997-01-02 03:04:05') --- !query 2 schema -struct<> --- !query 2 output - - - --- !query 3 -INSERT INTO TIMESTAMP_TBL VALUES ('1997-02-10 17:32:01-08') --- !query 3 schema -struct<> --- !query 3 output - - - --- !query 4 -INSERT INTO TIMESTAMP_TBL VALUES ('2001-09-22T18:19:20') --- !query 4 schema -struct<> --- !query 4 output - - - --- !query 5 -SELECT '' AS `64`, d1 FROM TIMESTAMP_TBL --- !query 5 schema -struct<64:string,d1:timestamp> --- !query 5 output - 1997-01-02 00:00:00 - 1997-01-02 03:04:05 - 1997-02-10 17:32:01 - 2001-09-22 18:19:20 - - --- !query 6 -SELECT '' AS `48`, d1 FROM TIMESTAMP_TBL - WHERE d1 > timestamp '1997-01-02' --- !query 6 schema -struct<48:string,d1:timestamp> --- !query 6 output - 1997-01-02 03:04:05 - 1997-02-10 17:32:01 - 2001-09-22 18:19:20 - - --- !query 7 -SELECT '' AS `15`, d1 FROM TIMESTAMP_TBL - WHERE d1 < timestamp '1997-01-02' --- !query 7 schema -struct<15:string,d1:timestamp> --- !query 7 output - - - --- !query 8 -SELECT '' AS one, d1 FROM TIMESTAMP_TBL - WHERE d1 = timestamp '1997-01-02' --- !query 8 schema -struct --- !query 8 output - 1997-01-02 00:00:00 - - --- !query 9 -SELECT '' AS `63`, d1 FROM TIMESTAMP_TBL - WHERE d1 != timestamp '1997-01-02' --- !query 9 schema -struct<63:string,d1:timestamp> --- !query 9 output - 1997-01-02 03:04:05 - 1997-02-10 17:32:01 - 2001-09-22 18:19:20 - - --- !query 10 -SELECT '' AS `16`, d1 FROM TIMESTAMP_TBL - WHERE d1 <= timestamp '1997-01-02' --- !query 10 schema -struct<16:string,d1:timestamp> --- !query 10 output - 1997-01-02 00:00:00 - - --- !query 11 -SELECT '' AS `49`, d1 FROM TIMESTAMP_TBL - WHERE d1 >= timestamp '1997-01-02' --- !query 11 schema -struct<49:string,d1:timestamp> --- !query 11 output - 
1997-01-02 00:00:00 - 1997-01-02 03:04:05 - 1997-02-10 17:32:01 - 2001-09-22 18:19:20 - - --- !query 12 -SELECT '' AS date_trunc_week, date_trunc( 'week', timestamp '2004-02-29 15:44:17.71393' ) AS week_trunc --- !query 12 schema -struct --- !query 12 output - 2004-02-23 00:00:00 - - --- !query 13 -SELECT make_timestamp(2014,12,28,6,30,45.887) --- !query 13 schema -struct --- !query 13 output -2014-12-28 06:30:45.887 - - --- !query 14 -DROP TABLE TIMESTAMP_TBL --- !query 14 schema -struct<> --- !query 14 output - diff --git a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out index 9a8f783da4369..ac4e71e244bc0 100644 --- a/sql/core/src/test/resources/sql-tests/results/pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pivot.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 32 --- !query 0 +-- !query create temporary view courseSales as select * from values ("dotNET", 2012, 10000), ("Java", 2012, 20000), @@ -10,35 +10,35 @@ create temporary view courseSales as select * from values ("dotNET", 2013, 48000), ("Java", 2013, 30000) as courseSales(course, year, earnings) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view years as select * from values (2012, 1), (2013, 2) as years(y, s) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view yearsWithComplexTypes as select * from values (2012, array(1, 1), map('1', 1), struct(1, 'a')), (2013, array(2, 2), map('2', 2), struct(2, 'b')) as yearsWithComplexTypes(y, a, m, s) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -46,27 +46,27 @@ PIVOT ( sum(earnings) FOR course IN ('dotNET', 'Java') ) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query 
output 2012 15000 20000 2013 48000 30000 --- !query 4 +-- !query SELECT * FROM courseSales PIVOT ( sum(earnings) FOR year IN (2012, 2013) ) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output Java 20000 30000 dotNET 15000 48000 --- !query 5 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -74,14 +74,14 @@ PIVOT ( sum(earnings), avg(earnings) FOR course IN ('dotNET', 'Java') ) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 2012 15000 7500.0 20000 20000.0 2013 48000 48000.0 30000 30000.0 --- !query 6 +-- !query SELECT * FROM ( SELECT course, earnings FROM courseSales ) @@ -89,13 +89,13 @@ PIVOT ( sum(earnings) FOR course IN ('dotNET', 'Java') ) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 63000 50000 --- !query 7 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -103,13 +103,13 @@ PIVOT ( sum(earnings), min(year) FOR course IN ('dotNET', 'Java') ) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 63000 2012 50000 2012 --- !query 8 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -119,16 +119,16 @@ PIVOT ( sum(earnings) FOR s IN (1, 2) ) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output Java 2012 20000 NULL Java 2013 NULL 30000 dotNET 2012 15000 NULL dotNET 2013 NULL 48000 --- !query 9 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -138,14 +138,14 @@ PIVOT ( sum(earnings), min(s) FOR course IN ('dotNET', 'Java') ) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 2012 15000 1 20000 1 2013 48000 2 30000 2 --- !query 10 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -155,14 +155,14 @@ PIVOT ( sum(earnings * s) FOR course IN ('dotNET', 'Java') ) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- 
!query output 2012 15000 20000 2013 96000 60000 --- !query 11 +-- !query SELECT 2012_s, 2013_s, 2012_a, 2013_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) @@ -170,14 +170,14 @@ PIVOT ( sum(e) s, avg(e) a FOR y IN (2012, 2013) ) --- !query 11 schema +-- !query schema struct<2012_s:bigint,2013_s:bigint,2012_a:double,2013_a:double,c:string> --- !query 11 output +-- !query output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java --- !query 12 +-- !query SELECT firstYear_s, secondYear_s, firstYear_a, secondYear_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) @@ -185,27 +185,27 @@ PIVOT ( sum(e) s, avg(e) a FOR y IN (2012 as firstYear, 2013 secondYear) ) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java --- !query 13 +-- !query SELECT * FROM courseSales PIVOT ( abs(earnings) FOR year IN (2012, 2013) ) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate expression required for pivot, but 'coursesales.`earnings`' did not appear in any aggregate function.; --- !query 14 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -213,14 +213,14 @@ PIVOT ( sum(earnings), year FOR course IN ('dotNET', 'Java') ) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate expression required for pivot, but '__auto_generated_subquery_name.`year`' did not appear in any aggregate function.; --- !query 15 +-- !query SELECT * FROM ( SELECT course, earnings FROM courseSales ) @@ -228,14 +228,14 @@ PIVOT ( sum(earnings) FOR year IN (2012, 2013) ) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`year`' given input columns: [__auto_generated_subquery_name.course, 
__auto_generated_subquery_name.earnings]; line 4 pos 0 --- !query 16 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -243,14 +243,14 @@ PIVOT ( ceil(sum(earnings)), avg(earnings) + 1 as a1 FOR course IN ('dotNET', 'Java') ) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2012 15000 7501.0 20000 20001.0 2013 48000 48001.0 30000 30001.0 --- !query 17 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -258,14 +258,14 @@ PIVOT ( sum(avg(earnings)) FOR course IN ('dotNET', 'Java') ) --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; --- !query 18 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -275,14 +275,14 @@ PIVOT ( sum(earnings) FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 15000 NULL 2 NULL 30000 --- !query 19 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -292,14 +292,14 @@ PIVOT ( sum(earnings) FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) ) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 2012 NULL 20000 2013 48000 NULL --- !query 20 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -309,40 +309,40 @@ PIVOT ( sum(earnings) FOR (course, year) IN ('dotNET', 'Java') ) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot value 'dotNET': value data type string does not match pivot column data type struct; --- !query 21 +-- !query SELECT * FROM courseSales PIVOT ( sum(earnings) FOR year IN (s, 2013) ) --- !query 21 schema +-- 
!query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`s`' given input columns: [coursesales.course, coursesales.earnings, coursesales.year]; line 4 pos 15 --- !query 22 +-- !query SELECT * FROM courseSales PIVOT ( sum(earnings) FOR year IN (course, 2013) ) --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException Literal expressions required for pivot values, found 'course#x'; --- !query 23 +-- !query SELECT * FROM ( SELECT course, year, a FROM courseSales @@ -352,14 +352,14 @@ PIVOT ( min(a) FOR course IN ('dotNET', 'Java') ) --- !query 23 schema +-- !query schema struct,Java:array> --- !query 23 output +-- !query output 2012 [1,1] [1,1] 2013 [2,2] [2,2] --- !query 24 +-- !query SELECT * FROM ( SELECT course, year, y, a FROM courseSales @@ -369,14 +369,14 @@ PIVOT ( max(a) FOR (y, course) IN ((2012, 'dotNET'), (2013, 'Java')) ) --- !query 24 schema +-- !query schema struct,[2013, Java]:array> --- !query 24 output +-- !query output 2012 [1,1] NULL 2013 NULL [2,2] --- !query 25 +-- !query SELECT * FROM ( SELECT earnings, year, a FROM courseSales @@ -386,14 +386,14 @@ PIVOT ( sum(earnings) FOR a IN (array(1, 1), array(2, 2)) ) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 2012 35000 NULL 2013 NULL 78000 --- !query 26 +-- !query SELECT * FROM ( SELECT course, earnings, year, a FROM courseSales @@ -403,14 +403,14 @@ PIVOT ( sum(earnings) FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 2012 15000 NULL 2013 NULL 30000 --- !query 27 +-- !query SELECT * FROM ( SELECT earnings, year, s FROM courseSales @@ -420,14 +420,14 @@ PIVOT ( sum(earnings) FOR s IN ((1, 'a'), (2, 'b')) ) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 2012 35000 NULL 2013 NULL 78000 --- 
!query 28 +-- !query SELECT * FROM ( SELECT course, earnings, year, s FROM courseSales @@ -437,14 +437,14 @@ PIVOT ( sum(earnings) FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 2012 15000 NULL 2013 NULL 30000 --- !query 29 +-- !query SELECT * FROM ( SELECT earnings, year, m FROM courseSales @@ -454,14 +454,14 @@ PIVOT ( sum(earnings) FOR m IN (map('1', 1), map('2', 2)) ) --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot column 'm#x'. Pivot columns must be comparable.; --- !query 30 +-- !query SELECT * FROM ( SELECT course, earnings, year, m FROM courseSales @@ -471,14 +471,14 @@ PIVOT ( sum(earnings) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ) --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot column 'named_struct(course, course#x, m, m#x)'. 
Pivot columns must be comparable.; --- !query 31 +-- !query SELECT * FROM ( SELECT course, earnings, "a" as a, "z" as z, "b" as b, "y" as y, "c" as c, "x" as x, "d" as d, "w" as w FROM courseSales @@ -487,7 +487,7 @@ PIVOT ( sum(Earnings) FOR Course IN ('dotNET', 'Java') ) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output a z b y c x d w 63000 50000 diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out similarity index 73% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out index 29bafb42f579e..5efb58c7fc1b0 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out @@ -2,330 +2,330 @@ -- Number of queries: 44 --- !query 0 +-- !query SELECT avg(four) AS avg_1 FROM onek --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1.5 --- !query 1 +-- !query SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 32.666666666666664 --- !query 2 +-- !query select CAST(avg(b) AS Decimal(10,3)) AS avg_107_943 FROM aggtest --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 107.943 --- !query 3 +-- !query SELECT sum(four) AS sum_1500 FROM onek --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1500 --- !query 4 +-- !query SELECT sum(a) AS sum_198 FROM aggtest --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 198 --- !query 5 +-- !query SELECT sum(b) AS avg_431_773 FROM aggtest --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 431.77260909229517 --- 
!query 6 +-- !query SELECT max(four) AS max_3 FROM onek --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 3 --- !query 7 +-- !query SELECT max(a) AS max_100 FROM aggtest --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 100 --- !query 8 +-- !query SELECT max(aggtest.b) AS max_324_78 FROM aggtest --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 324.78 --- !query 9 +-- !query SELECT stddev_pop(b) FROM aggtest --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 131.10703231895047 --- !query 10 +-- !query SELECT stddev_samp(b) FROM aggtest --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 151.38936080399804 --- !query 11 +-- !query SELECT var_pop(b) FROM aggtest --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 17189.053923482323 --- !query 12 +-- !query SELECT var_samp(b) FROM aggtest --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 22918.738564643096 --- !query 13 +-- !query SELECT stddev_pop(CAST(b AS Decimal(38,0))) FROM aggtest --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 131.18117242958306 --- !query 14 +-- !query SELECT stddev_samp(CAST(b AS Decimal(38,0))) FROM aggtest --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 151.47497042966097 --- !query 15 +-- !query SELECT var_pop(CAST(b AS Decimal(38,0))) FROM aggtest --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 17208.5 --- !query 16 +-- !query SELECT var_samp(CAST(b AS Decimal(38,0))) FROM aggtest --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 22944.666666666668 --- !query 17 +-- !query SELECT var_pop(1.0), var_samp(2.0) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 0.0 NaN --- !query 18 +-- 
!query SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0))) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 0.0 NaN --- !query 19 +-- !query select sum(CAST(null AS int)) from range(1,4) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output NULL --- !query 20 +-- !query select sum(CAST(null AS long)) from range(1,4) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL --- !query 21 +-- !query select sum(CAST(null AS Decimal(38,0))) from range(1,4) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL --- !query 22 +-- !query select sum(CAST(null AS DOUBLE)) from range(1,4) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL --- !query 23 +-- !query select avg(CAST(null AS int)) from range(1,4) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output NULL --- !query 24 +-- !query select avg(CAST(null AS long)) from range(1,4) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output NULL --- !query 25 +-- !query select avg(CAST(null AS Decimal(38,0))) from range(1,4) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output NULL --- !query 26 +-- !query select avg(CAST(null AS DOUBLE)) from range(1,4) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output NULL --- !query 27 +-- !query select sum(CAST('NaN' AS DOUBLE)) from range(1,4) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output NaN --- !query 28 +-- !query select avg(CAST('NaN' AS DOUBLE)) from range(1,4) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output NaN --- !query 29 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES (CAST('1' AS DOUBLE)), (CAST('infinity' AS DOUBLE))) v(x) --- !query 29 
schema +-- !query schema struct --- !query 29 output +-- !query output Infinity NaN --- !query 30 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES ('infinity'), ('1')) v(x) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output Infinity NaN --- !query 31 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES ('infinity'), ('infinity')) v(x) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output Infinity NaN --- !query 32 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES ('-infinity'), ('infinity')) v(x) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output NaN NaN --- !query 33 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 1.00000005E8 2.5 --- !query 34 +-- !query SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE)) FROM (VALUES (7000000000005), (7000000000007)) v(x) --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 7.000000000006E12 1.0 --- !query 35 +-- !query SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 653.6289553875104 871.5052738500139 --- !query 36 +-- !query SELECT corr(b, a) FROM aggtest --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 0.1396345165178734 --- !query 37 +-- !query SELECT count(four) AS cnt_1000 FROM onek --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 1000 --- !query 38 +-- !query SELECT count(DISTINCT four) AS cnt_4 FROM onek --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 4 --- !query 39 +-- !query select ten, count(*), sum(four) from onek group 
by ten order by ten --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 0 100 100 1 100 200 2 100 100 @@ -338,12 +338,12 @@ struct 9 100 200 --- !query 40 +-- !query select ten, count(four), sum(DISTINCT four) from onek group by ten order by ten --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 0 100 2 1 100 4 2 100 2 @@ -356,13 +356,13 @@ struct 9 100 4 --- !query 41 +-- !query select ten, sum(distinct four) from onek a group by ten having exists (select 1 from onek b where sum(distinct a.four) = b.four) --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 0 2 2 2 4 2 @@ -370,14 +370,14 @@ struct 8 2 --- !query 42 +-- !query select ten, sum(distinct four) from onek a group by ten having exists (select 1 from onek b where sum(distinct a.four + b.four) = b.four) --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. 
@@ -385,12 +385,12 @@ Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; --- !query 43 +-- !query select (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))) from tenk1 o --- !query 43 schema +-- !query schema struct<> --- !query 43 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 63 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part2.sql.out new file mode 100644 index 0000000000000..6633bf5d114ed --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part2.sql.out @@ -0,0 +1,303 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 29 + + +-- !query +create temporary view int4_tbl as select * from values + (0), + (123456), + (-123456), + (2147483647), + (-2147483647) + as int4_tbl(f1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (3, 3, 3, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT BIT_AND(b1) AS n1, BIT_OR(b2) AS n2 FROM bitwise_test where 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query +SELECT BIT_AND(b4) AS n1, BIT_OR(b4) AS n2 FROM bitwise_test where b4 is null +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query +SELECT + BIT_AND(cast(b1 as tinyint)) AS a1, + BIT_AND(cast(b2 as smallint)) AS b1, + BIT_AND(b3) AS c1, + BIT_AND(b4) AS d1, + BIT_OR(cast(b1 as tinyint)) AS e7, + BIT_OR(cast(b2 as 
smallint)) AS f7, + BIT_OR(b3) AS g7, + BIT_OR(b4) AS h3 +FROM bitwise_test +-- !query schema +struct +-- !query output +1 1 1 1 7 7 7 3 + + +-- !query +SELECT b1 , bit_and(b2), bit_or(b4) FROM bitwise_test GROUP BY b1 +-- !query schema +struct +-- !query output +1 1 1 +3 3 NULL +7 7 3 + + +-- !query +SELECT b1, bit_and(b2) FROM bitwise_test GROUP BY b1 HAVING bit_and(b2) < 7 +-- !query schema +struct +-- !query output +1 1 +3 3 + + +-- !query +SELECT b1, b2, bit_and(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test +-- !query schema +struct +-- !query output +1 1 1 +3 3 3 +7 7 7 + + +-- !query +SELECT b1, b2, bit_or(b2) OVER (PARTITION BY b1 ORDER BY b2) FROM bitwise_test +-- !query schema +struct +-- !query output +1 1 1 +3 3 3 +7 7 7 + + +-- !query +SELECT + (NULL AND NULL) IS NULL AS `t`, + (TRUE AND NULL) IS NULL AS `t`, + (FALSE AND NULL) IS NULL AS `t`, + (NULL AND TRUE) IS NULL AS `t`, + (NULL AND FALSE) IS NULL AS `t`, + (TRUE AND TRUE) AS `t`, + NOT (TRUE AND FALSE) AS `t`, + NOT (FALSE AND TRUE) AS `t`, + NOT (FALSE AND FALSE) AS `t` +-- !query schema +struct +-- !query output +true true false true false true true true true + + +-- !query +SELECT + (NULL OR NULL) IS NULL AS `t`, + (TRUE OR NULL) IS NULL AS `t`, + (FALSE OR NULL) IS NULL AS `t`, + (NULL OR TRUE) IS NULL AS `t`, + (NULL OR FALSE) IS NULL AS `t`, + (TRUE OR TRUE) AS `t`, + (TRUE OR FALSE) AS `t`, + (FALSE OR TRUE) AS `t`, + NOT (FALSE OR FALSE) AS `t` +-- !query schema +struct +-- !query output +true false true false true true true true true + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES + (TRUE, null, FALSE, null), + (FALSE, TRUE, null, null), + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT BOOL_AND(b1) AS n1, BOOL_OR(b3) AS n2 FROM bool_test WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query +SELECT + BOOL_AND(b1) AS f1, + BOOL_AND(b2) 
AS t2, + BOOL_AND(b3) AS f3, + BOOL_AND(b4) AS n4, + BOOL_AND(NOT b2) AS f5, + BOOL_AND(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +false true false NULL false true + + +-- !query +SELECT + EVERY(b1) AS f1, + EVERY(b2) AS t2, + EVERY(b3) AS f3, + EVERY(b4) AS n4, + EVERY(NOT b2) AS f5, + EVERY(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +false true false NULL false true + + +-- !query +SELECT + BOOL_OR(b1) AS t1, + BOOL_OR(b2) AS t2, + BOOL_OR(b3) AS f3, + BOOL_OR(b4) AS n4, + BOOL_OR(NOT b2) AS f5, + BOOL_OR(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +true true false NULL false true + + +-- !query +select min(unique1) from tenk1 +-- !query schema +struct +-- !query output +0 + + +-- !query +select max(unique1) from tenk1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique1) from tenk1 where unique1 < 42 +-- !query schema +struct +-- !query output +41 + + +-- !query +select max(unique1) from tenk1 where unique1 > 42 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique1) from tenk1 where unique1 > 42000 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select max(tenthous) from tenk1 where thousand = 33 +-- !query schema +struct +-- !query output +9033 + + +-- !query +select min(tenthous) from tenk1 where thousand = 33 +-- !query schema +struct +-- !query output +33 + + +-- !query +select distinct max(unique2) from tenk1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique2) from tenk1 order by 1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique2) from tenk1 order by max(unique2) +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique2) from tenk1 order by max(unique2)+1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select t1.max_unique2, g from (select max(unique2) as max_unique2 FROM 
tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc +-- !query schema +struct +-- !query output +9999 3 +9999 2 +9999 1 + + +-- !query +select max(100) from tenk1 +-- !query schema +struct +-- !query output +100 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out new file mode 100644 index 0000000000000..69f96b02782e3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part3.sql.out @@ -0,0 +1,38 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 4 + + +-- !query +select max(min(unique1)) from tenk1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; + + +-- !query +select min(unique1) filter (where unique1 > 100) from tenk1 +-- !query schema +struct 100)):int> +-- !query output +101 + + +-- !query +select sum(1/ten) filter (where ten > 0) from tenk1 +-- !query schema +struct 0)):double> +-- !query output +2828.9682539682954 + + +-- !query +select (select count(*) + from (values (1)) t0(inner_c)) +from (values (2),(3)) t1(outer_c) +-- !query schema +struct +-- !query output +1 +1 diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out similarity index 100% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part4.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part4.sql.out diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/boolean.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/boolean.sql.out similarity index 63% rename from 
sql/core/src/test/resources/sql-tests/results/pgSQL/boolean.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/boolean.sql.out index 203806d43368a..0347e0dc7853b 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/boolean.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/boolean.sql.out @@ -2,475 +2,475 @@ -- Number of queries: 92 --- !query 0 +-- !query SELECT 1 AS one --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 --- !query 1 +-- !query SELECT true AS true --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output true --- !query 2 +-- !query SELECT false AS `false` --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output false --- !query 3 +-- !query SELECT boolean('t') AS true --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output true --- !query 4 +-- !query SELECT boolean(' f ') AS `false` --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output false --- !query 5 +-- !query SELECT boolean('true') AS true --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output true --- !query 6 +-- !query SELECT boolean('test') AS error --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL --- !query 7 +-- !query SELECT boolean('false') AS `false` --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output false --- !query 8 +-- !query SELECT boolean('foo') AS error --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL --- !query 9 +-- !query SELECT boolean('y') AS true --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output true --- !query 10 +-- !query SELECT boolean('yes') AS true --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output true --- !query 11 +-- !query SELECT boolean('yeah') AS error --- 
!query 11 schema +-- !query schema struct --- !query 11 output +-- !query output NULL --- !query 12 +-- !query SELECT boolean('n') AS `false` --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output false --- !query 13 +-- !query SELECT boolean('no') AS `false` --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output false --- !query 14 +-- !query SELECT boolean('nay') AS error --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output NULL --- !query 15 +-- !query SELECT boolean('on') AS true --- !query 15 schema +-- !query schema struct --- !query 15 output -true +-- !query output +NULL --- !query 16 +-- !query SELECT boolean('off') AS `false` --- !query 16 schema +-- !query schema struct --- !query 16 output -false +-- !query output +NULL --- !query 17 +-- !query SELECT boolean('of') AS `false` --- !query 17 schema +-- !query schema struct --- !query 17 output -false +-- !query output +NULL --- !query 18 +-- !query SELECT boolean('o') AS error --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output NULL --- !query 19 +-- !query SELECT boolean('on_') AS error --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output NULL --- !query 20 +-- !query SELECT boolean('off_') AS error --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL --- !query 21 +-- !query SELECT boolean('1') AS true --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output true --- !query 22 +-- !query SELECT boolean('11') AS error --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL --- !query 23 +-- !query SELECT boolean('0') AS `false` --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output false --- !query 24 +-- !query SELECT boolean('000') AS error --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query 
output NULL --- !query 25 +-- !query SELECT boolean('') AS error --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output NULL --- !query 26 +-- !query SELECT boolean('t') or boolean('f') AS true --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output true --- !query 27 +-- !query SELECT boolean('t') and boolean('f') AS `false` --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output false --- !query 28 +-- !query SELECT not boolean('f') AS true --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output true --- !query 29 +-- !query SELECT boolean('t') = boolean('f') AS `false` --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output false --- !query 30 +-- !query SELECT boolean('t') <> boolean('f') AS true --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output true --- !query 31 +-- !query SELECT boolean('t') > boolean('f') AS true --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output true --- !query 32 +-- !query SELECT boolean('t') >= boolean('f') AS true --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output true --- !query 33 +-- !query SELECT boolean('f') < boolean('t') AS true --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output true --- !query 34 +-- !query SELECT boolean('f') <= boolean('t') AS true --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output true --- !query 35 +-- !query SELECT boolean(string('TrUe')) AS true, boolean(string('fAlse')) AS `false` --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output true false --- !query 36 +-- !query SELECT boolean(string(' true ')) AS true, boolean(string(' FALSE')) AS `false` --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output true false --- !query 37 +-- 
!query SELECT string(boolean(true)) AS true, string(boolean(false)) AS `false` --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output true false --- !query 38 +-- !query SELECT boolean(string(' tru e ')) AS invalid --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output NULL --- !query 39 +-- !query SELECT boolean(string('')) AS invalid --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output NULL --- !query 40 +-- !query CREATE TABLE BOOLTBL1 (f1 boolean) USING parquet --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output --- !query 41 +-- !query INSERT INTO BOOLTBL1 VALUES (cast('t' as boolean)) --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output --- !query 42 +-- !query INSERT INTO BOOLTBL1 VALUES (cast('True' as boolean)) --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output --- !query 43 +-- !query INSERT INTO BOOLTBL1 VALUES (cast('true' as boolean)) --- !query 43 schema +-- !query schema struct<> --- !query 43 output +-- !query output --- !query 44 +-- !query SELECT '' AS t_3, BOOLTBL1.* FROM BOOLTBL1 --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output true true true --- !query 45 +-- !query SELECT '' AS t_3, BOOLTBL1.* FROM BOOLTBL1 WHERE f1 = boolean('true') --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output true true true --- !query 46 +-- !query SELECT '' AS t_3, BOOLTBL1.* FROM BOOLTBL1 WHERE f1 <> boolean('false') --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output true true true --- !query 47 +-- !query SELECT '' AS zero, BOOLTBL1.* FROM BOOLTBL1 WHERE booleq(boolean('false'), f1) --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output --- !query 48 +-- !query INSERT INTO BOOLTBL1 VALUES (boolean('f')) --- !query 48 schema +-- !query schema 
struct<> --- !query 48 output +-- !query output --- !query 49 +-- !query SELECT '' AS f_1, BOOLTBL1.* FROM BOOLTBL1 WHERE f1 = boolean('false') --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output false --- !query 50 +-- !query CREATE TABLE BOOLTBL2 (f1 boolean) USING parquet --- !query 50 schema +-- !query schema struct<> --- !query 50 output +-- !query output --- !query 51 +-- !query INSERT INTO BOOLTBL2 VALUES (boolean('f')) --- !query 51 schema +-- !query schema struct<> --- !query 51 output +-- !query output --- !query 52 +-- !query INSERT INTO BOOLTBL2 VALUES (boolean('false')) --- !query 52 schema +-- !query schema struct<> --- !query 52 output +-- !query output --- !query 53 +-- !query INSERT INTO BOOLTBL2 VALUES (boolean('False')) --- !query 53 schema +-- !query schema struct<> --- !query 53 output +-- !query output --- !query 54 +-- !query INSERT INTO BOOLTBL2 VALUES (boolean('FALSE')) --- !query 54 schema +-- !query schema struct<> --- !query 54 output +-- !query output --- !query 55 +-- !query INSERT INTO BOOLTBL2 VALUES (boolean('XXX')) --- !query 55 schema +-- !query schema struct<> --- !query 55 output +-- !query output --- !query 56 +-- !query SELECT '' AS f_4, BOOLTBL2.* FROM BOOLTBL2 --- !query 56 schema +-- !query schema struct --- !query 56 output +-- !query output NULL false false @@ -478,13 +478,13 @@ struct false --- !query 57 +-- !query SELECT '' AS tf_12, BOOLTBL1.*, BOOLTBL2.* FROM BOOLTBL1, BOOLTBL2 WHERE BOOLTBL2.f1 <> BOOLTBL1.f1 --- !query 57 schema +-- !query schema struct --- !query 57 output +-- !query output true false true false true false @@ -499,13 +499,13 @@ struct true false --- !query 58 +-- !query SELECT '' AS tf_12, BOOLTBL1.*, BOOLTBL2.* FROM BOOLTBL1, BOOLTBL2 WHERE boolne(BOOLTBL2.f1,BOOLTBL1.f1) --- !query 58 schema +-- !query schema struct --- !query 58 output +-- !query output true false true false true false @@ -520,27 +520,27 @@ struct true false --- !query 59 +-- !query SELECT '' AS 
ff_4, BOOLTBL1.*, BOOLTBL2.* FROM BOOLTBL1, BOOLTBL2 WHERE BOOLTBL2.f1 = BOOLTBL1.f1 and BOOLTBL1.f1 = boolean('false') --- !query 59 schema +-- !query schema struct --- !query 59 output +-- !query output false false false false false false false false --- !query 60 +-- !query SELECT '' AS tf_12_ff_4, BOOLTBL1.*, BOOLTBL2.* FROM BOOLTBL1, BOOLTBL2 WHERE BOOLTBL2.f1 = BOOLTBL1.f1 or BOOLTBL1.f1 = boolean('true') ORDER BY BOOLTBL1.f1, BOOLTBL2.f1 --- !query 60 schema +-- !query schema struct --- !query 60 output +-- !query output false false false false false false @@ -562,90 +562,90 @@ struct true false --- !query 61 +-- !query SELECT '' AS True, f1 FROM BOOLTBL1 WHERE f1 IS TRUE --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output true true true --- !query 62 +-- !query SELECT '' AS `Not False`, f1 FROM BOOLTBL1 WHERE f1 IS NOT FALSE --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output true true true --- !query 63 +-- !query SELECT '' AS `False`, f1 FROM BOOLTBL1 WHERE f1 IS FALSE --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output false --- !query 64 +-- !query SELECT '' AS `Not True`, f1 FROM BOOLTBL1 WHERE f1 IS NOT TRUE --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output false --- !query 65 +-- !query SELECT '' AS `True`, f1 FROM BOOLTBL2 WHERE f1 IS TRUE --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output --- !query 66 +-- !query SELECT '' AS `Not False`, f1 FROM BOOLTBL2 WHERE f1 IS NOT FALSE --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output NULL --- !query 67 +-- !query SELECT '' AS `False`, f1 FROM BOOLTBL2 WHERE f1 IS FALSE --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output false false false false --- !query 68 +-- !query SELECT '' AS `Not True`, f1 FROM BOOLTBL2 WHERE f1 IS NOT TRUE --- !query 68 schema +-- !query schema struct 
--- !query 68 output +-- !query output NULL false false @@ -653,39 +653,39 @@ struct false --- !query 69 +-- !query CREATE TABLE BOOLTBL3 (d string, b boolean, o int) USING parquet --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output --- !query 70 +-- !query INSERT INTO BOOLTBL3 VALUES ('true', true, 1) --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output --- !query 71 +-- !query INSERT INTO BOOLTBL3 VALUES ('false', false, 2) --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output --- !query 72 +-- !query INSERT INTO BOOLTBL3 VALUES ('null', null, 3) --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output --- !query 73 +-- !query SELECT d, b IS TRUE AS istrue, @@ -695,153 +695,153 @@ SELECT b IS UNKNOWN AS isunknown, b IS NOT UNKNOWN AS isnotunknown FROM booltbl3 ORDER BY o --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output true true false false true false true false false true true false false true null false true false true true false --- !query 74 +-- !query CREATE TABLE booltbl4(isfalse boolean, istrue boolean, isnul boolean) USING parquet --- !query 74 schema +-- !query schema struct<> --- !query 74 output +-- !query output --- !query 75 +-- !query INSERT INTO booltbl4 VALUES (false, true, null) --- !query 75 schema +-- !query schema struct<> --- !query 75 output +-- !query output --- !query 76 +-- !query SELECT istrue AND isnul AND istrue FROM booltbl4 --- !query 76 schema +-- !query schema struct<((istrue AND isnul) AND istrue):boolean> --- !query 76 output +-- !query output NULL --- !query 77 +-- !query SELECT istrue AND istrue AND isnul FROM booltbl4 --- !query 77 schema +-- !query schema struct<((istrue AND istrue) AND isnul):boolean> --- !query 77 output +-- !query output NULL --- !query 78 +-- !query SELECT isnul AND istrue AND istrue FROM booltbl4 --- !query 78 schema +-- !query schema 
struct<((isnul AND istrue) AND istrue):boolean> --- !query 78 output +-- !query output NULL --- !query 79 +-- !query SELECT isfalse AND isnul AND istrue FROM booltbl4 --- !query 79 schema +-- !query schema struct<((isfalse AND isnul) AND istrue):boolean> --- !query 79 output +-- !query output false --- !query 80 +-- !query SELECT istrue AND isfalse AND isnul FROM booltbl4 --- !query 80 schema +-- !query schema struct<((istrue AND isfalse) AND isnul):boolean> --- !query 80 output +-- !query output false --- !query 81 +-- !query SELECT isnul AND istrue AND isfalse FROM booltbl4 --- !query 81 schema +-- !query schema struct<((isnul AND istrue) AND isfalse):boolean> --- !query 81 output +-- !query output false --- !query 82 +-- !query SELECT isfalse OR isnul OR isfalse FROM booltbl4 --- !query 82 schema +-- !query schema struct<((isfalse OR isnul) OR isfalse):boolean> --- !query 82 output +-- !query output NULL --- !query 83 +-- !query SELECT isfalse OR isfalse OR isnul FROM booltbl4 --- !query 83 schema +-- !query schema struct<((isfalse OR isfalse) OR isnul):boolean> --- !query 83 output +-- !query output NULL --- !query 84 +-- !query SELECT isnul OR isfalse OR isfalse FROM booltbl4 --- !query 84 schema +-- !query schema struct<((isnul OR isfalse) OR isfalse):boolean> --- !query 84 output +-- !query output NULL --- !query 85 +-- !query SELECT isfalse OR isnul OR istrue FROM booltbl4 --- !query 85 schema +-- !query schema struct<((isfalse OR isnul) OR istrue):boolean> --- !query 85 output +-- !query output true --- !query 86 +-- !query SELECT istrue OR isfalse OR isnul FROM booltbl4 --- !query 86 schema +-- !query schema struct<((istrue OR isfalse) OR isnul):boolean> --- !query 86 output +-- !query output true --- !query 87 +-- !query SELECT isnul OR istrue OR isfalse FROM booltbl4 --- !query 87 schema +-- !query schema struct<((isnul OR istrue) OR isfalse):boolean> --- !query 87 output +-- !query output true --- !query 88 +-- !query DROP TABLE BOOLTBL1 --- !query 88 
schema +-- !query schema struct<> --- !query 88 output +-- !query output --- !query 89 +-- !query DROP TABLE BOOLTBL2 --- !query 89 schema +-- !query schema struct<> --- !query 89 output +-- !query output --- !query 90 +-- !query DROP TABLE BOOLTBL3 --- !query 90 schema +-- !query schema struct<> --- !query 90 output +-- !query output --- !query 91 +-- !query DROP TABLE BOOLTBL4 --- !query 91 schema +-- !query schema struct<> --- !query 91 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/case.sql.out similarity index 68% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/case.sql.out index 348198b060238..1b002c3f48ae2 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/case.sql.out @@ -2,243 +2,243 @@ -- Number of queries: 35 --- !query 0 +-- !query CREATE TABLE CASE_TBL ( i integer, f double ) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TABLE CASE2_TBL ( i integer, j integer ) USING parquet --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO CASE_TBL VALUES (1, 10.1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO CASE_TBL VALUES (2, 20.2) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO CASE_TBL VALUES (3, -30.3) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO CASE_TBL VALUES (4, NULL) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO 
CASE2_TBL VALUES (1, -1) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO CASE2_TBL VALUES (2, -2) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO CASE2_TBL VALUES (3, -3) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO CASE2_TBL VALUES (2, -4) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO CASE2_TBL VALUES (1, NULL) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query INSERT INTO CASE2_TBL VALUES (NULL, -6) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query SELECT '3' AS `One`, CASE WHEN 1 < 2 THEN 3 END AS `Simple WHEN` --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 3 3 --- !query 13 +-- !query SELECT '' AS `One`, CASE WHEN 1 > 2 THEN 3 END AS `Simple default` --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output NULL --- !query 14 +-- !query SELECT '3' AS `One`, CASE WHEN 1 < 2 THEN 3 ELSE 4 END AS `Simple ELSE` --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 3 3 --- !query 15 +-- !query SELECT '4' AS `One`, CASE WHEN 1 > 2 THEN 3 ELSE 4 END AS `ELSE default` --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 4 4 --- !query 16 +-- !query SELECT '6' AS `One`, CASE WHEN 1 > 2 THEN 3 WHEN 4 < 5 THEN 6 ELSE 7 END AS `Two WHEN with default` --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 6 6 --- !query 17 +-- !query SELECT '7' AS `None`, CASE WHEN rand() < 0 THEN 1 END AS `NULL on no matches` --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 7 NULL --- 
!query 18 +-- !query SELECT CASE WHEN 1=0 THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END --- !query 18 schema -struct --- !query 18 output -1 +-- !query schema +struct +-- !query output +1.0 --- !query 19 +-- !query SELECT CASE 1 WHEN 0 THEN 1/0 WHEN 1 THEN 1 ELSE 2/0 END --- !query 19 schema -struct --- !query 19 output -1 +-- !query schema +struct +-- !query output +1.0 --- !query 20 +-- !query SELECT CASE WHEN i > 100 THEN 1/0 ELSE 0 END FROM case_tbl --- !query 20 schema -struct 100) THEN (1 div 0) ELSE 0 END:int> --- !query 20 output -0 -0 -0 -0 +-- !query schema +struct 100) THEN (CAST(1 AS DOUBLE) / CAST(0 AS DOUBLE)) ELSE CAST(0 AS DOUBLE) END:double> +-- !query output +0.0 +0.0 +0.0 +0.0 --- !query 21 +-- !query SELECT CASE 'a' WHEN 'a' THEN 1 ELSE 2 END --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 --- !query 22 +-- !query SELECT '' AS `Five`, CASE WHEN i >= 3 THEN i END AS `>= 3 or Null` FROM CASE_TBL --- !query 22 schema +-- !query schema struct= 3 or Null:int> --- !query 22 output +-- !query output 3 4 NULL NULL --- !query 23 +-- !query SELECT '' AS `Five`, CASE WHEN i >= 3 THEN (i + i) ELSE i END AS `Simplest Math` FROM CASE_TBL --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 1 2 6 8 --- !query 24 +-- !query SELECT '' AS `Five`, i AS `Value`, CASE WHEN (i < 0) THEN 'small' WHEN (i = 0) THEN 'zero' @@ -247,16 +247,16 @@ SELECT '' AS `Five`, i AS `Value`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1 one 2 two 3 big 4 big --- !query 25 +-- !query SELECT '' AS `Five`, CASE WHEN ((i < 0) or (i < 0)) THEN 'small' WHEN ((i = 0) or (i = 0)) THEN 'zero' @@ -265,37 +265,37 @@ SELECT '' AS `Five`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output big big one two --- !query 26 +-- !query SELECT * FROM CASE_TBL WHERE COALESCE(f,i) = 4 
--- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 4 NULL --- !query 27 +-- !query SELECT * FROM CASE_TBL WHERE NULLIF(f,i) = 2 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output --- !query 28 +-- !query SELECT COALESCE(a.f, b.i, b.j) FROM CASE_TBL a, CASE2_TBL b --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output -30.3 -30.3 -30.3 @@ -322,24 +322,24 @@ struct 3.0 --- !query 29 +-- !query SELECT * FROM CASE_TBL a, CASE2_TBL b WHERE COALESCE(a.f, b.i, b.j) = 2 --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 4 NULL 2 -2 4 NULL 2 -4 --- !query 30 +-- !query SELECT '' AS Five, NULLIF(a.i,b.i) AS `NULLIF(a.i,b.i)`, NULLIF(b.i, 4) AS `NULLIF(b.i,4)` FROM CASE_TBL a, CASE2_TBL b --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1 2 1 2 1 3 @@ -366,18 +366,18 @@ struct NULL 3 --- !query 31 +-- !query SELECT '' AS `Two`, * FROM CASE_TBL a, CASE2_TBL b WHERE COALESCE(f,b.i) = 2 --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 4 NULL 2 -2 4 NULL 2 -4 --- !query 32 +-- !query SELECT CASE (CASE vol('bar') WHEN 'foo' THEN 'it was foo!' @@ -387,23 +387,23 @@ SELECT CASE WHEN 'it was foo!' THEN 'foo recognized' WHEN 'it was bar!' 
THEN 'bar recognized' ELSE 'unrecognized' END --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output bar recognized --- !query 33 +-- !query DROP TABLE CASE_TBL --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output --- !query 34 +-- !query DROP TABLE CASE2_TBL --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out new file mode 100644 index 0000000000000..637c5561bd940 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/comments.sql.out @@ -0,0 +1,115 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 7 + + +-- !query +SELECT 'trailing' AS first +-- !query schema +struct +-- !query output +trailing + + +-- !query +SELECT /* embedded single line */ 'embedded' AS `second` +-- !query schema +struct +-- !query output +embedded + + +-- !query +SELECT /* both embedded and trailing single line */ 'both' AS third +-- !query schema +struct +-- !query output +both + + +-- !query +SELECT 'before multi-line' AS fourth +-- !query schema +struct +-- !query output +before multi-line + + +-- !query +/* This is an example of SQL which should not execute: + * select 'multi-line'; + */ +SELECT 'after multi-line' AS fifth +-- !query schema +struct +-- !query output +after multi-line + + +-- !query +/* +SELECT 'trailing' as x1; -- inside block comment +*/ + +/* This block comment surrounds a query which itself has a block comment... +SELECT /* embedded single line */ 'embedded' AS x2; +*/ + +SELECT -- continued after the following block comments... +/* Deeply nested comment. + This includes a single apostrophe to make sure we aren't decoding this part as a string. +SELECT 'deep nest' AS n1; +/* Second level of nesting... +SELECT 'deeper nest' as n2; +/* Third level of nesting... 
+SELECT 'deepest nest' as n3; +*/ +Hoo boy. Still two deep... +*/ +Now just one deep... +*/ +'deeply nested example' AS sixth +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input ''embedded'' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 6, pos 34) + +== SQL == +/* +SELECT 'trailing' as x1; -- inside block comment +*/ + +/* This block comment surrounds a query which itself has a block comment... +SELECT /* embedded single line */ 'embedded' AS x2; +----------------------------------^^^ +*/ + +SELECT -- continued after the following block comments... +/* Deeply nested comment. + This includes a single apostrophe to make sure we aren't decoding this part as a string. +SELECT 'deep nest' AS n1; +/* Second level of nesting... +SELECT 'deeper nest' as n2; +/* Third level of nesting... +SELECT 'deepest nest' as n3; +*/ +Hoo boy. Still two deep... +*/ +Now just one deep... 
+*/ +'deeply nested example' AS sixth + + +-- !query +/* and this is the end of the file */ +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +mismatched input '' expecting {'(', 'ADD', 'ALTER', 'ANALYZE', 'CACHE', 'CLEAR', 'COMMENT', 'COMMIT', 'CREATE', 'DELETE', 'DESC', 'DESCRIBE', 'DFS', 'DROP', 'EXPLAIN', 'EXPORT', 'FROM', 'GRANT', 'IMPORT', 'INSERT', 'LIST', 'LOAD', 'LOCK', 'MAP', 'MERGE', 'MSCK', 'REDUCE', 'REFRESH', 'REPLACE', 'RESET', 'REVOKE', 'ROLLBACK', 'SELECT', 'SET', 'SHOW', 'START', 'TABLE', 'TRUNCATE', 'UNCACHE', 'UNLOCK', 'UPDATE', 'USE', 'VALUES', 'WITH'}(line 1, pos 37) + +== SQL == +/* and this is the end of the file */ +-------------------------------------^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out new file mode 100644 index 0000000000000..436b33ce43980 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/create_view.sql.out @@ -0,0 +1,2047 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 195 + + +-- !query +CREATE TABLE emp ( + name string, + age int, + salary int, + manager string +) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW toyemp AS + SELECT name, age, /* location ,*/ 12*salary AS annualsal + FROM emp +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW toyemp +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE emp +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE view_base_table (key int /* PRIMARY KEY */, data varchar(20)) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW key_dependent_view AS + SELECT * FROM view_base_table GROUP BY key +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +expression 'default.view_base_table.`data`' is 
neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; + + +-- !query +CREATE VIEW key_dependent_view_no_cols AS + SELECT FROM view_base_table GROUP BY key HAVING length(data) > 0 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'FROM'(line 2, pos 10) + +== SQL == +CREATE VIEW key_dependent_view_no_cols AS + SELECT FROM view_base_table GROUP BY key HAVING length(data) > 0 +----------^^^ + + +-- !query +CREATE TABLE viewtest_tbl (a int, b int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO viewtest_tbl VALUES (5, 10), (10, 15), (15, 20), (20, 25) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT * FROM viewtest_tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT * FROM viewtest_tbl WHERE a > 10 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM viewtest +-- !query schema +struct +-- !query output +15 20 +20 25 + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT a, b FROM viewtest_tbl WHERE a > 5 ORDER BY b DESC +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM viewtest +-- !query schema +struct +-- !query output +20 25 +15 20 +10 15 + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT a FROM viewtest_tbl WHERE a <> 20 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT 1, * FROM viewtest_tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT a, decimal(b) FROM viewtest_tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE VIEW viewtest AS + SELECT a, b, 0 AS c FROM viewtest_tbl +-- !query schema +struct<> +-- 
!query output + + + +-- !query +DROP VIEW viewtest +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE viewtest_tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE SCHEMA temp_view_test +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE temp_view_test.base_table (a int, id int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE temp_view_test.base_table2 (a int, id int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +USE temp_view_test +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMPORARY VIEW temp_table AS SELECT * FROM VALUES + (1, 1) as temp_table(a, id) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW v1 AS SELECT * FROM base_table +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v1 +-- !query schema +struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v1 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table +View Original Text SELECT * FROM base_table +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v1_temp AS SELECT * FROM temp_table +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v1_temp` by referencing a temporary view temp_table. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE TEMP VIEW v2_temp AS SELECT * FROM base_table +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v2_temp +-- !query schema +struct +-- !query output +a int +id int + + +-- !query +CREATE VIEW temp_view_test.v2 AS SELECT * FROM base_table +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED temp_view_test.v2 +-- !query schema +struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v2 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table +View Original Text SELECT * FROM base_table +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW temp_view_test.v3_temp AS SELECT * FROM temp_table +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temp_view_test`.`v3_temp` by referencing a temporary view temp_table. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v3 AS + SELECT t1.a AS t1_a, t2.a AS t2_a + FROM base_table t1, base_table2 t2 + WHERE t1.id = t2.id +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v3 +-- !query schema +struct +-- !query output +t1_a int +t2_a int + +# Detailed Table Information +Database temp_view_test +Table v3 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT t1.a AS t1_a, t2.a AS t2_a + FROM base_table t1, base_table2 t2 + WHERE t1.id = t2.id +View Original Text SELECT t1.a AS t1_a, t2.a AS t2_a + FROM base_table t1, base_table2 t2 + WHERE t1.id = t2.id +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [t1_a, t2_a] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=t1_a, view.query.out.numCols=2, view.query.out.col.1=t2_a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v4_temp AS + SELECT t1.a AS t1_a, t2.a AS t2_a + FROM base_table t1, temp_table t2 + WHERE t1.id = t2.id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v4_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v5_temp AS + SELECT t1.a AS t1_a, t2.a AS t2_a, t3.a AS t3_a + FROM base_table t1, base_table2 t2, temp_table t3 + WHERE t1.id = t2.id and t2.id = t3.id +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v5_temp` by referencing a temporary view temp_table. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v4 AS SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v4 +-- !query schema +struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v4 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) +View Original Text SELECT * FROM base_table WHERE id IN (SELECT id FROM base_table2) +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v5 AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v5 +-- !query schema +struct +-- !query output +id int +a int + +# Detailed Table Information +Database temp_view_test +Table v5 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 +View Original Text SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM base_table2) t2 +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [id, a] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=id, view.query.out.numCols=2, view.query.out.col.1=a, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v6 AS SELECT * FROM base_table WHERE 
EXISTS (SELECT 1 FROM base_table2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v6 +-- !query schema +struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v6 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) +View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM base_table2) +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v7 AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v7 +-- !query schema +struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v7 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) +View Original Text SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM base_table2) +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v8 AS SELECT * FROM base_table WHERE EXISTS (SELECT 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED v8 +-- !query schema 
+struct +-- !query output +a int +id int + +# Detailed Table Information +Database temp_view_test +Table v8 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) +View Original Text SELECT * FROM base_table WHERE EXISTS (SELECT 1) +View Catalog and Namespace spark_catalog.temp_view_test +View Query Output Columns [a, id] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=id, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=temp_view_test] + + +-- !query +CREATE VIEW v6_temp AS SELECT * FROM base_table WHERE id IN (SELECT id FROM temp_table) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v6_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v7_temp AS SELECT t1.id, t2.a FROM base_table t1, (SELECT * FROM temp_table) t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v7_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v8_temp AS SELECT * FROM base_table WHERE EXISTS (SELECT 1 FROM temp_table) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v8_temp` by referencing a temporary view temp_table. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v9_temp AS SELECT * FROM base_table WHERE NOT EXISTS (SELECT 1 FROM temp_table) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `v9_temp` by referencing a temporary view temp_table. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW v10_temp AS SELECT * FROM v7_temp +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: v7_temp; line 1 pos 38 + + +-- !query +CREATE VIEW v11_temp AS SELECT t1.id, t2.a FROM base_table t1, v10_temp t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: v10_temp; line 1 pos 63 + + +-- !query +CREATE VIEW v12_temp AS SELECT true FROM v11_temp +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: v11_temp; line 1 pos 41 + + +-- !query +CREATE SCHEMA testviewschm2 +-- !query schema +struct<> +-- !query output + + + +-- !query +USE testviewschm2 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t1 (num int, name string) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE t2 (num2 int, value string) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TEMP VIEW tt AS SELECT * FROM VALUES + (1, 'a') AS tt(num2, value) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW nontemp1 AS SELECT * FROM t1 CROSS JOIN t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED nontemp1 +-- !query schema +struct +-- !query output +num int +name string +num2 int +value string + +# Detailed Table Information +Database testviewschm2 +Table nontemp1 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not 
included in comparison] +Type VIEW +View Text SELECT * FROM t1 CROSS JOIN t2 +View Original Text SELECT * FROM t1 CROSS JOIN t2 +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [num, name, num2, value] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] + + +-- !query +CREATE VIEW temporal1 AS SELECT * FROM t1 CROSS JOIN tt +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temporal1` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW nontemp2 AS SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED nontemp2 +-- !query schema +struct +-- !query output +num int +name string +num2 int +value string + +# Detailed Table Information +Database testviewschm2 +Table nontemp2 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 +View Original Text SELECT * FROM t1 INNER JOIN t2 ON t1.num = t2.num2 +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [num, name, num2, value] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] + + +-- !query +CREATE VIEW temporal2 AS SELECT * FROM t1 INNER JOIN tt ON t1.num = tt.num2 +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temporal2` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW nontemp3 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED nontemp3 +-- !query schema +struct +-- !query output +num int +name string +num2 int +value string + +# Detailed Table Information +Database testviewschm2 +Table nontemp3 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 +View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [num, name, num2, value] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] + + +-- !query +CREATE VIEW temporal3 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num = tt.num2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temporal3` by referencing a temporary view tt. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW nontemp4 AS SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED nontemp4 +-- !query schema +struct +-- !query output +num int +name string +num2 int +value string + +# Detailed Table Information +Database testviewschm2 +Table nontemp4 +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' +View Original Text SELECT * FROM t1 LEFT JOIN t2 ON t1.num = t2.num2 AND t2.value = 'xxx' +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [num, name, num2, value] +Table Properties [view.query.out.col.3=value, view.catalogAndNamespace.numParts=2, view.query.out.col.0=num, view.query.out.numCols=4, view.query.out.col.1=name, view.catalogAndNamespace.part.0=spark_catalog, view.query.out.col.2=num2, view.catalogAndNamespace.part.1=testviewschm2] + + +-- !query +CREATE VIEW temporal4 AS SELECT * FROM t1 LEFT JOIN tt ON t1.num = tt.num2 AND tt.value = 'xxx' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temporal4` by referencing a temporary view tt. Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE VIEW temporal5 AS SELECT * FROM t1 WHERE num IN (SELECT num FROM t1 WHERE EXISTS (SELECT 1 FROM tt)) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Not allowed to create a permanent view `temporal5` by referencing a temporary view tt. 
Please create a temp view instead by CREATE TEMP VIEW; + + +-- !query +CREATE TABLE tbl1 ( a int, b int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tbl2 (c int, d int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tbl3 (e int, f int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tbl4 (g int, h int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tmptbl (i int, j int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO tmptbl VALUES (1, 1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW pubview AS SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED pubview +-- !query schema +struct +-- !query output +a int +b int + +# Detailed Table Information +Database testviewschm2 +Table pubview +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +View Original Text SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [a, b] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] + + +-- 
!query +CREATE VIEW mytempview AS SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE EXTENDED mytempview +-- !query schema +struct +-- !query output +a int +b int + +# Detailed Table Information +Database testviewschm2 +Table mytempview +Created Time [not included in comparison] +Last Access [not included in comparison] +Created By [not included in comparison] +Type VIEW +View Text SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) +View Original Text SELECT * FROM tbl1 WHERE tbl1.a +BETWEEN (SELECT d FROM tbl2 WHERE c = 1) AND (SELECT e FROM tbl3 WHERE f = 2) +AND EXISTS (SELECT g FROM tbl4 LEFT JOIN tbl3 ON tbl4.h = tbl3.f) +AND NOT EXISTS (SELECT g FROM tbl4 LEFT JOIN tmptbl ON tbl4.h = tmptbl.j) +View Catalog and Namespace spark_catalog.testviewschm2 +View Query Output Columns [a, b] +Table Properties [view.catalogAndNamespace.numParts=2, view.query.out.col.0=a, view.query.out.numCols=2, view.query.out.col.1=b, view.catalogAndNamespace.part.0=spark_catalog, view.catalogAndNamespace.part.1=testviewschm2] + + +-- !query +CREATE VIEW tt1 AS + SELECT * FROM ( + VALUES + ('abc', '0123456789', 42, 'abcd'), + ('0123456789', 'abc', 42.12, 'abc') + ) vv(a,b,c,d) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM tt1 +-- !query schema +struct +-- !query output +0123456789 abc 42.12 abc +abc 0123456789 42.00 abcd + + +-- !query +SELECT string(a) FROM tt1 +-- !query schema +struct +-- !query output +0123456789 +abc + + +-- !query +DROP VIEW tt1 +-- !query schema +struct<> +-- 
!query output + + + +-- !query +CREATE TABLE tt1 (f1 int, f2 int, f3 string) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tx1 (x1 int, x2 int, x3 string) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE temp_view_test.tt1 (y1 int, f2 int, f3 string) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW aliased_view_1 AS + select * from tt1 + where exists (select 1 from tx1 where tt1.f1 = tx1.x1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW aliased_view_2 AS + select * from tt1 a1 + where exists (select 1 from tx1 where a1.f1 = tx1.x1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW aliased_view_3 AS + select * from tt1 + where exists (select 1 from tx1 a2 where tt1.f1 = a2.x1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW aliased_view_4 AS + select * from temp_view_test.tt1 + where exists (select 1 from tt1 where temp_view_test.tt1.y1 = tt1.f1) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE aliased_view_1 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_2 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_3 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_4 +-- !query schema +struct +-- !query output +y1 int +f2 int +f3 string + + +-- !query +ALTER TABLE tx1 RENAME TO a1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE aliased_view_1 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_2 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_3 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 
string + + +-- !query +DESC TABLE aliased_view_4 +-- !query schema +struct +-- !query output +y1 int +f2 int +f3 string + + +-- !query +ALTER TABLE tt1 RENAME TO a2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE aliased_view_1 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_2 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_3 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_4 +-- !query schema +struct +-- !query output +y1 int +f2 int +f3 string + + +-- !query +ALTER TABLE a1 RENAME TO tt1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE aliased_view_1 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_2 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_3 +-- !query schema +struct +-- !query output +f1 int +f2 int +f3 string + + +-- !query +DESC TABLE aliased_view_4 +-- !query schema +struct +-- !query output +y1 int +f2 int +f3 string + + +-- !query +ALTER TABLE a2 RENAME TO tx1 +-- !query schema +struct<> +-- !query output + + + +-- !query +create view view_of_joins as +select * from + (select * from (tbl1 cross join tbl2) same) ss, + (tbl3 cross join tbl4) same +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt2 (a int, b int, c int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt3 (ax bigint, b short, c decimal) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt4 (ay int, b int, q int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view v1 as select * from tt2 natural join tt3 +-- !query schema +struct<> +-- !query output + + + +-- !query +create view 
v1a as select * from (tt2 natural join tt3) j +-- !query schema +struct<> +-- !query output + + + +-- !query +create view v2 as select * from tt2 join tt3 using (b,c) join tt4 using (b) +-- !query schema +struct<> +-- !query output + + + +-- !query +create view v2a as select * from (tt2 join tt3 using (b,c) join tt4 using (b)) j +-- !query schema +struct<> +-- !query output + + + +-- !query +create view v3 as select * from tt2 join tt3 using (b,c) full join tt4 using (b) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE v1 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v1a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v2 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v2a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v3 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +alter table tt2 add column d int +-- !query schema +struct<> +-- !query output + + + +-- !query +alter table tt2 add column e int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE v1 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v1a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v2 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v2a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v3 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +drop table tt3 +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt3 (ax bigint, b 
short, d decimal) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +alter table tt3 add column c int +-- !query schema +struct<> +-- !query output + + + +-- !query +alter table tt3 add column e int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE v1 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v1a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint + + +-- !query +DESC TABLE v2 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v2a +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +DESC TABLE v3 +-- !query schema +struct +-- !query output +b int +c int +a int +ax bigint +ay int +q int + + +-- !query +create table tt5 (a int, b int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt6 (c int, d int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view vv1 as select * from (tt5 cross join tt6) j(aa,bb,cc,dd) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv1 +-- !query schema +struct +-- !query output +aa int +bb int +cc int +dd int + + +-- !query +alter table tt5 add column c int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv1 +-- !query schema +struct +-- !query output +aa int +bb int +cc int +dd int + + +-- !query +alter table tt5 add column cc int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv1 +-- !query schema +struct +-- !query output +aa int +bb int +cc int +dd int + + +-- !query +create table tt7 (x int, /* xx int, */ y int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt8 (x int, z int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view vv2 as +select * 
from (values(1,2,3,4,5)) v(a,b,c,d,e) +union all +select * from tt7 full join tt8 using (x), tt8 tt8x +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv2 +-- !query schema +struct +-- !query output +a int +b int +c int +d int +e int + + +-- !query +create view vv3 as +select * from (values(1,2,3,4,5,6)) v(a,b,c,x,e,f) +union all +select * from + tt7 full join tt8 using (x), + tt7 tt7x full join tt8 tt8x using (x) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv3 +-- !query schema +struct +-- !query output +a int +b int +c int +x int +e int +f int + + +-- !query +create view vv4 as +select * from (values(1,2,3,4,5,6,7)) v(a,b,c,x,e,f,g) +union all +select * from + tt7 full join tt8 using (x), + tt7 tt7x full join tt8 tt8x using (x) full join tt8 tt8y using (x) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv4 +-- !query schema +struct +-- !query output +a int +b int +c int +x int +e int +f int +g int + + +-- !query +alter table tt7 add column zz int +-- !query schema +struct<> +-- !query output + + + +-- !query +alter table tt7 add column z int +-- !query schema +struct<> +-- !query output + + + +-- !query +alter table tt8 add column z2 int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv2 +-- !query schema +struct +-- !query output +a int +b int +c int +d int +e int + + +-- !query +DESC TABLE vv3 +-- !query schema +struct +-- !query output +a int +b int +c int +x int +e int +f int + + +-- !query +DESC TABLE vv4 +-- !query schema +struct +-- !query output +a int +b int +c int +x int +e int +f int +g int + + +-- !query +create table tt7a (x date, /* xx int, */ y int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt8a (x timestamp, z int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view vv2a as +select * from (values(now(),2,3,now(),5)) v(a,b,c,d,e) +union all +select * 
from tt7a left join tt8a using (x), tt8a tt8ax +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv4 +-- !query schema +struct +-- !query output +a int +b int +c int +x int +e int +f int +g int + + +-- !query +DESC TABLE vv2a +-- !query schema +struct +-- !query output +a timestamp +b int +c int +d timestamp +e int + + +-- !query +create table tt9 (x int, xx int, y int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt10 (x int, z int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view vv5 as select x,y,z from tt9 join tt10 using(x) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv5 +-- !query schema +struct +-- !query output +x int +y int +z int + + +-- !query +DESC TABLE vv5 +-- !query schema +struct +-- !query output +x int +y int +z int + + +-- !query +create table tt11 (x int, y int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt12 (x int, z int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create table tt13 (z int, q int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view vv6 as select x,y,z,q from + (tt11 join tt12 using(x)) join tt13 using(z) +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv6 +-- !query schema +struct +-- !query output +x int +y int +z int +q int + + +-- !query +alter table tt11 add column z int +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE vv6 +-- !query schema +struct +-- !query output +x int +y int +z int +q int + + +-- !query +CREATE TABLE int8_tbl (q1 int, q2 int) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +create view tt18v as + select * from int8_tbl xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy + union all + select * from int8_tbl 
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE tt18v +-- !query schema +struct +-- !query output +q1 int +q2 int + + +-- !query +create view tt21v as +select * from tt5 natural inner join tt6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE tt21v +-- !query schema +struct +-- !query output +c int +a int +b int +cc int +d int + + +-- !query +create view tt22v as +select * from tt5 natural left join tt6 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE tt22v +-- !query schema +struct +-- !query output +c int +a int +b int +cc int +d int + + +-- !query +create view tt23v (col_a, col_b) as +select q1 as other_name1, q2 as other_name2 from int8_tbl +union +select 42, 43 +-- !query schema +struct<> +-- !query output + + + +-- !query +DESC TABLE tt23v +-- !query schema +struct +-- !query output +col_a int +col_b int + + +-- !query +DROP SCHEMA temp_view_test CASCADE +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP SCHEMA testviewschm2 CASCADE +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW temp_table +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW tt +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out new file mode 100755 index 0000000000000..ed27317121623 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/date.sql.out @@ -0,0 +1,923 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 98 + + +-- !query +CREATE TABLE DATE_TBL (f1 date) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1957-04-09')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES 
(date('1957-06-13')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1996-02-28')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1996-02-29')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1996-03-01')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1996-03-02')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1997-02-28')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1997-03-01')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('1997-03-02')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2000-04-01')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2000-04-02')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2000-04-03')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2038-04-08')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2039-04-09')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO DATE_TBL VALUES (date('2040-04-10')) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT f1 AS `Fifteen` FROM DATE_TBL +-- !query schema +struct +-- !query output +1957-04-09 +1957-06-13 +1996-02-28 +1996-02-29 +1996-03-01 +1996-03-02 +1997-02-28 +1997-03-01 +1997-03-02 +2000-04-01 +2000-04-02 +2000-04-03 +2038-04-08 +2039-04-09 +2040-04-10 + + +-- !query +SELECT f1 AS `Nine` FROM DATE_TBL WHERE f1 < '2000-01-01' +-- !query schema +struct +-- !query output +1957-04-09 +1957-06-13 +1996-02-28 +1996-02-29 +1996-03-01 
+1996-03-02 +1997-02-28 +1997-03-01 +1997-03-02 + + +-- !query +SELECT f1 AS `Three` FROM DATE_TBL + WHERE f1 BETWEEN '2000-01-01' AND '2001-01-01' +-- !query schema +struct +-- !query output +2000-04-01 +2000-04-02 +2000-04-03 + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999-01-18' +-- !query schema +struct +-- !query output +1999-01-18 + + +-- !query +SELECT date '1999 Jan 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) + +== SQL == +SELECT date '1999 Jan 08' +-------^^^ + + +-- !query +SELECT date '1999 08 Jan' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) + +== SQL == +SELECT date '1999 08 Jan' +-------^^^ + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999-08-01' +-- !query schema +struct +-- !query output +1999-08-01 + + +-- !query +SELECT date '1999 01 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 01 08(line 1, pos 7) + +== SQL == +SELECT date '1999 01 08' +-------^^^ + + +-- !query +SELECT date '1999 08 01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 01(line 1, pos 7) + +== SQL == +SELECT date '1999 08 01' +-------^^^ + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999 Jan 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) + +== SQL == +SELECT date '1999 Jan 08' +-------^^^ + + +-- !query +SELECT date 
'1999 08 Jan' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) + +== SQL == +SELECT date '1999 08 Jan' +-------^^^ + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999-08-01' +-- !query schema +struct +-- !query output +1999-08-01 + + +-- !query +SELECT date '1999 01 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 01 08(line 1, pos 7) + +== SQL == +SELECT date '1999 01 08' +-------^^^ + + +-- !query +SELECT date '1999 08 01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 01(line 1, pos 7) + +== SQL == +SELECT date '1999 08 01' +-------^^^ + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999-01-18' +-- !query schema +struct +-- !query output +1999-01-18 + + +-- !query +SELECT date '1999 Jan 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7) + +== SQL == +SELECT date '1999 Jan 08' +-------^^^ + + +-- !query +SELECT date '1999 08 Jan' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7) + +== SQL == +SELECT date '1999 08 Jan' +-------^^^ + + +-- !query +SELECT date '1999-01-08' +-- !query schema +struct +-- !query output +1999-01-08 + + +-- !query +SELECT date '1999-08-01' +-- !query schema +struct +-- !query output +1999-08-01 + + +-- !query +SELECT date '1999 01 08' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 01 08(line 1, 
pos 7) + +== SQL == +SELECT date '1999 01 08' +-------^^^ + + +-- !query +SELECT date '1999 08 01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 1999 08 01(line 1, pos 7) + +== SQL == +SELECT date '1999 08 01' +-------^^^ + + +-- !query +SELECT date '4714-11-24 BC' +-- !query schema +struct +-- !query output +4714-11-24 + + +-- !query +SELECT date '4714-11-23 BC' +-- !query schema +struct +-- !query output +4714-11-23 + + +-- !query +SELECT date '5874897-12-31' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 5874897-12-31(line 1, pos 7) + +== SQL == +SELECT date '5874897-12-31' +-------^^^ + + +-- !query +SELECT date '5874898-01-01' +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +Cannot parse the DATE value: 5874898-01-01(line 1, pos 7) + +== SQL == +SELECT date '5874898-01-01' +-------^^^ + + +-- !query +SELECT f1 - date '2000-01-01' AS `Days From 2K` FROM DATE_TBL +-- !query schema +struct +-- !query output +-2 years -10 months +-2 years -10 months -1 days +-2 years -9 months -30 days +-3 years -10 months +-3 years -10 months -1 days +-3 years -10 months -2 days +-3 years -9 months -30 days +-42 years -6 months -18 days +-42 years -8 months -22 days +3 months +3 months 1 days +3 months 2 days +38 years 3 months 7 days +39 years 3 months 8 days +40 years 3 months 9 days + + +-- !query +SELECT f1 - date 'epoch' AS `Days From Epoch` FROM DATE_TBL +-- !query schema +struct +-- !query output +-12 years -6 months -18 days +-12 years -8 months -22 days +26 years 1 months 27 days +26 years 1 months 28 days +26 years 2 months +26 years 2 months 1 days +27 years 1 months 27 days +27 years 2 months +27 years 2 months 1 days +30 years 3 months +30 years 3 months 1 days +30 years 3 months 2 days +68 years 3 months 7 days +69 years 3 months 8 days +70 years 3 
months 9 days + + +-- !query +SELECT date 'yesterday' - date 'today' AS `One day` +-- !query schema +struct +-- !query output +-1 days + + +-- !query +SELECT date 'today' - date 'tomorrow' AS `One day` +-- !query schema +struct +-- !query output +-1 days + + +-- !query +SELECT date 'yesterday' - date 'tomorrow' AS `Two days` +-- !query schema +struct +-- !query output +-2 days + + +-- !query +SELECT date 'tomorrow' - date 'today' AS `One day` +-- !query schema +struct +-- !query output +1 days + + +-- !query +SELECT date 'today' - date 'yesterday' AS `One day` +-- !query schema +struct +-- !query output +1 days + + +-- !query +SELECT date 'tomorrow' - date 'yesterday' AS `Two days` +-- !query schema +struct +-- !query output +2 days + + +-- !query +SELECT EXTRACT(EPOCH FROM DATE '1970-01-01') +-- !query schema +struct +-- !query output +0.000000 + + +-- !query +SELECT EXTRACT(EPOCH FROM TIMESTAMP '1970-01-01') +-- !query schema +struct +-- !query output +0.000000 + + +-- !query +SELECT EXTRACT(CENTURY FROM TO_DATE('0101-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-2 + + +-- !query +SELECT EXTRACT(CENTURY FROM TO_DATE('0100-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-1 + + +-- !query +SELECT EXTRACT(CENTURY FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-1 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '0001-01-01') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '0001-01-01 AD') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '1900-12-31') +-- !query schema +struct +-- !query output +19 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '1901-01-01') +-- !query schema +struct +-- !query output +20 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '2000-12-31') +-- !query schema +struct +-- !query output +20 + + +-- !query +SELECT EXTRACT(CENTURY FROM DATE '2001-01-01') 
+-- !query schema +struct +-- !query output +21 + + +-- !query +SELECT EXTRACT(CENTURY FROM CURRENT_DATE)>=21 AS True +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM TO_DATE('0001-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-1 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM DATE '0001-01-01 AD') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM DATE '1000-12-31') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM DATE '1001-01-01') +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM DATE '2000-12-31') +-- !query schema +struct +-- !query output +2 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM DATE '2001-01-01') +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT EXTRACT(MILLENNIUM FROM CURRENT_DATE) +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT EXTRACT(DECADE FROM DATE '1994-12-25') +-- !query schema +struct +-- !query output +199 + + +-- !query +SELECT EXTRACT(DECADE FROM DATE '0010-01-01') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT EXTRACT(DECADE FROM DATE '0009-12-31') +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT EXTRACT(DECADE FROM TO_DATE('0001-01-01 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT EXTRACT(DECADE FROM TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-1 + + +-- !query +SELECT EXTRACT(DECADE FROM TO_DATE('0011-01-01 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-1 + + +-- !query +SELECT EXTRACT(DECADE FROM TO_DATE('0012-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-2 + + +-- !query +SELECT EXTRACT(CENTURY FROM NOW())>=21 AS True +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT EXTRACT(CENTURY FROM 
TIMESTAMP '1970-03-20 04:30:00.00000') +-- !query schema +struct +-- !query output +20 + + +-- !query +SELECT DATE_TRUNC('MILLENNIUM', TIMESTAMP '1970-03-20 04:30:00.00000') +-- !query schema +struct +-- !query output +1001-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('MILLENNIUM', DATE '1970-03-20') +-- !query schema +struct +-- !query output +1001-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('CENTURY', TIMESTAMP '1970-03-20 04:30:00.00000') +-- !query schema +struct +-- !query output +1901-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('CENTURY', DATE '1970-03-20') +-- !query schema +struct +-- !query output +1901-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('CENTURY', DATE '2004-08-10') +-- !query schema +struct +-- !query output +2001-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('CENTURY', DATE '0002-02-04') +-- !query schema +struct +-- !query output +0001-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('CENTURY', TO_DATE('0055-08-10 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-0099-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('DECADE', DATE '1993-12-25') +-- !query schema +struct +-- !query output +1990-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('DECADE', DATE '0004-12-25') +-- !query schema +struct +-- !query output +0000-01-01 00:00:00 + + +-- !query +SELECT DATE_TRUNC('DECADE', TO_DATE('0002-12-31 BC', 'yyyy-MM-dd G')) +-- !query schema +struct +-- !query output +-0010-01-01 00:00:00 + + +-- !query +select make_date(2013, 7, 15) +-- !query schema +struct +-- !query output +2013-07-15 + + +-- !query +select make_date(-44, 3, 15) +-- !query schema +struct +-- !query output +-0044-03-15 + + +-- !query +select make_date(2013, 2, 30) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select make_date(2013, 13, 1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +select make_date(2013, 11, -1) +-- !query schema +struct +-- !query output +NULL + + +-- !query +DROP TABLE DATE_TBL 
+-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/float4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out similarity index 58% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/float4.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out index 6e47cff91a7d5..fe8375c5eab8f 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/float4.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float4.sql.out @@ -2,155 +2,159 @@ -- Number of queries: 43 --- !query 0 +-- !query CREATE TABLE FLOAT4_TBL (f1 float) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 -INSERT INTO FLOAT4_TBL VALUES (' 0.0') --- !query 1 schema +-- !query +INSERT INTO FLOAT4_TBL VALUES (float(' 0.0')) +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 -INSERT INTO FLOAT4_TBL VALUES ('1004.30 ') --- !query 2 schema +-- !query +INSERT INTO FLOAT4_TBL VALUES (float('1004.30 ')) +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 -INSERT INTO FLOAT4_TBL VALUES (' -34.84 ') --- !query 3 schema +-- !query +INSERT INTO FLOAT4_TBL VALUES (float(' -34.84 ')) +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 -INSERT INTO FLOAT4_TBL VALUES ('1.2345678901234e+20') --- !query 4 schema +-- !query +INSERT INTO FLOAT4_TBL VALUES (float('1.2345678901234e+20')) +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 -INSERT INTO FLOAT4_TBL VALUES ('1.2345678901234e-20') --- !query 5 schema +-- !query +INSERT INTO FLOAT4_TBL VALUES (float('1.2345678901234e-20')) +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT float('NaN') --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NaN --- !query 7 +-- !query SELECT 
float('nan') --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NaN --- !query 8 +-- !query SELECT float(' NAN ') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NaN --- !query 9 +-- !query SELECT float('infinity') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output Infinity --- !query 10 +-- !query SELECT float(' -INFINiTY ') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output -Infinity --- !query 11 +-- !query SELECT float('N A N') --- !query 11 schema -struct --- !query 11 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: N A N --- !query 12 +-- !query SELECT float('NaN x') --- !query 12 schema -struct --- !query 12 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: NaN x --- !query 13 +-- !query SELECT float(' INFINITY x') --- !query 13 schema -struct --- !query 13 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: INFINITY x --- !query 14 +-- !query SELECT float('Infinity') + 100.0 --- !query 14 schema +-- !query schema struct<(CAST(CAST(Infinity AS FLOAT) AS DOUBLE) + CAST(100.0 AS DOUBLE)):double> --- !query 14 output +-- !query output Infinity --- !query 15 +-- !query SELECT float('Infinity') / float('Infinity') --- !query 15 schema +-- !query schema struct<(CAST(CAST(Infinity AS FLOAT) AS DOUBLE) / CAST(CAST(Infinity AS FLOAT) AS DOUBLE)):double> --- !query 15 output +-- !query output NaN --- !query 16 +-- !query SELECT float('nan') / float('nan') --- !query 16 schema +-- !query schema struct<(CAST(CAST(nan AS FLOAT) AS DOUBLE) / CAST(CAST(nan AS FLOAT) AS DOUBLE)):double> --- !query 16 output +-- !query output NaN --- !query 17 +-- !query SELECT float(decimal('nan')) --- !query 17 schema 
-struct --- !query 17 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: nan --- !query 18 +-- !query SELECT '' AS five, * FROM FLOAT4_TBL --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output -34.84 0.0 1.2345679E-20 @@ -158,116 +162,116 @@ struct 1004.3 --- !query 19 +-- !query SELECT '' AS four, f.* FROM FLOAT4_TBL f WHERE f.f1 <> '1004.3' --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output -34.84 0.0 1.2345679E-20 1.2345679E20 --- !query 20 +-- !query SELECT '' AS one, f.* FROM FLOAT4_TBL f WHERE f.f1 = '1004.3' --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1004.3 --- !query 21 +-- !query SELECT '' AS three, f.* FROM FLOAT4_TBL f WHERE '1004.3' > f.f1 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output -34.84 0.0 1.2345679E-20 --- !query 22 +-- !query SELECT '' AS three, f.* FROM FLOAT4_TBL f WHERE f.f1 < '1004.3' --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output -34.84 0.0 1.2345679E-20 --- !query 23 +-- !query SELECT '' AS four, f.* FROM FLOAT4_TBL f WHERE '1004.3' >= f.f1 --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output -34.84 0.0 1.2345679E-20 1004.3 --- !query 24 +-- !query SELECT '' AS four, f.* FROM FLOAT4_TBL f WHERE f.f1 <= '1004.3' --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output -34.84 0.0 1.2345679E-20 1004.3 --- !query 25 +-- !query SELECT '' AS three, f.f1, f.f1 * '-10' AS x FROM FLOAT4_TBL f WHERE f.f1 > '0.0' --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1.2345679E-20 -1.2345678720289608E-19 1.2345679E20 -1.2345678955701443E21 1004.3 -10042.999877929688 --- !query 26 +-- !query SELECT '' AS three, f.f1, f.f1 + '-10' AS x FROM FLOAT4_TBL f WHERE f.f1 > '0.0' --- !query 26 schema +-- !query 
schema struct --- !query 26 output +-- !query output 1.2345679E-20 -10.0 1.2345679E20 1.2345678955701443E20 1004.3 994.2999877929688 --- !query 27 +-- !query SELECT '' AS three, f.f1, f.f1 / '-10' AS x FROM FLOAT4_TBL f WHERE f.f1 > '0.0' --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 1.2345679E-20 -1.2345678720289608E-21 1.2345679E20 -1.2345678955701443E19 1004.3 -100.42999877929688 --- !query 28 +-- !query SELECT '' AS three, f.f1, f.f1 - '-10' AS x FROM FLOAT4_TBL f WHERE f.f1 > '0.0' --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 1.2345679E-20 10.0 1.2345679E20 1.2345678955701443E20 1004.3 1014.2999877929688 --- !query 29 +-- !query SELECT '' AS five, * FROM FLOAT4_TBL --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output -34.84 0.0 1.2345679E-20 @@ -275,105 +279,108 @@ struct 1004.3 --- !query 30 +-- !query SELECT smallint(float('32767.4')) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 32767 --- !query 31 +-- !query SELECT smallint(float('32767.6')) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 32767 --- !query 32 +-- !query SELECT smallint(float('-32768.4')) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output -32768 --- !query 33 +-- !query SELECT smallint(float('-32768.6')) --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output -32768 --- !query 34 +-- !query SELECT int(float('2147483520')) --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 2147483520 --- !query 35 +-- !query SELECT int(float('2147483647')) --- !query 35 schema -struct --- !query 35 output -2147483647 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting 2.14748365E9 to int causes overflow --- !query 36 +-- !query SELECT int(float('-2147483648.5')) --- !query 36 schema +-- !query 
schema struct --- !query 36 output +-- !query output -2147483648 --- !query 37 +-- !query SELECT int(float('-2147483900')) --- !query 37 schema -struct --- !query 37 output --2147483648 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting -2.1474839E9 to int causes overflow --- !query 38 +-- !query SELECT bigint(float('9223369837831520256')) --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 9223369837831520256 --- !query 39 +-- !query SELECT bigint(float('9223372036854775807')) --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 9223372036854775807 --- !query 40 +-- !query SELECT bigint(float('-9223372036854775808.5')) --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output -9223372036854775808 --- !query 41 +-- !query SELECT bigint(float('-9223380000000000000')) --- !query 41 schema -struct --- !query 41 output --9223372036854775808 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting -9.22338E18 to int causes overflow --- !query 42 +-- !query DROP TABLE FLOAT4_TBL --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/float8.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float8.sql.out similarity index 61% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/float8.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/float8.sql.out index b4ea3c1ad1cab..4cdb6958a230a 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/float8.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/float8.sql.out @@ -2,187 +2,191 @@ -- Number of queries: 95 --- !query 0 +-- !query CREATE TABLE FLOAT8_TBL(f1 double) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 -INSERT INTO FLOAT8_TBL 
VALUES (' 0.0 ') --- !query 1 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double(' 0.0 ')) +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 -INSERT INTO FLOAT8_TBL VALUES ('1004.30 ') --- !query 2 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('1004.30 ')) +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 -INSERT INTO FLOAT8_TBL VALUES (' -34.84') --- !query 3 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double(' -34.84')) +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 -INSERT INTO FLOAT8_TBL VALUES ('1.2345678901234e+200') --- !query 4 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('1.2345678901234e+200')) +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 -INSERT INTO FLOAT8_TBL VALUES ('1.2345678901234e-200') --- !query 5 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('1.2345678901234e-200')) +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT double('10e400') --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output Infinity --- !query 7 +-- !query SELECT double('-10e400') --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output -Infinity --- !query 8 +-- !query SELECT double('10e-400') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 0.0 --- !query 9 +-- !query SELECT double('-10e-400') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output -0.0 --- !query 10 +-- !query SELECT double('NaN') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NaN --- !query 11 +-- !query SELECT double('nan') --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output NaN --- !query 12 +-- !query SELECT double(' NAN ') --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output NaN --- 
!query 13 +-- !query SELECT double('infinity') --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output Infinity --- !query 14 +-- !query SELECT double(' -INFINiTY ') --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output -Infinity --- !query 15 +-- !query SELECT double('N A N') --- !query 15 schema -struct --- !query 15 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: N A N --- !query 16 +-- !query SELECT double('NaN x') --- !query 16 schema -struct --- !query 16 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: NaN x --- !query 17 +-- !query SELECT double(' INFINITY x') --- !query 17 schema -struct --- !query 17 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: INFINITY x --- !query 18 +-- !query SELECT double('Infinity') + 100.0 --- !query 18 schema +-- !query schema struct<(CAST(Infinity AS DOUBLE) + CAST(100.0 AS DOUBLE)):double> --- !query 18 output +-- !query output Infinity --- !query 19 +-- !query SELECT double('Infinity') / double('Infinity') --- !query 19 schema +-- !query schema struct<(CAST(Infinity AS DOUBLE) / CAST(Infinity AS DOUBLE)):double> --- !query 19 output +-- !query output NaN --- !query 20 +-- !query SELECT double('NaN') / double('NaN') --- !query 20 schema +-- !query schema struct<(CAST(NaN AS DOUBLE) / CAST(NaN AS DOUBLE)):double> --- !query 20 output +-- !query output NaN --- !query 21 +-- !query SELECT double(decimal('nan')) --- !query 21 schema -struct --- !query 21 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: nan --- !query 22 +-- !query SELECT '' AS five, * FROM FLOAT8_TBL --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query 
output -34.84 0.0 1.2345678901234E-200 @@ -190,121 +194,121 @@ struct 1004.3 --- !query 23 +-- !query SELECT '' AS four, f.* FROM FLOAT8_TBL f WHERE f.f1 <> '1004.3' --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output -34.84 0.0 1.2345678901234E-200 1.2345678901234E200 --- !query 24 +-- !query SELECT '' AS one, f.* FROM FLOAT8_TBL f WHERE f.f1 = '1004.3' --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1004.3 --- !query 25 +-- !query SELECT '' AS three, f.* FROM FLOAT8_TBL f WHERE '1004.3' > f.f1 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output -34.84 0.0 1.2345678901234E-200 --- !query 26 +-- !query SELECT '' AS three, f.* FROM FLOAT8_TBL f WHERE f.f1 < '1004.3' --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output -34.84 0.0 1.2345678901234E-200 --- !query 27 +-- !query SELECT '' AS four, f.* FROM FLOAT8_TBL f WHERE '1004.3' >= f.f1 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output -34.84 0.0 1.2345678901234E-200 1004.3 --- !query 28 +-- !query SELECT '' AS four, f.* FROM FLOAT8_TBL f WHERE f.f1 <= '1004.3' --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output -34.84 0.0 1.2345678901234E-200 1004.3 --- !query 29 +-- !query SELECT '' AS three, f.f1, f.f1 * '-10' AS x FROM FLOAT8_TBL f WHERE f.f1 > '0.0' --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1.2345678901234E-200 -1.2345678901234E-199 1.2345678901234E200 -1.2345678901234E201 1004.3 -10043.0 --- !query 30 +-- !query SELECT '' AS three, f.f1, f.f1 + '-10' AS x FROM FLOAT8_TBL f WHERE f.f1 > '0.0' --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1.2345678901234E-200 -10.0 1.2345678901234E200 1.2345678901234E200 1004.3 994.3 --- !query 31 +-- !query SELECT '' AS three, f.f1, f.f1 / '-10' AS x FROM FLOAT8_TBL f WHERE f.f1 > '0.0' --- !query 
31 schema +-- !query schema struct --- !query 31 output +-- !query output 1.2345678901234E-200 -1.2345678901234E-201 1.2345678901234E200 -1.2345678901234E199 1004.3 -100.42999999999999 --- !query 32 +-- !query SELECT '' AS three, f.f1, f.f1 - '-10' AS x FROM FLOAT8_TBL f WHERE f.f1 > '0.0' --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 1.2345678901234E-200 10.0 1.2345678901234E200 1.2345678901234E200 1004.3 1014.3 --- !query 33 +-- !query SELECT '' AS five, f.f1, round(f.f1) AS round_f1 FROM FLOAT8_TBL f --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output -34.84 -35.0 0.0 0.0 1.2345678901234E-200 0.0 @@ -312,11 +316,11 @@ struct 1004.3 1004.0 --- !query 34 +-- !query select ceil(f1) as ceil_f1 from float8_tbl f --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output -34 0 1 @@ -324,11 +328,11 @@ struct 9223372036854775807 --- !query 35 +-- !query select ceiling(f1) as ceiling_f1 from float8_tbl f --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output -34 0 1 @@ -336,11 +340,11 @@ struct 9223372036854775807 --- !query 36 +-- !query select floor(f1) as floor_f1 from float8_tbl f --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output -35 0 0 @@ -348,11 +352,11 @@ struct 9223372036854775807 --- !query 37 +-- !query select sign(f1) as sign_f1 from float8_tbl f --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output -1.0 0.0 1.0 @@ -360,87 +364,87 @@ struct 1.0 --- !query 38 +-- !query SELECT sqrt(double('64')) AS eight --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 8.0 --- !query 39 +-- !query SELECT power(double('144'), double('0.5')) --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 12.0 --- !query 40 +-- !query SELECT power(double('NaN'), double('0.5')) --- !query 40 schema +-- !query schema struct --- !query 
40 output +-- !query output NaN --- !query 41 +-- !query SELECT power(double('144'), double('NaN')) --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output NaN --- !query 42 +-- !query SELECT power(double('NaN'), double('NaN')) --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output NaN --- !query 43 +-- !query SELECT power(double('-1'), double('NaN')) --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output NaN --- !query 44 +-- !query SELECT power(double('1'), double('NaN')) --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output NaN --- !query 45 +-- !query SELECT power(double('NaN'), double('0')) --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 1.0 --- !query 46 +-- !query SELECT '' AS three, f.f1, exp(ln(f.f1)) AS exp_ln_f1 FROM FLOAT8_TBL f WHERE f.f1 > '0.0' --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output 1.2345678901234E-200 1.2345678901233948E-200 1.2345678901234E200 1.234567890123379E200 1004.3 1004.3000000000004 --- !query 47 +-- !query SELECT '' AS five, * FROM FLOAT8_TBL --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output -34.84 0.0 1.2345678901234E-200 @@ -448,22 +452,22 @@ struct 1004.3 --- !query 48 +-- !query CREATE TEMPORARY VIEW UPDATED_FLOAT8_TBL as SELECT CASE WHEN FLOAT8_TBL.f1 > '0.0' THEN FLOAT8_TBL.f1 * '-1' ELSE FLOAT8_TBL.f1 END AS f1 FROM FLOAT8_TBL --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output --- !query 49 +-- !query SELECT '' AS bad, f.f1 * '1e200' from UPDATED_FLOAT8_TBL f --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output -1.0042999999999999E203 -1.2345678901234 -3.484E201 @@ -471,11 +475,11 @@ struct 0.0 --- !query 50 +-- !query SELECT '' AS five, * FROM UPDATED_FLOAT8_TBL --- !query 50 schema +-- !query schema struct --- !query 50 output 
+-- !query output -1.2345678901234E-200 -1.2345678901234E200 -1004.3 @@ -483,251 +487,251 @@ struct 0.0 --- !query 51 +-- !query SELECT sinh(double('1')) --- !query 51 schema +-- !query schema struct --- !query 51 output +-- !query output 1.1752011936438014 --- !query 52 +-- !query SELECT cosh(double('1')) --- !query 52 schema +-- !query schema struct --- !query 52 output +-- !query output 1.543080634815244 --- !query 53 +-- !query SELECT tanh(double('1')) --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 0.7615941559557649 --- !query 54 +-- !query SELECT asinh(double('1')) --- !query 54 schema +-- !query schema struct --- !query 54 output +-- !query output 0.8813735870195429 --- !query 55 +-- !query SELECT acosh(double('2')) --- !query 55 schema +-- !query schema struct --- !query 55 output +-- !query output 1.3169578969248166 --- !query 56 +-- !query SELECT atanh(double('0.5')) --- !query 56 schema +-- !query schema struct --- !query 56 output +-- !query output 0.5493061443340548 --- !query 57 +-- !query SELECT sinh(double('Infinity')) --- !query 57 schema +-- !query schema struct --- !query 57 output +-- !query output Infinity --- !query 58 +-- !query SELECT sinh(double('-Infinity')) --- !query 58 schema +-- !query schema struct --- !query 58 output +-- !query output -Infinity --- !query 59 +-- !query SELECT sinh(double('NaN')) --- !query 59 schema +-- !query schema struct --- !query 59 output +-- !query output NaN --- !query 60 +-- !query SELECT cosh(double('Infinity')) --- !query 60 schema +-- !query schema struct --- !query 60 output +-- !query output Infinity --- !query 61 +-- !query SELECT cosh(double('-Infinity')) --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output Infinity --- !query 62 +-- !query SELECT cosh(double('NaN')) --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output NaN --- !query 63 +-- !query SELECT tanh(double('Infinity')) --- !query 63 schema 
+-- !query schema struct --- !query 63 output +-- !query output 1.0 --- !query 64 +-- !query SELECT tanh(double('-Infinity')) --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output -1.0 --- !query 65 +-- !query SELECT tanh(double('NaN')) --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output NaN --- !query 66 +-- !query SELECT asinh(double('Infinity')) --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output Infinity --- !query 67 +-- !query SELECT asinh(double('-Infinity')) --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output -Infinity --- !query 68 +-- !query SELECT asinh(double('NaN')) --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output NaN --- !query 69 +-- !query SELECT acosh(double('Infinity')) --- !query 69 schema +-- !query schema struct --- !query 69 output +-- !query output Infinity --- !query 70 +-- !query SELECT acosh(double('-Infinity')) --- !query 70 schema +-- !query schema struct --- !query 70 output +-- !query output NaN --- !query 71 +-- !query SELECT acosh(double('NaN')) --- !query 71 schema +-- !query schema struct --- !query 71 output +-- !query output NaN --- !query 72 +-- !query SELECT atanh(double('Infinity')) --- !query 72 schema +-- !query schema struct --- !query 72 output +-- !query output NaN --- !query 73 +-- !query SELECT atanh(double('-Infinity')) --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output NaN --- !query 74 +-- !query SELECT atanh(double('NaN')) --- !query 74 schema +-- !query schema struct --- !query 74 output +-- !query output NaN --- !query 75 +-- !query TRUNCATE TABLE FLOAT8_TBL --- !query 75 schema +-- !query schema struct<> --- !query 75 output +-- !query output --- !query 76 -INSERT INTO FLOAT8_TBL VALUES ('0.0') --- !query 76 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('0.0')) +-- !query schema struct<> --- !query 76 
output +-- !query output --- !query 77 -INSERT INTO FLOAT8_TBL VALUES ('-34.84') --- !query 77 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('-34.84')) +-- !query schema struct<> --- !query 77 output +-- !query output --- !query 78 -INSERT INTO FLOAT8_TBL VALUES ('-1004.30') --- !query 78 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('-1004.30')) +-- !query schema struct<> --- !query 78 output +-- !query output --- !query 79 -INSERT INTO FLOAT8_TBL VALUES ('-1.2345678901234e+200') --- !query 79 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('-1.2345678901234e+200')) +-- !query schema struct<> --- !query 79 output +-- !query output --- !query 80 -INSERT INTO FLOAT8_TBL VALUES ('-1.2345678901234e-200') --- !query 80 schema +-- !query +INSERT INTO FLOAT8_TBL VALUES (double('-1.2345678901234e-200')) +-- !query schema struct<> --- !query 80 output +-- !query output --- !query 81 +-- !query SELECT '' AS five, * FROM FLOAT8_TBL --- !query 81 schema +-- !query schema struct --- !query 81 output +-- !query output -1.2345678901234E-200 -1.2345678901234E200 -1004.3 @@ -735,105 +739,106 @@ struct 0.0 --- !query 82 +-- !query SELECT smallint(double('32767.4')) --- !query 82 schema +-- !query schema struct --- !query 82 output +-- !query output 32767 --- !query 83 +-- !query SELECT smallint(double('32767.6')) --- !query 83 schema +-- !query schema struct --- !query 83 output +-- !query output 32767 --- !query 84 +-- !query SELECT smallint(double('-32768.4')) --- !query 84 schema +-- !query schema struct --- !query 84 output +-- !query output -32768 --- !query 85 +-- !query SELECT smallint(double('-32768.6')) --- !query 85 schema +-- !query schema struct --- !query 85 output +-- !query output -32768 --- !query 86 +-- !query SELECT int(double('2147483647.4')) --- !query 86 schema +-- !query schema struct --- !query 86 output +-- !query output 2147483647 --- !query 87 +-- !query SELECT int(double('2147483647.6')) --- !query 87 schema +-- !query 
schema struct --- !query 87 output +-- !query output 2147483647 --- !query 88 +-- !query SELECT int(double('-2147483648.4')) --- !query 88 schema +-- !query schema struct --- !query 88 output +-- !query output -2147483648 --- !query 89 +-- !query SELECT int(double('-2147483648.6')) --- !query 89 schema +-- !query schema struct --- !query 89 output +-- !query output -2147483648 --- !query 90 +-- !query SELECT bigint(double('9223372036854773760')) --- !query 90 schema +-- !query schema struct --- !query 90 output +-- !query output 9223372036854773760 --- !query 91 +-- !query SELECT bigint(double('9223372036854775807')) --- !query 91 schema +-- !query schema struct --- !query 91 output +-- !query output 9223372036854775807 --- !query 92 +-- !query SELECT bigint(double('-9223372036854775808.5')) --- !query 92 schema +-- !query schema struct --- !query 92 output +-- !query output -9223372036854775808 --- !query 93 +-- !query SELECT bigint(double('-9223372036854780000')) --- !query 93 schema -struct --- !query 93 output --9223372036854775808 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting -9.22337203685478E18 to long causes overflow --- !query 94 +-- !query DROP TABLE FLOAT8_TBL --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out new file mode 100644 index 0000000000000..24fd9dcbfc826 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/groupingsets.sql.out @@ -0,0 +1,715 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 54 + + +-- !query +create temp view gstest1(a,b,v) + as values (1,1,10),(1,1,11),(1,2,12),(1,2,13),(1,3,14), + (2,3,15), + (3,3,16),(3,4,17), + (4,1,18),(4,1,19) +-- !query schema +struct<> +-- !query output + + + +-- !query +create table gstest2 (a integer, b 
integer, c integer, d integer, + e integer, f integer, g integer, h integer) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into gstest2 values + (1, 1, 1, 1, 1, 1, 1, 1), + (1, 1, 1, 1, 1, 1, 1, 2), + (1, 1, 1, 1, 1, 1, 2, 2), + (1, 1, 1, 1, 1, 2, 2, 2), + (1, 1, 1, 1, 2, 2, 2, 2), + (1, 1, 1, 2, 2, 2, 2, 2), + (1, 1, 2, 2, 2, 2, 2, 2), + (1, 2, 2, 2, 2, 2, 2, 2), + (2, 2, 2, 2, 2, 2, 2, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +create table gstest3 (a integer, b integer, c integer, d integer) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into gstest3 values + (1, 1, 1, 1), + (2, 2, 2, 2) +-- !query schema +struct<> +-- !query output + + + +-- !query +create table gstest4(id integer, v integer, + unhashable_col /* bit(4) */ byte, unsortable_col /* xid */ integer) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into gstest4 +values (1,1,tinyint('0'),1), (2,2,tinyint('1'),1), + (3,4,tinyint('2'),2), (4,8,tinyint('3'),2), + (5,16,tinyint('0'),2), (6,32,tinyint('1'),2), + (7,64,tinyint('2'),1), (8,128,tinyint('3'),1) +-- !query schema +struct<> +-- !query output + + + +-- !query +create table gstest_empty (a integer, b integer, v integer) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) +-- !query schema +struct +-- !query output +1 1 0 0 21 2 11 +1 2 0 0 25 2 13 +1 3 0 0 14 1 14 +1 NULL 0 1 60 5 14 +2 3 0 0 15 1 15 +2 NULL 0 1 15 1 15 +3 3 0 0 16 1 16 +3 4 0 0 17 1 17 +3 NULL 0 1 33 2 17 +4 1 0 0 37 2 19 +4 NULL 0 1 37 2 19 +NULL NULL 1 1 145 10 19 + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by a,b +-- !query schema +struct +-- !query output +NULL NULL 1 1 145 10 19 +1 NULL 0 1 60 5 14 +1 1 0 0 21 2 11 +1 2 0 0 25 2 13 +1 3 0 
0 14 1 14 +2 NULL 0 1 15 1 15 +2 3 0 0 15 1 15 +3 NULL 0 1 33 2 17 +3 3 0 0 16 1 16 +3 4 0 0 17 1 17 +4 NULL 0 1 37 2 19 +4 1 0 0 37 2 19 + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by b desc, a +-- !query schema +struct +-- !query output +3 4 0 0 17 1 17 +1 3 0 0 14 1 14 +2 3 0 0 15 1 15 +3 3 0 0 16 1 16 +1 2 0 0 25 2 13 +1 1 0 0 21 2 11 +4 1 0 0 37 2 19 +NULL NULL 1 1 145 10 19 +1 NULL 0 1 60 5 14 +2 NULL 0 1 15 1 15 +3 NULL 0 1 33 2 17 +4 NULL 0 1 37 2 19 + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by rollup (a,b) order by coalesce(a,0)+coalesce(b,0) +-- !query schema +struct +-- !query output +NULL NULL 1 1 145 10 19 +1 NULL 0 1 60 5 14 +1 1 0 0 21 2 11 +2 NULL 0 1 15 1 15 +1 2 0 0 25 2 13 +3 NULL 0 1 33 2 17 +1 3 0 0 14 1 14 +4 NULL 0 1 37 2 19 +4 1 0 0 37 2 19 +2 3 0 0 15 1 15 +3 3 0 0 16 1 16 +3 4 0 0 17 1 17 + + +-- !query +select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum + from gstest2 group by rollup (a,b) order by rsum, a, b +-- !query schema +struct +-- !query output +NULL NULL 12 12 +1 NULL 10 22 +1 1 8 30 +1 2 2 32 +2 NULL 2 34 +2 2 2 36 + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a) +-- !query schema +struct +-- !query output + + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),()) +-- !query schema +struct +-- !query output + + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),()) +-- !query schema +struct +-- !query output + + + +-- !query +select sum(v), count(*) from gstest_empty group by grouping sets ((),(),()) +-- !query schema +struct +-- !query output + + + +-- !query +select t1.a, t2.b, sum(t1.v), count(*) from gstest_empty t1, gstest_empty t2 + group by grouping sets ((t1.a,t2.b),()) +-- !query schema +struct +-- !query output + + + +-- !query +select 
t1.a, t2.b, grouping(t1.a), grouping(t2.b), sum(t1.v), max(t2.a) + from gstest1 t1, gstest2 t2 + group by grouping sets ((t1.a, t2.b), ()) +-- !query schema +struct +-- !query output +1 1 0 0 420 1 +1 2 0 0 120 2 +2 1 0 0 105 1 +2 2 0 0 30 2 +3 1 0 0 231 1 +3 2 0 0 66 2 +4 1 0 0 259 1 +4 2 0 0 74 2 +NULL NULL 1 1 1305 2 + + +-- !query +select t1.a, t2.b, grouping(t1.a), grouping(t2.b), sum(t1.v), max(t2.a) + from gstest1 t1 join gstest2 t2 on (t1.a=t2.a) + group by grouping sets ((t1.a, t2.b), ()) +-- !query schema +struct +-- !query output +1 1 0 0 420 1 +1 2 0 0 60 1 +2 2 0 0 15 2 +NULL NULL 1 1 495 2 + + +-- !query +select a, b, grouping(a), grouping(b), sum(t1.v), max(t2.c) + from gstest1 t1 join gstest2 t2 using (a,b) + group by grouping sets ((a, b), ()) +-- !query schema +struct +-- !query output +1 1 0 0 147 2 +1 2 0 0 25 2 +NULL NULL 1 1 172 2 + + +-- !query +select four, x + from (select four, ten, 'foo' as x from tenk1) as t + group by grouping sets (four, x) + having x = 'foo' +-- !query schema +struct +-- !query output +NULL foo + + +-- !query +select four, x || 'x' + from (select four, ten, 'foo' as x from tenk1) as t + group by grouping sets (four, x) + order by four +-- !query schema +struct +-- !query output +NULL foox +0 NULL +1 NULL +2 NULL +3 NULL + + +-- !query +select (x+y)*1, sum(z) + from (select 1 as x, 2 as y, 3 as z) s + group by grouping sets (x+y, x) +-- !query schema +struct<((x + y) * 1):int,sum(z):bigint> +-- !query output +3 3 +NULL 3 + + +-- !query +CREATE TEMP VIEW int8_tbl AS SELECT * FROM VALUES + (123L, 456L), + (123L, 4567890123456789L), + (4567890123456789L, 123L), + (4567890123456789L, 4567890123456789L), + (4567890123456789L, -4567890123456789L) as int8_tbl(q1, q2) +-- !query schema +struct<> +-- !query output + + + +-- !query +select x, not x as not_x, q2 from + (select *, q1 = 1 as x from int8_tbl i1) as t + group by grouping sets(x, q2) + order by x, q2 +-- !query schema +struct +-- !query output +NULL NULL 
-4567890123456789 +NULL NULL 123 +NULL NULL 456 +NULL NULL 4567890123456789 +false true NULL + + +-- !query +DROP VIEW int8_tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +select ten, sum(distinct four) from onek a +group by grouping sets((ten,four),(ten)) +having exists (select 1 from onek b where sum(distinct a.four) = b.four) +-- !query schema +struct +-- !query output +0 0 +0 2 +0 2 +1 1 +1 3 +2 0 +2 2 +2 2 +3 1 +3 3 +4 0 +4 2 +4 2 +5 1 +5 3 +6 0 +6 2 +6 2 +7 1 +7 3 +8 0 +8 2 +8 2 +9 1 +9 3 + + +-- !query +select a,count(*) from gstest2 group by rollup(a) order by a +-- !query schema +struct +-- !query output +NULL 9 +1 8 +2 1 + + +-- !query +select a,count(*) from gstest2 group by rollup(a) having a is distinct from 1 order by a +-- !query schema +struct +-- !query output +NULL 9 +2 1 + + +-- !query +select ten, grouping(ten) from onek +group by grouping sets(ten) having grouping(ten) >= 0 +order by 2,1 +-- !query schema +struct +-- !query output +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 + + +-- !query +select ten, grouping(ten) from onek +group by grouping sets(ten, four) having grouping(ten) > 0 +order by 2,1 +-- !query schema +struct +-- !query output +NULL 1 +NULL 1 +NULL 1 +NULL 1 + + +-- !query +select ten, grouping(ten) from onek +group by rollup(ten) having grouping(ten) > 0 +order by 2,1 +-- !query schema +struct +-- !query output +NULL 1 + + +-- !query +select ten, grouping(ten) from onek +group by cube(ten) having grouping(ten) > 0 +order by 2,1 +-- !query schema +struct +-- !query output +NULL 1 + + +-- !query +select count(*) from gstest4 group by rollup(unhashable_col,unsortable_col) +-- !query schema +struct +-- !query output +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +2 +2 +8 + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by grouping sets ((a),(b)) order by 3,4,1,2 /* 3,1,2 */ +-- !query schema +struct +-- !query output +1 NULL 0 1 60 5 14 +2 NULL 0 1 15 1 15 +3 NULL 0 1 33 2 
17 +4 NULL 0 1 37 2 19 +NULL 1 1 0 58 4 19 +NULL 2 1 0 25 2 13 +NULL 3 1 0 45 3 16 +NULL 4 1 0 17 1 17 + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by cube(a,b) order by 3,4,1,2 /* 3,1,2 */ +-- !query schema +struct +-- !query output +1 1 0 0 21 2 11 +1 2 0 0 25 2 13 +1 3 0 0 14 1 14 +2 3 0 0 15 1 15 +3 3 0 0 16 1 16 +3 4 0 0 17 1 17 +4 1 0 0 37 2 19 +1 NULL 0 1 60 5 14 +2 NULL 0 1 15 1 15 +3 NULL 0 1 33 2 17 +4 NULL 0 1 37 2 19 +NULL 1 1 0 58 4 19 +NULL 2 1 0 25 2 13 +NULL 3 1 0 45 3 16 +NULL 4 1 0 17 1 17 +NULL NULL 1 1 145 10 19 + + +-- !query +select unsortable_col, count(*) + from gstest4 group by grouping sets ((unsortable_col),(unsortable_col)) + order by string(unsortable_col) +-- !query schema +struct +-- !query output +1 4 +1 4 +2 4 +2 4 + + +-- !query +select unhashable_col, unsortable_col, + grouping(unhashable_col), grouping(unsortable_col), + count(*), sum(v) + from gstest4 group by grouping sets ((unhashable_col),(unsortable_col)) + order by 3, 4, 6 /* 3, 5 */ +-- !query schema +struct +-- !query output +0 NULL 0 1 2 17 +1 NULL 0 1 2 34 +2 NULL 0 1 2 68 +3 NULL 0 1 2 136 +NULL 2 1 0 4 60 +NULL 1 1 0 4 195 + + +-- !query +select unhashable_col, unsortable_col, + grouping(unhashable_col), grouping(unsortable_col), + count(*), sum(v) + from gstest4 group by grouping sets ((v,unhashable_col),(v,unsortable_col)) + order by 3, 4, 6 /* 3,5 */ +-- !query schema +struct +-- !query output +0 NULL 0 1 1 1 +1 NULL 0 1 1 2 +2 NULL 0 1 1 4 +3 NULL 0 1 1 8 +0 NULL 0 1 1 16 +1 NULL 0 1 1 32 +2 NULL 0 1 1 64 +3 NULL 0 1 1 128 +NULL 1 1 0 1 1 +NULL 1 1 0 1 2 +NULL 2 1 0 1 4 +NULL 2 1 0 1 8 +NULL 2 1 0 1 16 +NULL 2 1 0 1 32 +NULL 1 1 0 1 64 +NULL 1 1 0 1 128 + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),a) +-- !query schema +struct +-- !query output + + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),()) +-- !query schema 
+struct +-- !query output + + + +-- !query +select a, b, sum(v), count(*) from gstest_empty group by grouping sets ((a,b),(),(),()) +-- !query schema +struct +-- !query output + + + +-- !query +select sum(v), count(*) from gstest_empty group by grouping sets ((),(),()) +-- !query schema +struct +-- !query output + + + +-- !query +select a, b, grouping(a), grouping(b), sum(v), count(*), max(v) + from gstest1 group by grouping sets ((a,b),(a+1,b+1),(a+2,b+2)) order by 3,4,7 /* 3,6 */ +-- !query schema +struct +-- !query output +1 1 0 0 21 2 11 +1 2 0 0 25 2 13 +1 3 0 0 14 1 14 +2 3 0 0 15 1 15 +3 3 0 0 16 1 16 +3 4 0 0 17 1 17 +4 1 0 0 37 2 19 +NULL NULL 1 1 21 2 11 +NULL NULL 1 1 21 2 11 +NULL NULL 1 1 25 2 13 +NULL NULL 1 1 25 2 13 +NULL NULL 1 1 14 1 14 +NULL NULL 1 1 14 1 14 +NULL NULL 1 1 15 1 15 +NULL NULL 1 1 15 1 15 +NULL NULL 1 1 16 1 16 +NULL NULL 1 1 16 1 16 +NULL NULL 1 1 17 1 17 +NULL NULL 1 1 17 1 17 +NULL NULL 1 1 37 2 19 +NULL NULL 1 1 37 2 19 + + +-- !query +select a, b, sum(c), sum(sum(c)) over (order by a,b) as rsum + from gstest2 group by cube (a,b) order by rsum, a, b +-- !query schema +struct +-- !query output +NULL NULL 12 12 +NULL 1 8 20 +NULL 2 4 24 +1 NULL 10 34 +1 1 8 42 +1 2 2 44 +2 NULL 2 46 +2 2 2 48 + + +-- !query +SELECT a, b, count(*), max(a), max(b) FROM gstest3 GROUP BY GROUPING SETS(a, b,()) ORDER BY a, b +-- !query schema +struct +-- !query output +NULL NULL 2 2 2 +NULL 1 1 1 1 +NULL 2 1 2 2 +1 NULL 1 1 1 +2 NULL 1 2 2 + + +-- !query +select v||'a', case grouping(v||'a') when 1 then 1 else 0 end, count(*) + from values (1, 'a'), (1, 'b') u(i,v) + group by rollup(i, v||'a') order by 1,3 +-- !query schema +struct +-- !query output +NULL 1 2 +NULL 1 2 +aa 0 1 +ba 0 1 + + +-- !query +select v||'a', case when grouping(v||'a') = 1 then 1 else 0 end, count(*) + from values (1, 'a'), (1, 'b') u(i,v) + group by rollup(i, v||'a') order by 1,3 +-- !query schema +struct +-- !query output +NULL 1 2 +NULL 1 2 +aa 0 1 +ba 0 1 + + +-- !query 
+DROP VIEW gstest1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE gstest2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE gstest3 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE gstest4 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE gstest_empty +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out new file mode 100644 index 0000000000000..1046d0ec86bbd --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/insert.sql.out @@ -0,0 +1,81 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 9 + + +-- !query +create table inserttest (col1 int, col2 int /* NOT NULL */, col3 string /* default 'testing' */) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into inserttest values (NULL, 3, 'testing') +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into inserttest values (NULL, 5, 'testing') +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into inserttest values (NULL, 5, 'test') +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into inserttest values (NULL, 7, 'testing') +-- !query schema +struct<> +-- !query output + + + +-- !query +select * from inserttest +-- !query schema +struct +-- !query output +NULL 3 testing +NULL 5 test +NULL 5 testing +NULL 7 testing + + +-- !query +insert into inserttest values(30, 50, repeat('x', 10000)) +-- !query schema +struct<> +-- !query output + + + +-- !query +select col1, col2, char_length(col3) from inserttest +-- !query schema +struct +-- !query output +30 50 10000 +NULL 3 7 +NULL 5 4 +NULL 5 7 +NULL 7 7 + + +-- !query +drop table inserttest +-- !query schema +struct<> +-- !query output + diff --git 
a/sql/core/src/test/resources/sql-tests/results/pgSQL/int2.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int2.sql.out old mode 100644 new mode 100755 similarity index 64% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/int2.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/int2.sql.out index 569d137891dd3..02e373f2d2b2b --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/int2.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int2.sql.out @@ -2,59 +2,59 @@ -- Number of queries: 35 --- !query 0 +-- !query CREATE TABLE INT2_TBL(f1 smallint) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 -INSERT INTO INT2_TBL VALUES (trim('0 ')) --- !query 1 schema +-- !query +INSERT INTO INT2_TBL VALUES (smallint(trim('0 '))) +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 -INSERT INTO INT2_TBL VALUES (trim(' 1234 ')) --- !query 2 schema +-- !query +INSERT INTO INT2_TBL VALUES (smallint(trim(' 1234 '))) +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 -INSERT INTO INT2_TBL VALUES (trim(' -1234')) --- !query 3 schema +-- !query +INSERT INTO INT2_TBL VALUES (smallint(trim(' -1234'))) +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 -INSERT INTO INT2_TBL VALUES ('32767') --- !query 4 schema +-- !query +INSERT INTO INT2_TBL VALUES (smallint('32767')) +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 -INSERT INTO INT2_TBL VALUES ('-32767') --- !query 5 schema +-- !query +INSERT INTO INT2_TBL VALUES (smallint('-32767')) +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT '' AS five, * FROM INT2_TBL --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output -1234 -32767 0 @@ -62,154 +62,154 @@ struct 32767 --- !query 7 +-- !query SELECT '' AS four, i.* FROM 
INT2_TBL i WHERE i.f1 <> smallint('0') --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output -1234 -32767 1234 32767 --- !query 8 +-- !query SELECT '' AS four, i.* FROM INT2_TBL i WHERE i.f1 <> int('0') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output -1234 -32767 1234 32767 --- !query 9 +-- !query SELECT '' AS one, i.* FROM INT2_TBL i WHERE i.f1 = smallint('0') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 0 --- !query 10 +-- !query SELECT '' AS one, i.* FROM INT2_TBL i WHERE i.f1 = int('0') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 0 --- !query 11 +-- !query SELECT '' AS two, i.* FROM INT2_TBL i WHERE i.f1 < smallint('0') --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output -1234 -32767 --- !query 12 +-- !query SELECT '' AS two, i.* FROM INT2_TBL i WHERE i.f1 < int('0') --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output -1234 -32767 --- !query 13 +-- !query SELECT '' AS three, i.* FROM INT2_TBL i WHERE i.f1 <= smallint('0') --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output -1234 -32767 0 --- !query 14 +-- !query SELECT '' AS three, i.* FROM INT2_TBL i WHERE i.f1 <= int('0') --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output -1234 -32767 0 --- !query 15 +-- !query SELECT '' AS two, i.* FROM INT2_TBL i WHERE i.f1 > smallint('0') --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1234 32767 --- !query 16 +-- !query SELECT '' AS two, i.* FROM INT2_TBL i WHERE i.f1 > int('0') --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1234 32767 --- !query 17 +-- !query SELECT '' AS three, i.* FROM INT2_TBL i WHERE i.f1 >= smallint('0') --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 0 1234 32767 --- 
!query 18 +-- !query SELECT '' AS three, i.* FROM INT2_TBL i WHERE i.f1 >= int('0') --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 0 1234 32767 --- !query 19 +-- !query SELECT '' AS one, i.* FROM INT2_TBL i WHERE (i.f1 % smallint('2')) = smallint('1') --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 32767 --- !query 20 +-- !query SELECT '' AS three, i.* FROM INT2_TBL i WHERE (i.f1 % int('2')) = smallint('0') --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output -1234 0 1234 --- !query 21 +-- !query SELECT '' AS five, i.f1, i.f1 * smallint('2') AS x FROM INT2_TBL i WHERE abs(f1) < 16384 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output -1234 -2468 0 0 1234 2468 --- !query 22 +-- !query SELECT '' AS five, i.f1, i.f1 * int('2') AS x FROM INT2_TBL i --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output -1234 -2468 -32767 -65534 0 0 @@ -217,23 +217,23 @@ struct 32767 65534 --- !query 23 +-- !query SELECT '' AS five, i.f1, i.f1 + smallint('2') AS x FROM INT2_TBL i WHERE f1 < 32766 --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output -1234 -1232 -32767 -32765 0 2 1234 1236 --- !query 24 +-- !query SELECT '' AS five, i.f1, i.f1 + int('2') AS x FROM INT2_TBL i --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output -1234 -1232 -32767 -32765 0 2 @@ -241,23 +241,23 @@ struct 32767 32769 --- !query 25 +-- !query SELECT '' AS five, i.f1, i.f1 - smallint('2') AS x FROM INT2_TBL i WHERE f1 > -32767 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output -1234 -1236 0 -2 1234 1232 32767 32765 --- !query 26 +-- !query SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT2_TBL i --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output -1234 -1236 -32767 -32769 0 -2 @@ -265,55 +265,55 @@ struct 
32767 32765 --- !query 27 +-- !query SELECT '' AS five, i.f1, i.f1 / smallint('2') AS x FROM INT2_TBL i --- !query 27 schema -struct --- !query 27 output - -1234 -617 - -32767 -16383 - 0 0 - 1234 617 - 32767 16383 +-- !query schema +struct +-- !query output + -1234 -617.0 + -32767 -16383.5 + 0 0.0 + 1234 617.0 + 32767 16383.5 --- !query 28 +-- !query SELECT '' AS five, i.f1, i.f1 / int('2') AS x FROM INT2_TBL i --- !query 28 schema -struct --- !query 28 output - -1234 -617 - -32767 -16383 - 0 0 - 1234 617 - 32767 16383 +-- !query schema +struct +-- !query output + -1234 -617.0 + -32767 -16383.5 + 0 0.0 + 1234 617.0 + 32767 16383.5 --- !query 29 +-- !query SELECT string(shiftleft(smallint(-1), 15)) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output -32768 --- !query 30 +-- !query SELECT string(smallint(shiftleft(smallint(-1), 15))+1) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output -32767 --- !query 31 +-- !query SELECT smallint(-32768) % smallint(-1) --- !query 31 schema +-- !query schema struct<(CAST(-32768 AS SMALLINT) % CAST(-1 AS SMALLINT)):smallint> --- !query 31 output +-- !query output 0 --- !query 32 +-- !query SELECT x, smallint(x) AS int2_value FROM (VALUES float(-2.5), float(-1.5), @@ -322,9 +322,9 @@ FROM (VALUES float(-2.5), float(0.5), float(1.5), float(2.5)) t(x) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output -0.5 0 -1.5 -1 -2.5 -2 @@ -334,7 +334,7 @@ struct 2.5 2 --- !query 33 +-- !query SELECT x, smallint(x) AS int2_value FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), @@ -343,21 +343,21 @@ FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(0.5 as decimal(38, 18)), cast(1.5 as decimal(38, 18)), cast(2.5 as decimal(38, 18))) t(x) --- !query 33 schema +-- !query schema struct --- !query 33 output --0.5 0 --1.5 -1 --2.5 -2 -0 0 -0.5 0 -1.5 1 -2.5 2 +-- !query output +-0.500000000000000000 0 +-1.500000000000000000 -1 
+-2.500000000000000000 -2 +0.000000000000000000 0 +0.500000000000000000 0 +1.500000000000000000 1 +2.500000000000000000 2 --- !query 34 +-- !query DROP TABLE INT2_TBL --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/int4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int4.sql.out old mode 100644 new mode 100755 similarity index 55% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/int4.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/int4.sql.out index 879b3c626ec1b..3d80c5d595d53 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/int4.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int4.sql.out @@ -1,68 +1,60 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 54 +-- Number of queries: 53 --- !query 0 +-- !query CREATE TABLE INT4_TBL(f1 int) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 -INSERT INTO INT4_TBL VALUES (trim(' 0 ')) --- !query 1 schema +-- !query +INSERT INTO INT4_TBL VALUES (int(trim(' 0 '))) +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 -INSERT INTO INT4_TBL VALUES (trim('123456 ')) --- !query 2 schema +-- !query +INSERT INTO INT4_TBL VALUES (int(trim('123456 '))) +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 -INSERT INTO INT4_TBL VALUES (trim(' -123456')) --- !query 3 schema +-- !query +INSERT INTO INT4_TBL VALUES (int(trim(' -123456'))) +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 -INSERT INTO INT4_TBL VALUES ('2147483647') --- !query 4 schema +-- !query +INSERT INTO INT4_TBL VALUES (int('2147483647')) +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 -INSERT INTO INT4_TBL VALUES ('-2147483647') --- !query 5 schema +-- !query +INSERT INTO INT4_TBL 
VALUES (int('-2147483647')) +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 -set spark.sql.arithmeticOperations.failOnOverFlow=false --- !query 6 schema -struct --- !query 6 output -spark.sql.arithmeticOperations.failOnOverFlow false - - --- !query 7 +-- !query SELECT '' AS five, * FROM INT4_TBL --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output -123456 -2147483647 0 @@ -70,425 +62,407 @@ struct 2147483647 --- !query 8 +-- !query SELECT '' AS four, i.* FROM INT4_TBL i WHERE i.f1 <> smallint('0') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output -123456 -2147483647 123456 2147483647 --- !query 9 +-- !query SELECT '' AS four, i.* FROM INT4_TBL i WHERE i.f1 <> int('0') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output -123456 -2147483647 123456 2147483647 --- !query 10 +-- !query SELECT '' AS one, i.* FROM INT4_TBL i WHERE i.f1 = smallint('0') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 0 --- !query 11 +-- !query SELECT '' AS one, i.* FROM INT4_TBL i WHERE i.f1 = int('0') --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 0 --- !query 12 +-- !query SELECT '' AS two, i.* FROM INT4_TBL i WHERE i.f1 < smallint('0') --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output -123456 -2147483647 --- !query 13 +-- !query SELECT '' AS two, i.* FROM INT4_TBL i WHERE i.f1 < int('0') --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output -123456 -2147483647 --- !query 14 +-- !query SELECT '' AS three, i.* FROM INT4_TBL i WHERE i.f1 <= smallint('0') --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output -123456 -2147483647 0 --- !query 15 +-- !query SELECT '' AS three, i.* FROM INT4_TBL i WHERE i.f1 <= int('0') --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 
-123456 -2147483647 0 --- !query 16 +-- !query SELECT '' AS two, i.* FROM INT4_TBL i WHERE i.f1 > smallint('0') --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 123456 2147483647 --- !query 17 +-- !query SELECT '' AS two, i.* FROM INT4_TBL i WHERE i.f1 > int('0') --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 123456 2147483647 --- !query 18 +-- !query SELECT '' AS three, i.* FROM INT4_TBL i WHERE i.f1 >= smallint('0') --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 0 123456 2147483647 --- !query 19 +-- !query SELECT '' AS three, i.* FROM INT4_TBL i WHERE i.f1 >= int('0') --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 0 123456 2147483647 --- !query 20 +-- !query SELECT '' AS one, i.* FROM INT4_TBL i WHERE (i.f1 % smallint('2')) = smallint('1') --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 2147483647 --- !query 21 +-- !query SELECT '' AS three, i.* FROM INT4_TBL i WHERE (i.f1 % int('2')) = smallint('0') --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output -123456 0 123456 --- !query 22 +-- !query SELECT '' AS five, i.f1, i.f1 * smallint('2') AS x FROM INT4_TBL i --- !query 22 schema -struct --- !query 22 output - -123456 -246912 - -2147483647 2 - 0 0 - 123456 246912 - 2147483647 -2 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow --- !query 23 +-- !query SELECT '' AS five, i.f1, i.f1 * smallint('2') AS x FROM INT4_TBL i WHERE abs(f1) < 1073741824 --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output -123456 -246912 0 0 123456 246912 --- !query 24 +-- !query SELECT '' AS five, i.f1, i.f1 * int('2') AS x FROM INT4_TBL i --- !query 24 schema -struct --- !query 24 output - -123456 -246912 - -2147483647 2 - 0 0 - 123456 246912 - 2147483647 -2 +-- !query schema +struct<> +-- !query 
output +java.lang.ArithmeticException +integer overflow --- !query 25 +-- !query SELECT '' AS five, i.f1, i.f1 * int('2') AS x FROM INT4_TBL i WHERE abs(f1) < 1073741824 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output -123456 -246912 0 0 123456 246912 --- !query 26 +-- !query SELECT '' AS five, i.f1, i.f1 + smallint('2') AS x FROM INT4_TBL i --- !query 26 schema -struct --- !query 26 output - -123456 -123454 - -2147483647 -2147483645 - 0 2 - 123456 123458 - 2147483647 -2147483647 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow --- !query 27 +-- !query SELECT '' AS five, i.f1, i.f1 + smallint('2') AS x FROM INT4_TBL i WHERE f1 < 2147483646 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output -123456 -123454 -2147483647 -2147483645 0 2 123456 123458 --- !query 28 +-- !query SELECT '' AS five, i.f1, i.f1 + int('2') AS x FROM INT4_TBL i --- !query 28 schema -struct --- !query 28 output - -123456 -123454 - -2147483647 -2147483645 - 0 2 - 123456 123458 - 2147483647 -2147483647 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow --- !query 29 +-- !query SELECT '' AS five, i.f1, i.f1 + int('2') AS x FROM INT4_TBL i WHERE f1 < 2147483646 --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output -123456 -123454 -2147483647 -2147483645 0 2 123456 123458 --- !query 30 +-- !query SELECT '' AS five, i.f1, i.f1 - smallint('2') AS x FROM INT4_TBL i --- !query 30 schema -struct --- !query 30 output - -123456 -123458 - -2147483647 2147483647 - 0 -2 - 123456 123454 - 2147483647 2147483645 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow --- !query 31 +-- !query SELECT '' AS five, i.f1, i.f1 - smallint('2') AS x FROM INT4_TBL i WHERE f1 > -2147483647 --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output -123456 -123458 0 -2 
123456 123454 2147483647 2147483645 --- !query 32 +-- !query SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i --- !query 32 schema -struct --- !query 32 output - -123456 -123458 - -2147483647 2147483647 - 0 -2 - 123456 123454 - 2147483647 2147483645 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +integer overflow --- !query 33 +-- !query SELECT '' AS five, i.f1, i.f1 - int('2') AS x FROM INT4_TBL i WHERE f1 > -2147483647 --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output -123456 -123458 0 -2 123456 123454 2147483647 2147483645 --- !query 34 +-- !query SELECT '' AS five, i.f1, i.f1 / smallint('2') AS x FROM INT4_TBL i --- !query 34 schema -struct --- !query 34 output - -123456 -61728 - -2147483647 -1073741823 - 0 0 - 123456 61728 - 2147483647 1073741823 +-- !query schema +struct +-- !query output + -123456 -61728.0 + -2147483647 -1.0737418235E9 + 0 0.0 + 123456 61728.0 + 2147483647 1.0737418235E9 --- !query 35 +-- !query SELECT '' AS five, i.f1, i.f1 / int('2') AS x FROM INT4_TBL i --- !query 35 schema -struct --- !query 35 output - -123456 -61728 - -2147483647 -1073741823 - 0 0 - 123456 61728 - 2147483647 1073741823 +-- !query schema +struct +-- !query output + -123456 -61728.0 + -2147483647 -1.0737418235E9 + 0 0.0 + 123456 61728.0 + 2147483647 1.0737418235E9 --- !query 36 +-- !query SELECT -2+3 AS one --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 1 --- !query 37 +-- !query SELECT 4-2 AS two --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 2 --- !query 38 +-- !query SELECT 2- -1 AS three --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 3 --- !query 39 +-- !query SELECT 2 - -2 AS four --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 4 --- !query 40 +-- !query SELECT smallint('2') * smallint('2') = smallint('16') / smallint('4') AS true --- !query 
40 schema +-- !query schema struct --- !query 40 output +-- !query output true --- !query 41 +-- !query SELECT int('2') * smallint('2') = smallint('16') / int('4') AS true --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output true --- !query 42 +-- !query SELECT smallint('2') * int('2') = int('16') / smallint('4') AS true --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output true --- !query 43 +-- !query SELECT int('1000') < int('999') AS `false` --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output false --- !query 44 +-- !query SELECT 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 AS ten --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 10 --- !query 45 +-- !query SELECT 2 + 2 / 2 AS three --- !query 45 schema -struct --- !query 45 output -3 +-- !query schema +struct +-- !query output +3.0 --- !query 46 +-- !query SELECT (2 + 2) / 2 AS two --- !query 46 schema -struct --- !query 46 output -2 +-- !query schema +struct +-- !query output +2.0 --- !query 47 +-- !query SELECT string(shiftleft(int(-1), 31)) --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output -2147483648 --- !query 48 +-- !query SELECT string(int(shiftleft(int(-1), 31))+1) --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output -2147483647 --- !query 49 +-- !query SELECT int(-2147483648) % int(-1) --- !query 49 schema +-- !query schema struct<(CAST(-2147483648 AS INT) % CAST(-1 AS INT)):int> --- !query 49 output +-- !query output 0 --- !query 50 +-- !query SELECT int(-2147483648) % smallint(-1) --- !query 50 schema +-- !query schema struct<(CAST(-2147483648 AS INT) % CAST(CAST(-1 AS SMALLINT) AS INT)):int> --- !query 50 output +-- !query output 0 --- !query 51 +-- !query SELECT x, int(x) AS int4_value FROM (VALUES double(-2.5), double(-1.5), @@ -497,9 +471,9 @@ FROM (VALUES double(-2.5), double(0.5), double(1.5), double(2.5)) 
t(x) --- !query 51 schema +-- !query schema struct --- !query 51 output +-- !query output -0.5 0 -1.5 -1 -2.5 -2 @@ -509,7 +483,7 @@ struct 2.5 2 --- !query 52 +-- !query SELECT x, int(x) AS int4_value FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), @@ -518,21 +492,21 @@ FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(0.5 as decimal(38, 18)), cast(1.5 as decimal(38, 18)), cast(2.5 as decimal(38, 18))) t(x) --- !query 52 schema +-- !query schema struct --- !query 52 output --0.5 0 --1.5 -1 --2.5 -2 -0 0 -0.5 0 -1.5 1 -2.5 2 +-- !query output +-0.500000000000000000 0 +-1.500000000000000000 -1 +-2.500000000000000000 -2 +0.000000000000000000 0 +0.500000000000000000 0 +1.500000000000000000 1 +2.500000000000000000 2 --- !query 53 +-- !query DROP TABLE INT4_TBL --- !query 53 schema +-- !query schema struct<> --- !query 53 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/int8.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int8.sql.out old mode 100644 new mode 100755 similarity index 59% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/int8.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/int8.sql.out index fc9f1474eb26c..18b0c821ae70f --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/int8.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/int8.sql.out @@ -2,59 +2,59 @@ -- Number of queries: 85 --- !query 0 +-- !query CREATE TABLE INT8_TBL(q1 bigint, q2 bigint) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 -INSERT INTO INT8_TBL VALUES(trim(' 123 '),trim(' 456')) --- !query 1 schema +-- !query +INSERT INTO INT8_TBL VALUES(bigint(trim(' 123 ')),bigint(trim(' 456'))) +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 -INSERT INTO INT8_TBL VALUES(trim('123 '),'4567890123456789') --- !query 2 schema +-- !query +INSERT INTO INT8_TBL 
VALUES(bigint(trim('123 ')),bigint('4567890123456789')) +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 -INSERT INTO INT8_TBL VALUES('4567890123456789','123') --- !query 3 schema +-- !query +INSERT INTO INT8_TBL VALUES(bigint('4567890123456789'),bigint('123')) +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 -INSERT INTO INT8_TBL VALUES(+4567890123456789,'4567890123456789') --- !query 4 schema +-- !query +INSERT INTO INT8_TBL VALUES(+4567890123456789,bigint('4567890123456789')) +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 -INSERT INTO INT8_TBL VALUES('+4567890123456789','-4567890123456789') --- !query 5 schema +-- !query +INSERT INTO INT8_TBL VALUES(bigint('+4567890123456789'),bigint('-4567890123456789')) +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT * FROM INT8_TBL --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 123 456 123 4567890123456789 4567890123456789 -4567890123456789 @@ -62,48 +62,48 @@ struct 4567890123456789 4567890123456789 --- !query 7 +-- !query SELECT * FROM INT8_TBL WHERE q2 = 4567890123456789 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 123 4567890123456789 4567890123456789 4567890123456789 --- !query 8 +-- !query SELECT * FROM INT8_TBL WHERE q2 <> 4567890123456789 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 123 456 4567890123456789 -4567890123456789 4567890123456789 123 --- !query 9 +-- !query SELECT * FROM INT8_TBL WHERE q2 < 4567890123456789 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 123 456 4567890123456789 -4567890123456789 4567890123456789 123 --- !query 10 +-- !query SELECT * FROM INT8_TBL WHERE q2 > 4567890123456789 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT * FROM INT8_TBL WHERE 
q2 <= 4567890123456789 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 123 456 123 4567890123456789 4567890123456789 -4567890123456789 @@ -111,114 +111,114 @@ struct 4567890123456789 4567890123456789 --- !query 12 +-- !query SELECT * FROM INT8_TBL WHERE q2 >= 4567890123456789 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 123 4567890123456789 4567890123456789 4567890123456789 --- !query 13 +-- !query SELECT * FROM INT8_TBL WHERE q2 = 456 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 123 456 --- !query 14 +-- !query SELECT * FROM INT8_TBL WHERE q2 <> 456 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 123 4567890123456789 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 15 +-- !query SELECT * FROM INT8_TBL WHERE q2 < 456 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 --- !query 16 +-- !query SELECT * FROM INT8_TBL WHERE q2 > 456 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 123 4567890123456789 4567890123456789 4567890123456789 --- !query 17 +-- !query SELECT * FROM INT8_TBL WHERE q2 <= 456 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 123 456 4567890123456789 -4567890123456789 4567890123456789 123 --- !query 18 +-- !query SELECT * FROM INT8_TBL WHERE q2 >= 456 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 123 456 123 4567890123456789 4567890123456789 4567890123456789 --- !query 19 +-- !query SELECT * FROM INT8_TBL WHERE 123 = q1 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 123 456 123 4567890123456789 --- !query 20 +-- !query SELECT * FROM INT8_TBL WHERE 123 <> q1 --- !query 20 schema +-- !query schema struct --- !query 
20 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 21 +-- !query SELECT * FROM INT8_TBL WHERE 123 < q1 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 22 +-- !query SELECT * FROM INT8_TBL WHERE 123 > q1 --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output --- !query 23 +-- !query SELECT * FROM INT8_TBL WHERE 123 <= q1 --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 123 456 123 4567890123456789 4567890123456789 -4567890123456789 @@ -226,114 +226,114 @@ struct 4567890123456789 4567890123456789 --- !query 24 +-- !query SELECT * FROM INT8_TBL WHERE 123 >= q1 --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 123 456 123 4567890123456789 --- !query 25 +-- !query SELECT * FROM INT8_TBL WHERE q2 = smallint('456') --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 123 456 --- !query 26 +-- !query SELECT * FROM INT8_TBL WHERE q2 <> smallint('456') --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 123 4567890123456789 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 27 +-- !query SELECT * FROM INT8_TBL WHERE q2 < smallint('456') --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 --- !query 28 +-- !query SELECT * FROM INT8_TBL WHERE q2 > smallint('456') --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 123 4567890123456789 4567890123456789 4567890123456789 --- !query 29 +-- !query SELECT * FROM INT8_TBL WHERE q2 <= smallint('456') --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 123 456 
4567890123456789 -4567890123456789 4567890123456789 123 --- !query 30 +-- !query SELECT * FROM INT8_TBL WHERE q2 >= smallint('456') --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 123 456 123 4567890123456789 4567890123456789 4567890123456789 --- !query 31 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') = q1 --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 123 456 123 4567890123456789 --- !query 32 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') <> q1 --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 33 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') < q1 --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 4567890123456789 -4567890123456789 4567890123456789 123 4567890123456789 4567890123456789 --- !query 34 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') > q1 --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output --- !query 35 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') <= q1 --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 123 456 123 4567890123456789 4567890123456789 -4567890123456789 @@ -341,20 +341,20 @@ struct 4567890123456789 4567890123456789 --- !query 36 +-- !query SELECT * FROM INT8_TBL WHERE smallint('123') >= q1 --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 123 456 123 4567890123456789 --- !query 37 +-- !query SELECT '' AS five, q1 AS plus, -q1 AS `minus` FROM INT8_TBL --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 123 -123 123 -123 4567890123456789 -4567890123456789 @@ -362,11 +362,11 @@ struct 4567890123456789 -4567890123456789 --- !query 38 +-- !query SELECT '' AS five, q1, q2, q1 + q2 AS plus FROM INT8_TBL --- !query 38 schema 
+-- !query schema struct --- !query 38 output +-- !query output 123 456 579 123 4567890123456789 4567890123456912 4567890123456789 -4567890123456789 0 @@ -374,11 +374,11 @@ struct 4567890123456789 4567890123456789 9135780246913578 --- !query 39 +-- !query SELECT '' AS five, q1, q2, q1 - q2 AS `minus` FROM INT8_TBL --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 123 456 -333 123 4567890123456789 -4567890123456666 4567890123456789 -4567890123456789 9135780246913578 @@ -386,46 +386,43 @@ struct 4567890123456789 4567890123456789 0 --- !query 40 +-- !query SELECT '' AS three, q1, q2, q1 * q2 AS multiply FROM INT8_TBL --- !query 40 schema -struct --- !query 40 output - 123 456 56088 - 123 4567890123456789 561850485185185047 - 4567890123456789 -4567890123456789 -4868582358072306617 - 4567890123456789 123 561850485185185047 - 4567890123456789 4567890123456789 4868582358072306617 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow --- !query 41 +-- !query SELECT '' AS three, q1, q2, q1 * q2 AS multiply FROM INT8_TBL WHERE q1 < 1000 or (q2 > 0 and q2 < 1000) --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 123 456 56088 123 4567890123456789 561850485185185047 4567890123456789 123 561850485185185047 --- !query 42 +-- !query SELECT '' AS five, q1, q2, q1 / q2 AS divide, q1 % q2 AS mod FROM INT8_TBL --- !query 42 schema -struct --- !query 42 output - 123 456 0 123 - 123 4567890123456789 0 123 - 4567890123456789 -4567890123456789 -1 0 - 4567890123456789 123 37137318076884 57 - 4567890123456789 4567890123456789 1 0 +-- !query schema +struct +-- !query output + 123 456 0.26973684210526316 123 + 123 4567890123456789 2.6927092525360204E-14 123 + 4567890123456789 -4567890123456789 -1.0 0 + 4567890123456789 123 3.713731807688446E13 57 + 4567890123456789 4567890123456789 1.0 0 --- !query 43 +-- !query SELECT '' AS five, q1, double(q1) FROM INT8_TBL --- !query 43 schema +-- 
!query schema struct --- !query 43 output +-- !query output 123 123.0 123 123.0 4567890123456789 4.567890123456789E15 @@ -433,11 +430,11 @@ struct 4567890123456789 4.567890123456789E15 --- !query 44 +-- !query SELECT '' AS five, q2, double(q2) FROM INT8_TBL --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output -4567890123456789 -4.567890123456789E15 123 123.0 456 456.0 @@ -445,11 +442,11 @@ struct 4567890123456789 4.567890123456789E15 --- !query 45 +-- !query SELECT 37 + q1 AS plus4 FROM INT8_TBL --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 160 160 4567890123456826 @@ -457,11 +454,11 @@ struct 4567890123456826 --- !query 46 +-- !query SELECT 37 - q1 AS minus4 FROM INT8_TBL --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output -4567890123456752 -4567890123456752 -4567890123456752 @@ -469,11 +466,11 @@ struct -86 --- !query 47 +-- !query SELECT '' AS five, 2 * q1 AS `twice int4` FROM INT8_TBL --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 246 246 9135780246913578 @@ -481,11 +478,11 @@ struct 9135780246913578 --- !query 48 +-- !query SELECT '' AS five, q1 * 2 AS `twice int4` FROM INT8_TBL --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output 246 246 9135780246913578 @@ -493,59 +490,59 @@ struct 9135780246913578 --- !query 49 +-- !query SELECT q1 + int(42) AS `8plus4`, q1 - int(42) AS `8minus4`, q1 * int(42) AS `8mul4`, q1 / int(42) AS `8div4` FROM INT8_TBL --- !query 49 schema -struct<8plus4:bigint,8minus4:bigint,8mul4:bigint,8div4:bigint> --- !query 49 output -165 81 5166 2 -165 81 5166 2 -4567890123456831 4567890123456747 191851385185185138 108759288653733 -4567890123456831 4567890123456747 191851385185185138 108759288653733 -4567890123456831 4567890123456747 191851385185185138 108759288653733 +-- !query schema +struct<8plus4:bigint,8minus4:bigint,8mul4:bigint,8div4:double> +-- !query output +165 
81 5166 2.9285714285714284 +165 81 5166 2.9285714285714284 +4567890123456831 4567890123456747 191851385185185138 1.0875928865373308E14 +4567890123456831 4567890123456747 191851385185185138 1.0875928865373308E14 +4567890123456831 4567890123456747 191851385185185138 1.0875928865373308E14 --- !query 50 +-- !query SELECT int(246) + q1 AS `4plus8`, int(246) - q1 AS `4minus8`, int(246) * q1 AS `4mul8`, int(246) / q1 AS `4div8` FROM INT8_TBL --- !query 50 schema -struct<4plus8:bigint,4minus8:bigint,4mul8:bigint,4div8:bigint> --- !query 50 output -369 123 30258 2 -369 123 30258 2 -4567890123457035 -4567890123456543 1123700970370370094 0 -4567890123457035 -4567890123456543 1123700970370370094 0 -4567890123457035 -4567890123456543 1123700970370370094 0 +-- !query schema +struct<4plus8:bigint,4minus8:bigint,4mul8:bigint,4div8:double> +-- !query output +369 123 30258 2.0 +369 123 30258 2.0 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 --- !query 51 +-- !query SELECT q1 + smallint(42) AS `8plus2`, q1 - smallint(42) AS `8minus2`, q1 * smallint(42) AS `8mul2`, q1 / smallint(42) AS `8div2` FROM INT8_TBL --- !query 51 schema -struct<8plus2:bigint,8minus2:bigint,8mul2:bigint,8div2:bigint> --- !query 51 output -165 81 5166 2 -165 81 5166 2 -4567890123456831 4567890123456747 191851385185185138 108759288653733 -4567890123456831 4567890123456747 191851385185185138 108759288653733 -4567890123456831 4567890123456747 191851385185185138 108759288653733 +-- !query schema +struct<8plus2:bigint,8minus2:bigint,8mul2:bigint,8div2:double> +-- !query output +165 81 5166 2.9285714285714284 +165 81 5166 2.9285714285714284 +4567890123456831 4567890123456747 191851385185185138 1.0875928865373308E14 +4567890123456831 4567890123456747 191851385185185138 1.0875928865373308E14 +4567890123456831 4567890123456747 
191851385185185138 1.0875928865373308E14 --- !query 52 +-- !query SELECT smallint(246) + q1 AS `2plus8`, smallint(246) - q1 AS `2minus8`, smallint(246) * q1 AS `2mul8`, smallint(246) / q1 AS `2div8` FROM INT8_TBL --- !query 52 schema -struct<2plus8:bigint,2minus8:bigint,2mul8:bigint,2div8:bigint> --- !query 52 output -369 123 30258 2 -369 123 30258 2 -4567890123457035 -4567890123456543 1123700970370370094 0 -4567890123457035 -4567890123456543 1123700970370370094 0 -4567890123457035 -4567890123456543 1123700970370370094 0 +-- !query schema +struct<2plus8:bigint,2minus8:bigint,2mul8:bigint,2div8:double> +-- !query output +369 123 30258 2.0 +369 123 30258 2.0 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 +4567890123457035 -4567890123456543 1123700970370370094 5.385418505072041E-14 --- !query 53 +-- !query SELECT q2, abs(q2) FROM INT8_TBL --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output -4567890123456789 4567890123456789 123 123 456 456 @@ -553,97 +550,93 @@ struct 4567890123456789 4567890123456789 --- !query 54 +-- !query SELECT min(q1), min(q2) FROM INT8_TBL --- !query 54 schema +-- !query schema struct --- !query 54 output +-- !query output 123 -4567890123456789 --- !query 55 +-- !query SELECT max(q1), max(q2) FROM INT8_TBL --- !query 55 schema +-- !query schema struct --- !query 55 output +-- !query output 4567890123456789 4567890123456789 --- !query 56 +-- !query select bigint('9223372036854775800') / bigint('0') --- !query 56 schema -struct<(CAST(9223372036854775800 AS BIGINT) div CAST(0 AS BIGINT)):bigint> --- !query 56 output +-- !query schema +struct<(CAST(CAST(9223372036854775800 AS BIGINT) AS DOUBLE) / CAST(CAST(0 AS BIGINT) AS DOUBLE)):double> +-- !query output NULL --- !query 57 +-- !query select bigint('-9223372036854775808') / smallint('0') --- !query 57 schema -struct<(CAST(-9223372036854775808 AS BIGINT) div 
CAST(CAST(0 AS SMALLINT) AS BIGINT)):bigint> --- !query 57 output +-- !query schema +struct<(CAST(CAST(-9223372036854775808 AS BIGINT) AS DOUBLE) / CAST(CAST(0 AS SMALLINT) AS DOUBLE)):double> +-- !query output NULL --- !query 58 +-- !query select smallint('100') / bigint('0') --- !query 58 schema -struct<(CAST(CAST(100 AS SMALLINT) AS BIGINT) div CAST(0 AS BIGINT)):bigint> --- !query 58 output +-- !query schema +struct<(CAST(CAST(100 AS SMALLINT) AS DOUBLE) / CAST(CAST(0 AS BIGINT) AS DOUBLE)):double> +-- !query output NULL --- !query 59 +-- !query SELECT CAST(q1 AS int) FROM int8_tbl WHERE q2 = 456 --- !query 59 schema +-- !query schema struct --- !query 59 output +-- !query output 123 --- !query 60 +-- !query SELECT CAST(q1 AS int) FROM int8_tbl WHERE q2 <> 456 --- !query 60 schema -struct --- !query 60 output --869367531 --869367531 --869367531 -123 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting 4567890123456789 to int causes overflow --- !query 61 +-- !query SELECT CAST(q1 AS smallint) FROM int8_tbl WHERE q2 = 456 --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output 123 --- !query 62 +-- !query SELECT CAST(q1 AS smallint) FROM int8_tbl WHERE q2 <> 456 --- !query 62 schema -struct --- !query 62 output --32491 --32491 --32491 -123 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting 4567890123456789 to short causes overflow --- !query 63 +-- !query SELECT CAST(smallint('42') AS bigint), CAST(smallint('-37') AS bigint) --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output 42 -37 --- !query 64 +-- !query SELECT CAST(q1 AS float), CAST(q2 AS double) FROM INT8_TBL --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output 123.0 4.567890123456789E15 123.0 456.0 4.5678899E15 -4.567890123456789E15 @@ -651,27 +644,28 @@ struct 4.5678899E15 4.567890123456789E15 --- !query 65 +-- !query SELECT 
CAST(float('36854775807.0') AS bigint) --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output 36854775808 --- !query 66 +-- !query SELECT CAST(double('922337203685477580700.0') AS bigint) --- !query 66 schema -struct --- !query 66 output -9223372036854775807 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting 9.223372036854776E20 to long causes overflow --- !query 67 +-- !query SELECT q1, q2, q1 & q2 AS `and`, q1 | q2 AS `or`, ~q1 AS `not` FROM INT8_TBL --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output 123 456 72 507 -124 123 4567890123456789 17 4567890123456895 -124 4567890123456789 -4567890123456789 1 -1 -4567890123456790 @@ -679,11 +673,11 @@ struct 4567890123456789 4567890123456789 4567890123456789 4567890123456789 -4567890123456790 --- !query 68 +-- !query SELECT * FROM range(bigint('+4567890123456789'), bigint('+4567890123456799')) --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output 4567890123456789 4567890123456790 4567890123456791 @@ -696,20 +690,20 @@ struct 4567890123456798 --- !query 69 +-- !query SELECT * FROM range(bigint('+4567890123456789'), bigint('+4567890123456799'), 0) --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output java.lang.IllegalArgumentException requirement failed: step (0) cannot be 0 --- !query 70 +-- !query SELECT * FROM range(bigint('+4567890123456789'), bigint('+4567890123456799'), 2) --- !query 70 schema +-- !query schema struct --- !query 70 output +-- !query output 4567890123456789 4567890123456791 4567890123456793 @@ -717,95 +711,99 @@ struct 4567890123456797 --- !query 71 +-- !query SELECT string(shiftleft(bigint(-1), 63)) --- !query 71 schema +-- !query schema struct --- !query 71 output +-- !query output -9223372036854775808 --- !query 72 +-- !query SELECT string(int(shiftleft(bigint(-1), 63))+1) --- !query 72 schema -struct --- !query 72 output -1 +-- !query 
schema +struct<> +-- !query output +java.lang.ArithmeticException +Casting -9223372036854775808 to int causes overflow --- !query 73 +-- !query SELECT bigint((-9223372036854775808)) * bigint((-1)) --- !query 73 schema -struct<(CAST(-9223372036854775808 AS BIGINT) * CAST(-1 AS BIGINT)):bigint> --- !query 73 output --9223372036854775808 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow --- !query 74 +-- !query SELECT bigint((-9223372036854775808)) / bigint((-1)) --- !query 74 schema -struct<(CAST(-9223372036854775808 AS BIGINT) div CAST(-1 AS BIGINT)):bigint> --- !query 74 output --9223372036854775808 +-- !query schema +struct<(CAST(CAST(-9223372036854775808 AS BIGINT) AS DOUBLE) / CAST(CAST(-1 AS BIGINT) AS DOUBLE)):double> +-- !query output +9.223372036854776E18 --- !query 75 +-- !query SELECT bigint((-9223372036854775808)) % bigint((-1)) --- !query 75 schema +-- !query schema struct<(CAST(-9223372036854775808 AS BIGINT) % CAST(-1 AS BIGINT)):bigint> --- !query 75 output +-- !query output 0 --- !query 76 +-- !query SELECT bigint((-9223372036854775808)) * int((-1)) --- !query 76 schema -struct<(CAST(-9223372036854775808 AS BIGINT) * CAST(CAST(-1 AS INT) AS BIGINT)):bigint> --- !query 76 output --9223372036854775808 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow --- !query 77 +-- !query SELECT bigint((-9223372036854775808)) / int((-1)) --- !query 77 schema -struct<(CAST(-9223372036854775808 AS BIGINT) div CAST(CAST(-1 AS INT) AS BIGINT)):bigint> --- !query 77 output --9223372036854775808 +-- !query schema +struct<(CAST(CAST(-9223372036854775808 AS BIGINT) AS DOUBLE) / CAST(CAST(-1 AS INT) AS DOUBLE)):double> +-- !query output +9.223372036854776E18 --- !query 78 +-- !query SELECT bigint((-9223372036854775808)) % int((-1)) --- !query 78 schema +-- !query schema struct<(CAST(-9223372036854775808 AS BIGINT) % CAST(CAST(-1 AS INT) AS BIGINT)):bigint> --- !query 78 output +-- !query 
output 0 --- !query 79 +-- !query SELECT bigint((-9223372036854775808)) * smallint((-1)) --- !query 79 schema -struct<(CAST(-9223372036854775808 AS BIGINT) * CAST(CAST(-1 AS SMALLINT) AS BIGINT)):bigint> --- !query 79 output --9223372036854775808 +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow --- !query 80 +-- !query SELECT bigint((-9223372036854775808)) / smallint((-1)) --- !query 80 schema -struct<(CAST(-9223372036854775808 AS BIGINT) div CAST(CAST(-1 AS SMALLINT) AS BIGINT)):bigint> --- !query 80 output --9223372036854775808 +-- !query schema +struct<(CAST(CAST(-9223372036854775808 AS BIGINT) AS DOUBLE) / CAST(CAST(-1 AS SMALLINT) AS DOUBLE)):double> +-- !query output +9.223372036854776E18 --- !query 81 +-- !query SELECT bigint((-9223372036854775808)) % smallint((-1)) --- !query 81 schema +-- !query schema struct<(CAST(-9223372036854775808 AS BIGINT) % CAST(CAST(-1 AS SMALLINT) AS BIGINT)):bigint> --- !query 81 output +-- !query output 0 --- !query 82 +-- !query SELECT x, bigint(x) AS int8_value FROM (VALUES (double(-2.5)), (double(-1.5)), @@ -814,9 +812,9 @@ FROM (VALUES (double(-2.5)), (double(0.5)), (double(1.5)), (double(2.5))) t(x) --- !query 82 schema +-- !query schema struct --- !query 82 output +-- !query output -0.5 0 -1.5 -1 -2.5 -2 @@ -826,7 +824,7 @@ struct 2.5 2 --- !query 83 +-- !query SELECT x, bigint(x) AS int8_value FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), @@ -835,21 +833,21 @@ FROM (VALUES cast(-2.5 as decimal(38, 18)), cast(0.5 as decimal(38, 18)), cast(1.5 as decimal(38, 18)), cast(2.5 as decimal(38, 18))) t(x) --- !query 83 schema +-- !query schema struct --- !query 83 output --0.5 0 --1.5 -1 --2.5 -2 -0 0 -0.5 0 -1.5 1 -2.5 2 +-- !query output +-0.500000000000000000 0 +-1.500000000000000000 -1 +-2.500000000000000000 -2 +0.000000000000000000 0 +0.500000000000000000 0 +1.500000000000000000 1 +2.500000000000000000 2 --- !query 84 +-- !query DROP TABLE INT8_TBL --- 
!query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out new file mode 100644 index 0000000000000..62d47410aab65 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/interval.sql.out @@ -0,0 +1,254 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 24 + + +-- !query +SELECT interval '999' second +-- !query schema +struct +-- !query output +16 minutes 39 seconds + + +-- !query +SELECT interval '999' minute +-- !query schema +struct +-- !query output +16 hours 39 minutes + + +-- !query +SELECT interval '999' hour +-- !query schema +struct +-- !query output +999 hours + + +-- !query +SELECT interval '999' day +-- !query schema +struct +-- !query output +999 days + + +-- !query +SELECT interval '999' month +-- !query schema +struct +-- !query output +83 years 3 months + + +-- !query +SELECT interval '1' year +-- !query schema +struct +-- !query output +1 years + + +-- !query +SELECT interval '2' month +-- !query schema +struct +-- !query output +2 months + + +-- !query +SELECT interval '3' day +-- !query schema +struct +-- !query output +3 days + + +-- !query +SELECT interval '4' hour +-- !query schema +struct +-- !query output +4 hours + + +-- !query +SELECT interval '5' minute +-- !query schema +struct +-- !query output +5 minutes + + +-- !query +SELECT interval '6' second +-- !query schema +struct +-- !query output +6 seconds + + +-- !query +SELECT interval '1-2' year to month +-- !query schema +struct +-- !query output +1 years 2 months + + +-- !query +SELECT interval '1 2:03' day to hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03, set 
spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03' day to hour +----------------^^^ + + +-- !query +SELECT interval '1 2:03:04' day to hour +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03:04' day to hour +----------------^^^ + + +-- !query +SELECT interval '1 2:03' day to minute +-- !query schema +struct +-- !query output +1 days 2 hours 3 minutes + + +-- !query +SELECT interval '1 2:03:04' day to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03:04' day to minute +----------------^^^ + + +-- !query +SELECT interval '1 2:03' day to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d+) (?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03' day to second +----------------^^^ + + +-- !query +SELECT interval '1 2:03:04' day to second +-- !query schema +struct +-- !query output +1 days 2 hours 3 minutes 4 seconds + + +-- !query +SELECT interval '1 2:03' hour to minute +-- !query schema +struct<> +-- !query output 
+org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03' hour to minute +----------------^^^ + + +-- !query +SELECT interval '1 2:03:04' hour to minute +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2})$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03:04' hour to minute +----------------^^^ + + +-- !query +SELECT interval '1 2:03' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03' hour to second +----------------^^^ + + +-- !query +SELECT interval '1 2:03:04' hour to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03:04' hour to second +----------------^^^ + + +-- !query +SELECT interval '1 2:03' minute to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must 
match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03' minute to second +----------------^^^ + + +-- !query +SELECT interval '1 2:03:04' minute to second +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +requirement failed: Interval string must match day-time format of '^(?[+|-])?(?\d{1,2}):(?(\d{1,2})(\.(\d{1,9}))?)$': 1 2:03:04, set spark.sql.legacy.fromDayTimeString.enabled to true to restore the behavior before Spark 3.0.(line 1, pos 16) + +== SQL == +SELECT interval '1 2:03:04' minute to second +----------------^^^ diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/join.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out similarity index 84% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/join.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out index f75fe0519645b..5332dfff9f101 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/join.sql.out @@ -1,18 +1,18 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 185 +-- Number of queries: 181 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) AS v(f1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (VALUES (123, 456), @@ -21,230 +21,230 @@ CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) AS v(q1, q2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- 
!query 2 +-- !query CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM (VALUES (0.0), (1004.30), (-34.84), (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double))) AS v(f1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM (VALUES ('doh!'), ('hi de ho neighbor')) AS v(f1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TABLE J1_TBL ( i integer, j integer, t string ) USING parquet --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE TABLE J2_TBL ( i integer, k integer ) USING parquet --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO J1_TBL VALUES (1, 4, 'one') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO J1_TBL VALUES (2, 3, 'two') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO J1_TBL VALUES (3, 2, 'three') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO J1_TBL VALUES (4, 1, 'four') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query INSERT INTO J1_TBL VALUES (5, 0, 'five') --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query INSERT INTO J1_TBL VALUES (6, 6, 'six') --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query INSERT INTO J1_TBL VALUES (7, 7, 
'seven') --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query INSERT INTO J1_TBL VALUES (8, 8, 'eight') --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query INSERT INTO J1_TBL VALUES (0, NULL, 'zero') --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query INSERT INTO J1_TBL VALUES (NULL, NULL, 'null') --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output --- !query 17 +-- !query INSERT INTO J1_TBL VALUES (NULL, 0, 'zero') --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query INSERT INTO J2_TBL VALUES (1, -1) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query INSERT INTO J2_TBL VALUES (2, 2) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query INSERT INTO J2_TBL VALUES (3, -3) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query INSERT INTO J2_TBL VALUES (2, 4) --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query INSERT INTO J2_TBL VALUES (5, -5) --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output --- !query 23 +-- !query INSERT INTO J2_TBL VALUES (5, -5) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query INSERT INTO J2_TBL VALUES (0, NULL) --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 +-- !query INSERT INTO J2_TBL VALUES (NULL, NULL) --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query INSERT INTO J2_TBL VALUES (NULL, 0) --- !query 26 schema +-- !query schema 
struct<> --- !query 26 output +-- !query output --- !query 27 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL AS tx --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -258,12 +258,12 @@ struct NULL NULL null --- !query 28 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL tx --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -277,12 +277,12 @@ struct NULL NULL null --- !query 29 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL AS t1 (a, b, c) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -296,12 +296,12 @@ struct NULL NULL null --- !query 30 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL t1 (a, b, c) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -315,12 +315,12 @@ struct NULL NULL null --- !query 31 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL t1 (a, b, c), J2_TBL t2 (d, e) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 0 NULL zero 0 NULL 0 NULL zero 1 -1 0 NULL zero 2 2 @@ -422,12 +422,12 @@ struct NULL NULL null NULL NULL --- !query 32 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL CROSS JOIN J2_TBL --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 0 NULL zero 0 NULL 0 NULL zero 1 -1 0 NULL zero 2 2 @@ -529,22 +529,22 @@ struct NULL NULL null NULL NULL --- !query 33 +-- !query SELECT '' AS `xxx`, i, k, t FROM J1_TBL CROSS JOIN J2_TBL --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i' is ambiguous, could be: default.j1_tbl.i, default.j2_tbl.i.; line 1 pos 20 --- !query 34 +-- !query SELECT '' AS `xxx`, t1.i, k, t FROM J1_TBL t1 CROSS JOIN J2_TBL t2 --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 0 -1 zero 0 -3 zero 0 -5 
zero @@ -646,13 +646,13 @@ struct NULL NULL zero --- !query 35 +-- !query SELECT '' AS `xxx`, ii, tt, kk FROM (J1_TBL CROSS JOIN J2_TBL) AS tx (ii, jj, tt, ii2, kk) --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 0 zero -1 0 zero -3 0 zero -5 @@ -754,12 +754,12 @@ struct NULL zero NULL --- !query 36 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL CROSS JOIN J2_TBL a CROSS JOIN J2_TBL b --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 0 NULL zero 0 NULL 0 NULL 0 NULL zero 0 NULL 1 -1 0 NULL zero 0 NULL 2 2 @@ -1653,12 +1653,12 @@ struct NULL NULL null NULL NULL NULL NULL --- !query 37 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL INNER JOIN J2_TBL USING (i) --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1668,12 +1668,12 @@ struct 5 0 five -5 --- !query 38 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL JOIN J2_TBL USING (i) --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1683,13 +1683,13 @@ struct 5 0 five -5 --- !query 39 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL t1 (a, b, c) JOIN J2_TBL t2 (a, d) USING (a) ORDER BY a, d --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1699,12 +1699,12 @@ struct 5 0 five -5 --- !query 40 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL NATURAL JOIN J2_TBL --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1714,12 +1714,12 @@ struct 5 0 five -5 --- !query 41 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (a, d) --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1729,23 +1729,23 @@ struct 5 0 five -5 --- !query 42 +-- !query SELECT '' AS `xxx`, * FROM 
J1_TBL t1 (a, b, c) NATURAL JOIN J2_TBL t2 (d, a) --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output 0 NULL zero NULL 2 3 two 2 4 1 four 2 --- !query 43 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.i) --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 0 NULL zero 0 NULL 1 4 one 1 -1 2 3 two 2 2 @@ -1755,23 +1755,23 @@ struct 5 0 five 5 -5 --- !query 44 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i = J2_TBL.k) --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 0 NULL zero NULL 0 2 3 two 2 2 4 1 four 2 4 --- !query 45 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL JOIN J2_TBL ON (J1_TBL.i <= J2_TBL.k) --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 0 NULL zero 2 2 0 NULL zero 2 4 0 NULL zero NULL 0 @@ -1783,13 +1783,13 @@ struct 4 1 four 2 4 --- !query 46 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL LEFT OUTER JOIN J2_TBL USING (i) ORDER BY i, k, t --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output NULL NULL null NULL NULL 0 zero NULL 0 NULL zero NULL @@ -1805,13 +1805,13 @@ struct 8 8 eight NULL --- !query 47 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL LEFT JOIN J2_TBL USING (i) ORDER BY i, k, t --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output NULL NULL null NULL NULL 0 zero NULL 0 NULL zero NULL @@ -1827,12 +1827,12 @@ struct 8 8 eight NULL --- !query 48 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL RIGHT OUTER JOIN J2_TBL USING (i) --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1844,12 +1844,12 @@ struct NULL NULL NULL NULL --- !query 49 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL RIGHT JOIN J2_TBL USING (i) --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 
two 2 @@ -1861,13 +1861,13 @@ struct NULL NULL NULL NULL --- !query 50 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL FULL OUTER JOIN J2_TBL USING (i) ORDER BY i, k, t --- !query 50 schema +-- !query schema struct --- !query 50 output +-- !query output NULL NULL NULL NULL NULL NULL null NULL NULL 0 zero NULL @@ -1885,13 +1885,13 @@ struct 8 8 eight NULL --- !query 51 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL FULL JOIN J2_TBL USING (i) ORDER BY i, k, t --- !query 51 schema +-- !query schema struct --- !query 51 output +-- !query output NULL NULL NULL NULL NULL NULL null NULL NULL 0 zero NULL @@ -1909,226 +1909,226 @@ struct 8 8 eight NULL --- !query 52 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (k = 1) --- !query 52 schema +-- !query schema struct --- !query 52 output +-- !query output --- !query 53 +-- !query SELECT '' AS `xxx`, * FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1) --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 1 4 one -1 --- !query 54 +-- !query CREATE TABLE t1 (name STRING, n INTEGER) USING parquet --- !query 54 schema +-- !query schema struct<> --- !query 54 output +-- !query output --- !query 55 +-- !query CREATE TABLE t2 (name STRING, n INTEGER) USING parquet --- !query 55 schema +-- !query schema struct<> --- !query 55 output +-- !query output --- !query 56 +-- !query CREATE TABLE t3 (name STRING, n INTEGER) USING parquet --- !query 56 schema +-- !query schema struct<> --- !query 56 output +-- !query output --- !query 57 +-- !query INSERT INTO t1 VALUES ( 'bb', 11 ) --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output --- !query 58 +-- !query INSERT INTO t2 VALUES ( 'bb', 12 ) --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output --- !query 59 +-- !query INSERT INTO t2 VALUES ( 'cc', 22 ) --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output --- !query 60 +-- !query 
INSERT INTO t2 VALUES ( 'ee', 42 ) --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output --- !query 61 +-- !query INSERT INTO t3 VALUES ( 'bb', 13 ) --- !query 61 schema +-- !query schema struct<> --- !query 61 output +-- !query output --- !query 62 +-- !query INSERT INTO t3 VALUES ( 'cc', 23 ) --- !query 62 schema +-- !query schema struct<> --- !query 62 output +-- !query output --- !query 63 +-- !query INSERT INTO t3 VALUES ( 'dd', 33 ) --- !query 63 schema +-- !query schema struct<> --- !query 63 output +-- !query output --- !query 64 +-- !query SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name) --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output bb 11 12 13 cc NULL 22 23 dd NULL NULL 33 ee NULL 42 NULL --- !query 65 +-- !query SELECT * FROM (SELECT * FROM t2) as s2 INNER JOIN (SELECT * FROM t3) s3 USING (name) --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output bb 12 13 cc 22 23 --- !query 66 +-- !query SELECT * FROM (SELECT * FROM t2) as s2 LEFT JOIN (SELECT * FROM t3) s3 USING (name) --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output bb 12 13 cc 22 23 ee 42 NULL --- !query 67 +-- !query SELECT * FROM (SELECT * FROM t2) as s2 FULL JOIN (SELECT * FROM t3) s3 USING (name) --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output bb 12 13 cc 22 23 dd NULL 33 ee 42 NULL --- !query 68 +-- !query SELECT * FROM (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL INNER JOIN (SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output bb 12 2 13 3 cc 22 2 23 3 --- !query 69 +-- !query SELECT * FROM (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL LEFT JOIN (SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 --- !query 69 schema +-- !query schema struct --- !query 69 output +-- !query output bb 12 2 13 3 cc 22 2 23 
3 ee 42 2 NULL NULL --- !query 70 +-- !query SELECT * FROM (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL FULL JOIN (SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 --- !query 70 schema +-- !query schema struct --- !query 70 output +-- !query output bb 12 2 13 3 cc 22 2 23 3 dd NULL NULL 33 3 ee 42 2 NULL NULL --- !query 71 +-- !query SELECT * FROM (SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 NATURAL INNER JOIN (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL INNER JOIN (SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 --- !query 71 schema +-- !query schema struct --- !query 71 output +-- !query output bb 11 1 12 2 13 3 --- !query 72 +-- !query SELECT * FROM (SELECT name, n as s1_n, 1 as s1_1 FROM t1) as s1 NATURAL FULL JOIN (SELECT name, n as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL FULL JOIN (SELECT name, n as s3_n, 3 as s3_2 FROM t3) s3 --- !query 72 schema +-- !query schema struct --- !query 72 output +-- !query output bb 11 1 12 2 13 3 cc NULL NULL 22 2 23 3 dd NULL NULL NULL NULL 33 3 ee NULL NULL 42 2 NULL NULL --- !query 73 +-- !query SELECT * FROM (SELECT name, n as s1_n FROM t1) as s1 NATURAL FULL JOIN @@ -2137,16 +2137,16 @@ NATURAL FULL JOIN NATURAL FULL JOIN (SELECT name, n as s3_n FROM t3) as s3 ) ss2 --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output bb 11 12 13 cc NULL 22 23 dd NULL NULL 33 ee NULL 42 NULL --- !query 74 +-- !query SELECT * FROM (SELECT name, n as s1_n FROM t1) as s1 NATURAL FULL JOIN @@ -2155,55 +2155,55 @@ NATURAL FULL JOIN NATURAL FULL JOIN (SELECT name, n as s3_n FROM t3) as s3 ) ss2 --- !query 74 schema +-- !query schema struct --- !query 74 output +-- !query output bb 11 12 2 13 cc NULL 22 2 23 dd NULL NULL NULL 33 ee NULL 42 2 NULL --- !query 75 +-- !query SELECT * FROM (SELECT name, n as s1_n FROM t1) as s1 FULL JOIN (SELECT name, 2 as s2_n FROM t2) as s2 ON (s1_n = s2_n) --- !query 75 schema +-- !query schema struct --- !query 75 output +-- !query output NULL NULL bb 2 NULL 
NULL cc 2 NULL NULL ee 2 bb 11 NULL NULL --- !query 76 +-- !query create or replace temporary view x as select * from (values (1,11), (2,22), (3,null), (4,44), (5,null)) as v(x1, x2) --- !query 76 schema +-- !query schema struct<> --- !query 76 output +-- !query output --- !query 77 +-- !query create or replace temporary view y as select * from (values (1,111), (2,222), (3,333), (4,null)) as v(y1, y2) --- !query 77 schema +-- !query schema struct<> --- !query 77 output +-- !query output --- !query 78 +-- !query select * from x --- !query 78 schema +-- !query schema struct --- !query 78 output +-- !query output 1 11 2 22 3 NULL @@ -2211,22 +2211,22 @@ struct 5 NULL --- !query 79 +-- !query select * from y --- !query 79 schema +-- !query schema struct --- !query 79 output +-- !query output 1 111 2 222 3 333 4 NULL --- !query 80 +-- !query select * from x left join y on (x1 = y1 and x2 is not null) --- !query 80 schema +-- !query schema struct --- !query 80 output +-- !query output 1 11 1 111 2 22 2 222 3 NULL NULL NULL @@ -2234,11 +2234,11 @@ struct 5 NULL NULL NULL --- !query 81 +-- !query select * from x left join y on (x1 = y1 and y2 is not null) --- !query 81 schema +-- !query schema struct --- !query 81 output +-- !query output 1 11 1 111 2 22 2 222 3 NULL 3 333 @@ -2246,12 +2246,12 @@ struct 5 NULL NULL NULL --- !query 82 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1) --- !query 82 schema +-- !query schema struct --- !query 82 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL @@ -2259,12 +2259,12 @@ struct 5 NULL NULL NULL 5 NULL --- !query 83 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1 and x2 is not null) --- !query 83 schema +-- !query schema struct --- !query 83 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 NULL NULL @@ -2272,12 +2272,12 @@ struct 5 NULL NULL NULL NULL NULL --- !query 84 +-- !query select * from (x 
left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1 and y2 is not null) --- !query 84 schema +-- !query schema struct --- !query 84 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL @@ -2285,12 +2285,12 @@ struct 5 NULL NULL NULL NULL NULL --- !query 85 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1 and xx2 is not null) --- !query 85 schema +-- !query schema struct --- !query 85 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 NULL NULL @@ -2298,78 +2298,78 @@ struct 5 NULL NULL NULL NULL NULL --- !query 86 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1) where (x2 is not null) --- !query 86 schema +-- !query schema struct --- !query 86 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 4 44 4 NULL 4 44 --- !query 87 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1) where (y2 is not null) --- !query 87 schema +-- !query schema struct --- !query 87 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL --- !query 88 +-- !query select * from (x left join y on (x1 = y1)) left join x xx(xx1,xx2) on (x1 = xx1) where (xx2 is not null) --- !query 88 schema +-- !query schema struct --- !query 88 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 4 44 4 NULL 4 44 --- !query 89 +-- !query select count(*) from tenk1 a where unique1 in (select unique1 from tenk1 b join tenk1 c using (unique1) where b.unique2 = 42) --- !query 89 schema +-- !query schema struct --- !query 89 output +-- !query output 1 --- !query 90 +-- !query select count(*) from tenk1 x where x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and x.unique1 = 0 and x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1) --- !query 90 schema +-- !query schema struct --- !query 90 output +-- !query output 1 --- !query 91 +-- !query select count(*) from 
tenk1 x where x.unique1 in (select a.f1 from int4_tbl a,float8_tbl b where a.f1=b.f1) and x.unique1 = 0 and x.unique1 in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=bb.f1) --- !query 91 schema +-- !query schema struct --- !query 91 output +-- !query output 1 --- !query 92 +-- !query select * from int8_tbl i1 left join (int8_tbl i2 join (select 123 as x) ss on i2.q1 = x) on i1.q2 = i2.q2 order by 1, 2 --- !query 92 schema +-- !query schema struct --- !query 92 output +-- !query output 123 456 123 456 123 123 4567890123456789 123 4567890123456789 123 4567890123456789 -4567890123456789 NULL NULL NULL @@ -2377,7 +2377,7 @@ struct 4567890123456789 4567890123456789 123 4567890123456789 123 --- !query 93 +-- !query select count(*) from (select t3.tenthous as x1, coalesce(t1.stringu1, t2.stringu1) as x2 @@ -2387,32 +2387,32 @@ from tenk1 t4, tenk1 t5 where t4.thousand = t5.unique1 and ss.x1 = t4.tenthous and ss.x2 = t5.stringu1 --- !query 93 schema +-- !query schema struct --- !query 93 output +-- !query output 1000 --- !query 94 +-- !query select a.f1, b.f1, t.thousand, t.tenthous from tenk1 t, (select sum(f1)+1 as f1 from int4_tbl i4a) a, (select sum(f1) as f1 from int4_tbl i4b) b where b.f1 = t.thousand and a.f1 = b.f1 and (a.f1+b.f1+999) = t.tenthous --- !query 94 schema +-- !query schema struct --- !query 94 output +-- !query output --- !query 95 +-- !query select * from j1_tbl full join (select * from j2_tbl order by j2_tbl.i desc, j2_tbl.k asc) j2_tbl on j1_tbl.i = j2_tbl.i and j1_tbl.i = j2_tbl.k --- !query 95 schema +-- !query schema struct --- !query 95 output +-- !query output 0 NULL zero NULL NULL 1 4 one NULL NULL 2 3 two 2 2 @@ -2434,156 +2434,128 @@ NULL NULL NULL NULL NULL NULL NULL null NULL NULL --- !query 96 +-- !query select count(*) from (select * from tenk1 x order by x.thousand, x.twothousand, x.fivethous) x left join (select * from tenk1 y order by y.unique2) y on x.thousand = y.unique2 and x.twothousand = y.hundred and x.fivethous = 
y.unique2 --- !query 96 schema +-- !query schema struct --- !query 96 output +-- !query output 10000 --- !query 97 +-- !query DROP TABLE t1 --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output --- !query 98 +-- !query DROP TABLE t2 --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output --- !query 99 +-- !query DROP TABLE t3 --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output --- !query 100 +-- !query DROP TABLE J1_TBL --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output --- !query 101 +-- !query DROP TABLE J2_TBL --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output --- !query 102 +-- !query create or replace temporary view tt1 as select * from (values (1, 11), (2, NULL)) as v(tt1_id, joincol) --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output --- !query 103 +-- !query create or replace temporary view tt2 as select * from (values (21, 11), (22, 11)) as v(tt2_id, joincol) --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output --- !query 104 +-- !query select tt1.*, tt2.* from tt1 left join tt2 on tt1.joincol = tt2.joincol --- !query 104 schema +-- !query schema struct --- !query 104 output +-- !query output 1 11 21 11 1 11 22 11 2 NULL NULL NULL --- !query 105 +-- !query select tt1.*, tt2.* from tt2 right join tt1 on tt1.joincol = tt2.joincol --- !query 105 schema +-- !query schema struct --- !query 105 output +-- !query output 1 11 21 11 1 11 22 11 2 NULL NULL NULL --- !query 106 +-- !query select count(*) from tenk1 a, tenk1 b where a.hundred = b.thousand and (b.fivethous % 10) < 10 --- !query 106 schema +-- !query schema struct --- !query 106 output +-- !query output 100000 --- !query 107 -DROP TABLE IF EXISTS tt3 --- !query 107 schema -struct<> --- !query 107 output - - - --- !query 108 -CREATE TABLE tt3(f1 int, f2 string) 
USING parquet --- !query 108 schema -struct<> --- !query 108 output - - - --- !query 109 -INSERT INTO tt3 SELECT x.id, repeat('xyzzy', 100) FROM range(1,10001) x --- !query 109 schema -struct<> --- !query 109 output - - - --- !query 110 -DROP TABLE IF EXISTS tt4 --- !query 110 schema -struct<> --- !query 110 output - - - --- !query 111 -CREATE TABLE tt4(f1 int) USING parquet --- !query 111 schema +-- !query +create or replace temporary view tt3 as select * from + (SELECT cast(x.id as int), repeat('xyzzy', 100) FROM range(1,10001) x) + as v(f1, f2) +-- !query schema struct<> --- !query 111 output +-- !query output --- !query 112 -INSERT INTO tt4 VALUES (0),(1),(9999) --- !query 112 schema +-- !query +create or replace temporary view tt4 as select * from + (values (0), (1), (9999)) + as v(f1) +-- !query schema struct<> --- !query 112 output +-- !query output --- !query 113 +-- !query SELECT a.f1 FROM tt4 a LEFT JOIN ( @@ -2592,242 +2564,242 @@ LEFT JOIN ( WHERE c.f1 IS NULL ) AS d ON (a.f1 = d.f1) WHERE d.f1 IS NULL --- !query 113 schema +-- !query schema struct --- !query 113 output +-- !query output 0 1 9999 --- !query 114 +-- !query create or replace temporary view tt5 as select * from (values (1, 10), (1, 11)) as v(f1, f2) --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output --- !query 115 +-- !query create or replace temporary view tt6 as select * from (values (1, 9), (1, 2), (2, 9)) as v(f1, f2) --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output --- !query 116 +-- !query select * from tt5,tt6 where tt5.f1 = tt6.f1 and tt5.f1 = tt5.f2 - tt6.f2 --- !query 116 schema +-- !query schema struct --- !query 116 output +-- !query output 1 10 1 9 --- !query 117 +-- !query create or replace temporary view xx as select * from (values (1), (2), (3)) as v(pkxx) --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output --- !query 118 +-- !query create or replace temporary 
view yy as select * from (values (101, 1), (201, 2), (301, NULL)) as v(pkyy, pkxx) --- !query 118 schema +-- !query schema struct<> --- !query 118 output +-- !query output --- !query 119 +-- !query select yy.pkyy as yy_pkyy, yy.pkxx as yy_pkxx, yya.pkyy as yya_pkyy, xxa.pkxx as xxa_pkxx, xxb.pkxx as xxb_pkxx from yy left join (SELECT * FROM yy where pkyy = 101) as yya ON yy.pkyy = yya.pkyy left join xx xxa on yya.pkxx = xxa.pkxx left join xx xxb on coalesce (xxa.pkxx, 1) = xxb.pkxx --- !query 119 schema +-- !query schema struct --- !query 119 output +-- !query output 101 1 101 1 1 201 2 NULL NULL 1 301 NULL NULL NULL 1 --- !query 120 +-- !query create or replace temporary view zt1 as select * from (values (53)) as v(f1) --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output --- !query 121 +-- !query create or replace temporary view zt2 as select * from (values (53)) as v(f2) --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output --- !query 122 +-- !query create or replace temporary view zt3(f3 int) using parquet --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output --- !query 123 +-- !query select * from zt2 left join zt3 on (f2 = f3) left join zt1 on (f3 = f1) where f2 = 53 --- !query 123 schema +-- !query schema struct --- !query 123 output +-- !query output 53 NULL NULL --- !query 124 +-- !query create temp view zv1 as select *,'dummy' AS junk from zt1 --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output --- !query 125 +-- !query select * from zt2 left join zt3 on (f2 = f3) left join zv1 on (f3 = f1) where f2 = 53 --- !query 125 schema +-- !query schema struct --- !query 125 output +-- !query output 53 NULL NULL NULL --- !query 126 +-- !query select a.unique2, a.ten, b.tenthous, b.unique2, b.hundred from tenk1 a left join tenk1 b on a.unique2 = b.tenthous where a.unique1 = 42 and ((b.unique2 is null and a.ten = 2) or 
b.hundred = 3) --- !query 126 schema +-- !query schema struct --- !query 126 output +-- !query output --- !query 127 +-- !query create or replace temporary view a (i integer) using parquet --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output --- !query 128 +-- !query create or replace temporary view b (x integer, y integer) using parquet --- !query 128 schema +-- !query schema struct<> --- !query 128 output +-- !query output --- !query 129 +-- !query select * from a left join b on i = x and i = y and x = i --- !query 129 schema +-- !query schema struct --- !query 129 output +-- !query output --- !query 130 +-- !query select t1.q2, count(t2.*) from int8_tbl t1 left join int8_tbl t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1 --- !query 130 schema +-- !query schema struct --- !query 130 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 131 +-- !query select t1.q2, count(t2.*) from int8_tbl t1 left join (select * from int8_tbl) t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1 --- !query 131 schema +-- !query schema struct --- !query 131 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 132 +-- !query select t1.q2, count(t2.*) from int8_tbl t1 left join (select q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 on (t1.q2 = t2.q1) group by t1.q2 order by 1 --- !query 132 schema +-- !query schema struct --- !query 132 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 133 +-- !query create or replace temporary view a as select * from (values ('p'), ('q')) as v(code) --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output --- !query 134 +-- !query create or replace temporary view b as select * from (values ('p', 1), ('p', 2)) as v(a, num) --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output --- !query 135 +-- !query create or replace temporary view c 
as select * from (values ('A', 'p'), ('B', 'q'), ('C', null)) as v(name, a) --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output --- !query 136 +-- !query select c.name, ss.code, ss.b_cnt, ss.const from c left join (select a.code, coalesce(b_grp.cnt, 0) as b_cnt, -1 as const @@ -2837,15 +2809,15 @@ from c left join ) as ss on (c.a = ss.code) order by c.name --- !query 136 schema +-- !query schema struct --- !query 136 output +-- !query output A p 2 -1 B q 0 -1 C NULL NULL NULL --- !query 137 +-- !query SELECT * FROM ( SELECT 1 as key1 ) sub1 LEFT JOIN @@ -2861,13 +2833,13 @@ LEFT JOIN ON sub4.key5 = sub3.key3 ) sub2 ON sub1.key1 = sub2.key3 --- !query 137 schema +-- !query schema struct --- !query 137 output +-- !query output 1 1 1 1 --- !query 138 +-- !query SELECT * FROM ( SELECT 1 as key1 ) sub1 LEFT JOIN @@ -2883,13 +2855,13 @@ LEFT JOIN ON sub4.key5 = sub3.key3 ) sub2 ON sub1.key1 = sub2.key3 --- !query 138 schema +-- !query schema struct --- !query 138 output +-- !query output 1 1 1 1 --- !query 139 +-- !query SELECT qq, unique1 FROM ( SELECT COALESCE(q1, 0) AS qq FROM int8_tbl a ) AS ss1 @@ -2897,45 +2869,45 @@ SELECT qq, unique1 ( SELECT COALESCE(q2, -1) AS qq FROM int8_tbl b ) AS ss2 USING (qq) INNER JOIN tenk1 c ON qq = unique2 --- !query 139 schema +-- !query schema struct --- !query 139 output +-- !query output 123 4596 123 4596 456 7318 --- !query 140 +-- !query create or replace temporary view nt1 as select * from (values(1,true,true), (2,true,false), (3,false,false)) as v(id, a1, a2) --- !query 140 schema +-- !query schema struct<> --- !query 140 output +-- !query output --- !query 141 +-- !query create or replace temporary view nt2 as select * from (values(1,1,true,true), (2,2,true,false), (3,3,false,false)) as v(id, nt1_id, b1, b2) --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output --- !query 142 +-- !query create or replace temporary view nt3 as select * from 
(values(1,1,true), (2,2,false), (3,3,true)) as v(id, nt2_id, c1) --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output --- !query 143 +-- !query select nt3.id from nt3 as nt3 left join @@ -2947,17 +2919,17 @@ from nt3 as nt3 ) as ss2 on ss2.id = nt3.nt2_id where nt3.id = 1 and ss2.b3 --- !query 143 schema +-- !query schema struct --- !query 143 output +-- !query output 1 --- !query 144 +-- !query select * from int4_tbl a full join int4_tbl b on true --- !query 144 schema +-- !query schema struct --- !query 144 output +-- !query output -123456 -123456 -123456 -2147483647 -123456 0 @@ -2985,11 +2957,11 @@ struct 2147483647 2147483647 --- !query 145 +-- !query select * from int4_tbl a full join int4_tbl b on false --- !query 145 schema +-- !query schema struct --- !query 145 output +-- !query output -123456 NULL -2147483647 NULL 0 NULL @@ -3002,27 +2974,27 @@ NULL 123456 NULL 2147483647 --- !query 146 +-- !query select count(*) from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on a.unique2 = b.unique1 and c.thousand = a.thousand join int4_tbl on b.thousand = f1 --- !query 146 schema +-- !query schema struct --- !query 146 output +-- !query output 10 --- !query 147 +-- !query select b.unique1 from tenk1 a join tenk1 b on a.unique1 = b.unique2 left join tenk1 c on b.unique1 = 42 and c.thousand = a.thousand join int4_tbl i1 on b.thousand = f1 right join int4_tbl i2 on i2.f1 = b.tenthous order by 1 --- !query 147 schema +-- !query schema struct --- !query 147 output +-- !query output NULL NULL NULL @@ -3030,7 +3002,7 @@ NULL 0 --- !query 148 +-- !query select * from ( select unique1, q1, coalesce(unique1, -1) + q1 as fault @@ -3038,43 +3010,43 @@ select * from ) ss where fault = 122 order by fault --- !query 148 schema +-- !query schema struct --- !query 148 output +-- !query output NULL 123 122 --- !query 149 +-- !query select q1, unique2, thousand, hundred from int8_tbl a left join tenk1 b on q1 = unique2 where 
coalesce(thousand,123) = q1 and q1 = coalesce(hundred,123) --- !query 149 schema +-- !query schema struct --- !query 149 output +-- !query output --- !query 150 +-- !query select f1, unique2, case when unique2 is null then f1 else 0 end from int4_tbl a left join tenk1 b on f1 = unique2 where (case when unique2 is null then f1 else 0 end) = 0 --- !query 150 schema +-- !query schema struct --- !query 150 output +-- !query output 0 0 0 --- !query 151 +-- !query select a.unique1, b.unique1, c.unique1, coalesce(b.twothousand, a.twothousand) from tenk1 a left join tenk1 b on b.thousand = a.unique1 left join tenk1 c on c.unique2 = coalesce(b.twothousand, a.twothousand) where a.unique2 < 10 and coalesce(b.twothousand, a.twothousand) = 44 --- !query 151 schema +-- !query schema struct --- !query 151 output +-- !query output --- !query 152 +-- !query select * from text_tbl t1 inner join int8_tbl i8 @@ -3083,32 +3055,32 @@ select * from on t1.f1 = 'doh!' left join int4_tbl i4 on i8.q1 = i4.f1 --- !query 152 schema +-- !query schema struct --- !query 152 output +-- !query output doh! 123 456 doh! NULL doh! 
123 456 hi de ho neighbor NULL --- !query 153 +-- !query select * from (select 1 as id) as xx left join (tenk1 as a1 full join (select 1 as id) as yy on (a1.unique1 = yy.id)) on (xx.id = coalesce(yy.id)) --- !query 153 schema +-- !query schema struct --- !query 153 output +-- !query output 1 1 2838 1 1 1 1 1 1 1 1 1 2 3 BAAAAA EFEAAA OOOOxx 1 --- !query 154 +-- !query select a.q2, b.q1 from int8_tbl a left join int8_tbl b on a.q2 = coalesce(b.q1, 1) where coalesce(b.q1, 1) > 0 --- !query 154 schema +-- !query schema struct --- !query 154 output +-- !query output -4567890123456789 NULL 123 123 123 123 @@ -3121,124 +3093,124 @@ struct 4567890123456789 4567890123456789 --- !query 155 +-- !query create or replace temporary view parent as select * from (values (1, 10), (2, 20), (3, 30)) as v(k, pd) --- !query 155 schema +-- !query schema struct<> --- !query 155 output +-- !query output --- !query 156 +-- !query create or replace temporary view child as select * from (values (1, 100), (4, 400)) as v(k, cd) --- !query 156 schema +-- !query schema struct<> --- !query 156 output +-- !query output --- !query 157 +-- !query select p.* from parent p left join child c on (p.k = c.k) --- !query 157 schema +-- !query schema struct --- !query 157 output +-- !query output 1 10 2 20 3 30 --- !query 158 +-- !query select p.*, linked from parent p left join (select c.*, true as linked from child c) as ss on (p.k = ss.k) --- !query 158 schema +-- !query schema struct --- !query 158 output +-- !query output 1 10 true 2 20 NULL 3 30 NULL --- !query 159 +-- !query select p.* from parent p left join child c on (p.k = c.k) where p.k = 1 and p.k = 2 --- !query 159 schema +-- !query schema struct --- !query 159 output +-- !query output --- !query 160 +-- !query select p.* from (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k where p.k = 1 and p.k = 2 --- !query 160 schema +-- !query schema struct --- !query 160 output +-- !query output --- !query 161 +-- !query create or 
replace temporary view a as select * from (values (0), (1)) as v(id) --- !query 161 schema +-- !query schema struct<> --- !query 161 output +-- !query output --- !query 162 +-- !query create or replace temporary view b as select * from (values (0, 0), (1, NULL)) as v(id, a_id) --- !query 162 schema +-- !query schema struct<> --- !query 162 output +-- !query output --- !query 163 +-- !query SELECT * FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0) --- !query 163 schema +-- !query schema struct --- !query 163 output +-- !query output 1 NULL NULL --- !query 164 +-- !query SELECT b.* FROM b LEFT JOIN a ON (b.a_id = a.id) WHERE (a.id IS NULL OR a.id > 0) --- !query 164 schema +-- !query schema struct --- !query 164 output +-- !query output 1 NULL --- !query 165 +-- !query create or replace temporary view innertab as select * from (values (123L, 42L)) as v(id, dat1) --- !query 165 schema +-- !query schema struct<> --- !query 165 output +-- !query output --- !query 166 +-- !query SELECT * FROM (SELECT 1 AS x) ss1 LEFT JOIN (SELECT q1, q2, COALESCE(dat1, q1) AS y FROM int8_tbl LEFT JOIN innertab ON q2 = id) ss2 ON true --- !query 166 schema +-- !query schema struct --- !query 166 output +-- !query output 1 123 456 123 1 123 4567890123456789 123 1 4567890123456789 -4567890123456789 4567890123456789 @@ -3246,163 +3218,163 @@ struct 1 4567890123456789 4567890123456789 4567890123456789 --- !query 167 +-- !query select * from int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = f1 --- !query 167 schema +-- !query schema struct<> --- !query 167 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'f1' is ambiguous, could be: j.f1, j.f1.; line 2 pos 63 --- !query 168 +-- !query select * from int8_tbl x join (int4_tbl x cross join int4_tbl y) j on q1 = y.f1 --- !query 168 schema +-- !query schema struct<> --- !query 168 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`y.f1`' given input columns: 
[j.f1, j.f1, x.q1, x.q2]; line 2 pos 63 --- !query 169 +-- !query select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1 --- !query 169 schema +-- !query schema struct --- !query 169 output +-- !query output --- !query 170 +-- !query select t1.uunique1 from tenk1 t1 join tenk2 t2 on t1.two = t2.two --- !query 170 schema +-- !query schema struct<> --- !query 170 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`t1.uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 7 --- !query 171 +-- !query select t2.uunique1 from tenk1 t1 join tenk2 t2 on t1.two = t2.two --- !query 171 schema +-- !query schema struct<> --- !query 171 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`t2.uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 7 --- !query 172 +-- !query select uunique1 from tenk1 t1 join tenk2 t2 on t1.two = t2.two --- !query 172 schema +-- !query schema struct<> --- !query 172 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, 
t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 7 --- !query 173 +-- !query select f1,g from int4_tbl a, (select f1 as g) ss --- !query 173 schema +-- !query schema struct<> --- !query 173 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`f1`' given input columns: []; line 1 pos 37 --- !query 174 +-- !query select f1,g from int4_tbl a, (select a.f1 as g) ss --- !query 174 schema +-- !query schema struct<> --- !query 174 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a.f1`' given input columns: []; line 1 pos 37 --- !query 175 +-- !query select f1,g from int4_tbl a cross join (select f1 as g) ss --- !query 175 schema +-- !query schema struct<> --- !query 175 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`f1`' given input columns: []; line 1 pos 47 --- !query 176 +-- !query select f1,g from int4_tbl a cross join (select a.f1 as g) ss --- !query 176 schema +-- !query schema struct<> --- !query 176 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a.f1`' given input columns: []; line 1 pos 47 --- !query 177 +-- !query CREATE TABLE j1 (id1 int, id2 int) USING parquet --- !query 177 schema +-- !query schema struct<> --- !query 177 output +-- !query output --- !query 178 +-- !query CREATE TABLE j2 (id1 int, id2 int) USING parquet --- !query 178 schema +-- !query schema struct<> --- !query 178 output +-- !query output --- !query 179 +-- !query INSERT INTO j1 values(1,1),(1,2) --- !query 179 schema +-- !query schema struct<> --- !query 179 output +-- !query output --- !query 180 +-- !query INSERT INTO j2 values(1,1) --- !query 180 schema +-- !query schema struct<> --- !query 180 output +-- !query output --- !query 181 +-- !query INSERT INTO j2 values(1,2) --- !query 181 schema +-- !query schema struct<> --- 
!query 181 output +-- !query output --- !query 182 +-- !query select * from j1 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 --- !query 182 schema +-- !query schema struct --- !query 182 output +-- !query output 1 1 1 1 1 2 1 2 --- !query 183 +-- !query drop table j1 --- !query 183 schema +-- !query schema struct<> --- !query 183 output +-- !query output --- !query 184 +-- !query drop table j2 --- !query 184 schema +-- !query schema struct<> --- !query 184 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/limit.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/limit.sql.out new file mode 100644 index 0000000000000..2c8bc31dbc6ca --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/limit.sql.out @@ -0,0 +1,81 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 7 + + +-- !query +SELECT '' AS two, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 50 + ORDER BY unique1 LIMIT 2 +-- !query schema +struct +-- !query output + 51 76 ZBAAAA + 52 985 ACAAAA + + +-- !query +SELECT '' AS five, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 60 + ORDER BY unique1 LIMIT 5 +-- !query schema +struct +-- !query output + 61 560 JCAAAA + 62 633 KCAAAA + 63 296 LCAAAA + 64 479 MCAAAA + 65 64 NCAAAA + + +-- !query +SELECT '' AS two, unique1, unique2, stringu1 + FROM onek WHERE unique1 > 60 AND unique1 < 63 + ORDER BY unique1 LIMIT 5 +-- !query schema +struct +-- !query output + 61 560 JCAAAA + 62 633 KCAAAA + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM + (VALUES + (123, 456), + (123, 4567890123456789), + (4567890123456789, 123), + (4567890123456789, 4567890123456789), + (4567890123456789, -4567890123456789)) + AS v(q1, q2) +-- !query schema +struct<> +-- !query output + + + +-- !query +select * from int8_tbl limit (case when random() < 0.5 then bigint(null) end) +-- !query schema 
+struct<> +-- !query output +org.apache.spark.sql.AnalysisException +The limit expression must evaluate to a constant value, but got CASE WHEN (`_nondeterministic` < CAST(0.5BD AS DOUBLE)) THEN CAST(NULL AS BIGINT) END; + + +-- !query +DROP VIEW INT8_TBL +-- !query schema +struct<> +-- !query output + + + +-- !query +select sum(tenthous) as s1, sum(tenthous) + random()*0 as s2 + from tenk1 group by thousand order by thousand limit 3 +-- !query schema +struct +-- !query output +45000 45000.0 +45010 45010.0 +45020 45020.0 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out new file mode 100644 index 0000000000000..bdb605e406b8a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/numeric.sql.out @@ -0,0 +1,4867 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 577 + + +-- !query +CREATE TABLE num_data (id int, val decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_add (id1 int, id2 int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_sub (id1 int, id2 int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_div (id1 int, id2 int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_mul (id1 int, id2 int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_sqrt (id int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_ln (id int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_log10 (id int, expected decimal(38,10)) 
USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_exp_power_10_ln (id int, expected decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_result (id1 int, id2 int, result decimal(38,10)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,2,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,2,34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,2,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,2,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,3,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,3,-4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,3,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO 
num_exp_div VALUES (0,3,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,4,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,4,-7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,4,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,4,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,5,16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,5,-16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,5,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,5,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,6,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,6,-93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,6,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,6,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,7,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,7,83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,7,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,7,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,8,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,8,-74881) +-- !query schema 
+struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,8,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,8,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (0,9,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (0,9,24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (0,9,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (0,9,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,2,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,2,34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,2,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,2,0) +-- !query schema +struct<> +-- !query output + + + +-- !query 
+INSERT INTO num_exp_add VALUES (1,3,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,3,-4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,3,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,3,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,4,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,4,-7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,4,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,4,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,5,16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,5,-16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,5,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,5,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,6,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,6,-93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,6,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,6,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,7,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,7,83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,7,0) +-- !query 
schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,7,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,8,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,8,-74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,8,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,8,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (1,9,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (1,9,24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (1,9,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (1,9,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,0,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,0,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,1,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,1,-34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES 
(2,2,-68676984.430794094) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,2,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,2,1179132047626883.596862135856320209) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,2,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,3,-34338487.905397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,3,-34338496.525397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,3,-147998901.44836127257) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,3,-7967167.56737750510440835266) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,4,-26539030.803497047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,4,-42137953.627297047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,4,-267821744976817.8111137106593) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,4,-4.40267480046830116685) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,5,-34322095.176906047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,5,-34354889.253888047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,5,-563049578578.769242506736077) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,5,-2094.18866914563535496429) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES 
(2,6,-34244590.637766787) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,6,-34432393.793027307) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,6,-3224438592470.18449811926184222) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,6,-365.68599891479766440940) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,7,-117366977.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,7,48689992.784602953) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,7,2851072985828710.485883795) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,7,.41357483778485235518) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,8,-34263611.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,8,-34413373.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,8,-2571300635581.146276407) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,8,-458.57416721727870888476) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (2,9,-59265296.260444467) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (2,9,-9411688.170349627) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (2,9,855948866655588.453741509242968740) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (2,9,1.37757299946438931811) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES 
(3,0,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,0,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,1,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,1,4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,2,-34338487.905397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,2,34338496.525397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,2,-147998901.44836127257) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,2,-.00000012551512084352) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,3,8.62) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,3,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,3,18.5761) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,3,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,4,7799465.7219) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,4,-7799457.1019) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT 
INTO num_exp_mul VALUES (3,4,33615678.685289) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,4,.00000055260225961552) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,5,16401.348491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,5,-16392.728491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,5,70671.23589621) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,5,.00026285234387695504) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,6,93905.88763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,6,-93897.26763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,6,404715.7995864206) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,6,.00004589912234457595) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,7,-83028480.69) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,7,83028489.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,7,-357852770.35) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,7,-.00000005190989574240) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,8,74885.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,8,-74876.69) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,8,322737.11) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div 
VALUES (3,8,.00005755799201399553) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (3,9,-24926799.735047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (3,9,24926808.355047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (3,9,-107434525.43415438020) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (3,9,-.00000017290624149854) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,0,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,0,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,1,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,1,7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,2,-26539030.803497047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,2,42137953.627297047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,2,-267821744976817.8111137106593) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,2,-.22713465002993920385) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO 
num_exp_add VALUES (4,3,7799465.7219) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,3,7799457.1019) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,3,33615678.685289) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,3,1809619.81714617169373549883) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,4,15598922.8238) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,4,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,4,60831598315717.14146161) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,4,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,5,7815858.450391) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,5,7783064.373409) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,5,127888068979.9935054429) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,5,475.66281046305802686061) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,6,7893362.98953026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,6,7705559.83426974) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,6,732381731243.745115764094) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,6,83.05996138436129499606) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,7,-75229023.5881) +-- !query schema +struct<> +-- !query output 
+ + + +-- !query +INSERT INTO num_exp_sub VALUES (4,7,90827946.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,7,-647577464846017.9715) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,7,-.09393717604145131637) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,8,7874342.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,8,7724580.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,8,584031469984.4839) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,8,104.15808298366741897143) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (4,9,-17127342.633147420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (4,9,32726265.456947420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (4,9,-194415646271340.1815956522980) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (4,9,-.31289456112403769409) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,0,16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,0,16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,1,16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,1,16397.038491) +-- !query schema +struct<> +-- !query 
output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,2,-34322095.176906047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,2,34354889.253888047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,2,-563049578578.769242506736077) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,2,-.00047751189505192446) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,3,16401.348491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,3,16392.728491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,3,70671.23589621) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,3,3804.41728329466357308584) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,4,7815858.450391) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,4,-7783064.373409) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,4,127888068979.9935054429) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,4,.00210232958726897192) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,5,32794.076982) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,5,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,5,268862871.275335557081) +-- !query schema +struct<> +-- 
!query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,5,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,6,110298.61612126) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,6,-77504.53913926) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,6,1539707782.76899778633766) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,6,.17461941433576102689) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,7,-83012087.961509) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,7,83044882.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,7,-1361421264394.416135) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,7,-.00019748690453643710) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,8,91278.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,8,-58483.961509) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,8,1227826639.244571) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (5,8,.21897461960978085228) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (5,9,-24910407.006556420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (5,9,24943201.083538420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (5,9,-408725765384.257043660243220) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES 
(5,9,-.00065780749354660427) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,0,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,0,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,1,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,1,93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,2,-34244590.637766787) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,2,34432393.793027307) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,2,-3224438592470.18449811926184222) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,2,-.00273458651128995823) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,3,93905.88763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,3,93897.26763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,3,404715.7995864206) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,3,21786.90896293735498839907) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add 
VALUES (6,4,7893362.98953026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,4,-7705559.83426974) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,4,732381731243.745115764094) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,4,.01203949512295682469) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,5,110298.61612126) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,5,77504.53913926) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,5,1539707782.76899778633766) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,5,5.72674008674192359679) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,6,187803.15526052) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,6,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,6,8817506281.4517452372676676) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,6,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,7,-82934583.42236974) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,7,83122386.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,7,-7796505729750.37795610) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,7,-.00113095617281538980) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,8,168782.57763026) +-- !query schema +struct<> +-- !query 
output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,8,19020.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,8,7031444034.53149906) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,8,1.25401073209839612184) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (6,9,-24832902.467417160) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (6,9,25020705.622677680) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (6,9,-2340666225110.29929521292692920) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (6,9,-.00376709254265256789) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,0,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,0,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,1,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,1,-83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,2,-117366977.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,2,-48689992.784602953) +-- !query schema +struct<> +-- !query output + + + +-- 
!query +INSERT INTO num_exp_mul VALUES (7,2,2851072985828710.485883795) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,2,2.41794207151503385700) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,3,-83028480.69) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,3,-83028489.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,3,-357852770.35) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,3,-19264149.65197215777262180974) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,4,-75229023.5881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,4,-90827946.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,4,-647577464846017.9715) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,4,-10.64541262725136247686) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,5,-83012087.961509) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,5,-83044882.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,5,-1361421264394.416135) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,5,-5063.62688881730941836574) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,6,-82934583.42236974) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,6,-83122386.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES 
(7,6,-7796505729750.37795610) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,6,-884.20756174009028770294) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,7,-166056970) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,7,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,7,6893729321395225) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,7,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,8,-82953604) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,8,-83103366) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,8,-6217255985285) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,8,-1108.80577182462841041118) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (7,9,-107955289.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (7,9,-58101680.954952580) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (7,9,2069634775752159.035758700) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (7,9,3.33089171198810413382) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,0,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,0,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES 
(8,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,1,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,1,74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,2,-34263611.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,2,34413373.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,2,-2571300635581.146276407) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,2,-.00218067233500788615) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,3,74885.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,3,74876.69) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,3,322737.11) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,3,17373.78190255220417633410) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,4,7874342.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,4,-7724580.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,4,584031469984.4839) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,4,.00960079113741758956) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,5,91278.038491) +-- !query 
schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,5,58483.961509) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,5,1227826639.244571) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,5,4.56673929509287019456) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,6,168782.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,6,-19020.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,6,7031444034.53149906) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,6,.79744134113322314424) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,7,-82953604) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,7,83103366) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,7,-6217255985285) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,7,-.00090187120721280172) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,8,149762) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,8,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,8,5607164161) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,8,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (8,9,-24851923.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (8,9,25001685.045047420) +-- !query schema 
+struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (8,9,-1866544013697.195857020) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (8,9,-.00300403532938582735) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,0,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,0,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,1,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,1,-24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,2,-59265296.260444467) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,2,9411688.170349627) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,2,855948866655588.453741509242968740) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,2,.72591434384152961526) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,3,-24926799.735047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,3,-24926808.355047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES 
(9,3,-107434525.43415438020) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,3,-5783481.21694835730858468677) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,4,-17127342.633147420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,4,-32726265.456947420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,4,-194415646271340.1815956522980) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,4,-3.19596478892958416484) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,5,-24910407.006556420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,5,-24943201.083538420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,5,-408725765384.257043660243220) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,5,-1520.20159364322004505807) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,6,-24832902.467417160) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,6,-25020705.622677680) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,6,-2340666225110.29929521292692920) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,6,-265.45671195426965751280) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,7,-107955289.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,7,58101680.954952580) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES 
(9,7,2069634775752159.035758700) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,7,.30021990699995814689) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,8,-24851923.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,8,-25001685.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,8,-1866544013697.195857020) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,8,-332.88556569820675471748) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_add VALUES (9,9,-49853608.090094840) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sub VALUES (9,9,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_mul VALUES (9,9,621345559900192.420120630048656400) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_div VALUES (9,9,1.00000000000000000000) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (0,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (1,0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (2,5859.90547836712524903505) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (3,2.07605394920266944396) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (4,2792.75158435189147418923) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (5,128.05092147657509145473) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (6,306.43364311096782703406) +-- !query schema +struct<> +-- 
!query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (7,9111.99676251039939975230) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (8,273.64392922189960397542) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_sqrt VALUES (9,4992.67503899937593364766) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (2,17.35177750493897715514) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (3,1.46093790411565641971) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (4,15.86956523951936572464) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (5,9.70485601768871834038) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (6,11.45000246622944403127) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (7,18.23469429965478772991) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (8,11.22365546576315513668) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_ln VALUES (9,17.03145425013166006962) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (2,7.53578122160797276459) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES 
(3,.63447727016073160075) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (4,6.89206461372691743345) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (5,4.21476541614777768626) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (6,4.97267288886207207671) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (7,7.91922711353275546914) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (8,4.87437163556421004138) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_log10 VALUES (9,7.39666659961986567059) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (0,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (1,double('NaN')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (2,224790267919917955.13261618583642653184) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (3,28.90266599445155957393) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (4,7405685069594999.07733999469386277636) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (5,5068226527.32127265408584640098) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (6,281839893606.99372343357047819067) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (7,1716699575118597095.42330819910640247627) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +decimal can only support precision 
up to 38 +== SQL == +INSERT INTO num_exp_power_10_ln VALUES (7,1716699575118597095.42330819910640247627) + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (8,167361463828.07491320069016125952) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_exp_power_10_ln VALUES (9,107511333880052007.04141124673540337457) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (0, 0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (1, 0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (2, -34338492.215397047) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (3, 4.31) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (4, 7799461.4119) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (5, 16397.038491) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (6, 93901.57763026) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (7, -83028485) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (8, 74881) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_data VALUES (9, -24926804.045047420) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM num_data +-- !query schema +struct +-- !query output +0 0.0000000000 +1 0.0000000000 +2 -34338492.2153970470 +3 4.3100000000 +4 7799461.4119000000 +5 16397.0384910000 +6 93901.5776302600 +7 -83028485.0000000000 +8 74881.0000000000 +9 -24926804.0450474200 + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, t1.val + t2.val + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output + 
+ + +-- !query +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_add t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val + t2.val, 10) + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 10) as expected + FROM num_result t1, num_exp_add t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 10) +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, t1.val - t2.val + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_sub t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val - t2.val, 40) + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 40) + FROM num_result t1, num_exp_sub t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 40) +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, t1.val, t2.val, t1.val * t2.val + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException 
+`default`.`num_result` requires that the data to be inserted have the same number of columns as the target table: target table has 3 column(s) but the inserted data has 5 column(s), including 0 partition column(s) having constant value(s).; + + +-- !query +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_mul t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val * t2.val, 30) + FROM num_data t1, num_data t2 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 30) as expected + FROM num_result t1, num_exp_mul t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 30) +-- !query schema +struct +-- !query output +2 2 1179132047626883.5968620000 1179132047626883.5968621359 +2 3 -147998901.4483610000 -147998901.4483612726 +2 4 -267821744976817.8111140000 -267821744976817.8111137107 +2 5 -563049578578.7692430000 -563049578578.7692425067 +2 6 -3224438592470.1844980000 -3224438592470.1844981193 +2 7 2851072985828710.4858840000 2851072985828710.4858837950 +2 8 -2571300635581.1462760000 -2571300635581.1462764070 +2 9 855948866655588.4537420000 855948866655588.4537415092 +3 2 -147998901.4483610000 -147998901.4483612726 +3 5 70671.2358960000 70671.2358962100 +3 6 404715.7995860000 404715.7995864206 +3 9 -107434525.4341540000 -107434525.4341543802 +4 2 -267821744976817.8111140000 -267821744976817.8111137107 +4 4 60831598315717.1414620000 60831598315717.1414616100 +4 5 127888068979.9935050000 127888068979.9935054429 +4 6 732381731243.7451160000 732381731243.7451157641 +4 9 -194415646271340.1815960000 -194415646271340.1815956523 +5 2 -563049578578.7692430000 -563049578578.7692425067 +5 3 70671.2358960000 70671.2358962100 +5 4 
127888068979.9935050000 127888068979.9935054429 +5 5 268862871.2753360000 268862871.2753355571 +5 6 1539707782.7689980000 1539707782.7689977863 +5 9 -408725765384.2570440000 -408725765384.2570436602 +6 2 -3224438592470.1844980000 -3224438592470.1844981193 +6 3 404715.7995860000 404715.7995864206 +6 4 732381731243.7451160000 732381731243.7451157641 +6 5 1539707782.7689980000 1539707782.7689977863 +6 6 8817506281.4517450000 8817506281.4517452373 +6 7 -7796505729750.3779560000 -7796505729750.3779561000 +6 8 7031444034.5314990000 7031444034.5314990600 +6 9 -2340666225110.2992950000 -2340666225110.2992952129 +7 2 2851072985828710.4858840000 2851072985828710.4858837950 +7 6 -7796505729750.3779560000 -7796505729750.3779561000 +7 9 2069634775752159.0357590000 2069634775752159.0357587000 +8 2 -2571300635581.1462760000 -2571300635581.1462764070 +8 6 7031444034.5314990000 7031444034.5314990600 +8 9 -1866544013697.1958570000 -1866544013697.1958570200 +9 2 855948866655588.4537420000 855948866655588.4537415092 +9 3 -107434525.4341540000 -107434525.4341543802 +9 4 -194415646271340.1815960000 -194415646271340.1815956523 +9 5 -408725765384.2570440000 -408725765384.2570436602 +9 6 -2340666225110.2992950000 -2340666225110.2992952129 +9 7 2069634775752159.0357590000 2069634775752159.0357587000 +9 8 -1866544013697.1958570000 -1866544013697.1958570200 +9 9 621345559900192.4201210000 621345559900192.4201206300 + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, t1.val / t2.val + FROM num_data t1, num_data t2 + WHERE t2.val != '0.0' +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, t2.expected + FROM num_result t1, num_exp_div t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != t2.expected +-- !query schema +struct +-- !query output +2 3 -7967167.5673780000 -7967167.5673775051 +2 4 -4.4026750000 -4.4026748005 +2 5 -2094.1886690000 
-2094.1886691456 +2 6 -365.6859990000 -365.6859989148 +2 7 0.4135750000 0.4135748378 +2 8 -458.5741670000 -458.5741672173 +2 9 1.3775730000 1.3775729995 +3 2 0.0000000000 -0.0000001255 +3 4 0.0000010000 0.0000005526 +3 5 0.0002630000 0.0002628523 +3 6 0.0000460000 0.0000458991 +3 7 0.0000000000 -0.0000000519 +3 8 0.0000580000 0.0000575580 +3 9 0.0000000000 -0.0000001729 +4 2 -0.2271350000 -0.2271346500 +4 3 1809619.8171460000 1809619.8171461717 +4 5 475.6628100000 475.6628104631 +4 6 83.0599610000 83.0599613844 +4 7 -0.0939370000 -0.0939371760 +4 8 104.1580830000 104.1580829837 +4 9 -0.3128950000 -0.3128945611 +5 2 -0.0004780000 -0.0004775119 +5 3 3804.4172830000 3804.4172832947 +5 4 0.0021020000 0.0021023296 +5 6 0.1746190000 0.1746194143 +5 7 -0.0001970000 -0.0001974869 +5 8 0.2189750000 0.2189746196 +5 9 -0.0006580000 -0.0006578075 +6 2 -0.0027350000 -0.0027345865 +6 3 21786.9089630000 21786.9089629374 +6 4 0.0120390000 0.0120394951 +6 5 5.7267400000 5.7267400867 +6 7 -0.0011310000 -0.0011309562 +6 8 1.2540110000 1.2540107321 +6 9 -0.0037670000 -0.0037670925 +7 2 2.4179420000 2.4179420715 +7 3 -19264149.6519720000 -19264149.6519721578 +7 4 -10.6454130000 -10.6454126273 +7 5 -5063.6268890000 -5063.6268888173 +7 6 -884.2075620000 -884.2075617401 +7 8 -1108.8057720000 -1108.8057718246 +7 9 3.3308920000 3.3308917120 +8 2 -0.0021810000 -0.0021806723 +8 3 17373.7819030000 17373.7819025522 +8 4 0.0096010000 0.0096007911 +8 5 4.5667390000 4.5667392951 +8 6 0.7974410000 0.7974413411 +8 7 -0.0009020000 -0.0009018712 +8 9 -0.0030040000 -0.0030040353 +9 2 0.7259140000 0.7259143438 +9 3 -5783481.2169480000 -5783481.2169483573 +9 4 -3.1959650000 -3.1959647889 +9 5 -1520.2015940000 -1520.2015936432 +9 6 -265.4567120000 -265.4567119543 +9 7 0.3002200000 0.3002199070 +9 8 -332.8855660000 -332.8855656982 + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT t1.id, t2.id, round(t1.val / t2.val, 80) + 
FROM num_data t1, num_data t2 + WHERE t2.val != '0.0' +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.id2, t1.result, round(t2.expected, 80) as expected + FROM num_result t1, num_exp_div t2 + WHERE t1.id1 = t2.id1 AND t1.id2 = t2.id2 + AND t1.result != round(t2.expected, 80) +-- !query schema +struct +-- !query output +2 3 -7967167.5673780000 -7967167.5673775051 +2 4 -4.4026750000 -4.4026748005 +2 5 -2094.1886690000 -2094.1886691456 +2 6 -365.6859990000 -365.6859989148 +2 7 0.4135750000 0.4135748378 +2 8 -458.5741670000 -458.5741672173 +2 9 1.3775730000 1.3775729995 +3 2 0.0000000000 -0.0000001255 +3 4 0.0000010000 0.0000005526 +3 5 0.0002630000 0.0002628523 +3 6 0.0000460000 0.0000458991 +3 7 0.0000000000 -0.0000000519 +3 8 0.0000580000 0.0000575580 +3 9 0.0000000000 -0.0000001729 +4 2 -0.2271350000 -0.2271346500 +4 3 1809619.8171460000 1809619.8171461717 +4 5 475.6628100000 475.6628104631 +4 6 83.0599610000 83.0599613844 +4 7 -0.0939370000 -0.0939371760 +4 8 104.1580830000 104.1580829837 +4 9 -0.3128950000 -0.3128945611 +5 2 -0.0004780000 -0.0004775119 +5 3 3804.4172830000 3804.4172832947 +5 4 0.0021020000 0.0021023296 +5 6 0.1746190000 0.1746194143 +5 7 -0.0001970000 -0.0001974869 +5 8 0.2189750000 0.2189746196 +5 9 -0.0006580000 -0.0006578075 +6 2 -0.0027350000 -0.0027345865 +6 3 21786.9089630000 21786.9089629374 +6 4 0.0120390000 0.0120394951 +6 5 5.7267400000 5.7267400867 +6 7 -0.0011310000 -0.0011309562 +6 8 1.2540110000 1.2540107321 +6 9 -0.0037670000 -0.0037670925 +7 2 2.4179420000 2.4179420715 +7 3 -19264149.6519720000 -19264149.6519721578 +7 4 -10.6454130000 -10.6454126273 +7 5 -5063.6268890000 -5063.6268888173 +7 6 -884.2075620000 -884.2075617401 +7 8 -1108.8057720000 -1108.8057718246 +7 9 3.3308920000 3.3308917120 +8 2 -0.0021810000 -0.0021806723 +8 3 17373.7819030000 17373.7819025522 +8 4 0.0096010000 0.0096007911 +8 5 4.5667390000 4.5667392951 +8 6 0.7974410000 0.7974413411 +8 7 -0.0009020000 -0.0009018712 +8 9 
-0.0030040000 -0.0030040353 +9 2 0.7259140000 0.7259143438 +9 3 -5783481.2169480000 -5783481.2169483573 +9 4 -3.1959650000 -3.1959647889 +9 5 -1520.2015940000 -1520.2015936432 +9 6 -265.4567120000 -265.4567119543 +9 7 0.3002200000 0.3002199070 +9 8 -332.8855660000 -332.8855656982 + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT id, 0, SQRT(ABS(val)) + FROM num_data +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_sqrt t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT id, 0, LN(ABS(val)) + FROM num_data + WHERE val != '0.0' +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_ln t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT id, 0, LOG(cast('10' as decimal(38, 18)), ABS(val)) + FROM num_data + WHERE val != '0.0' +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_log10 t2 + WHERE t1.id1 = t2.id + AND t1.result != t2.expected +-- !query schema +struct +-- !query output + + + +-- !query +TRUNCATE TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_result SELECT id, 0, POWER(cast('10' as decimal(38, 18)), LN(ABS(round(val,200)))) + FROM num_data + WHERE val != '0.0' +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT t1.id1, t1.result, t2.expected + FROM num_result t1, num_exp_power_10_ln t2 + WHERE t1.id1 = 
t2.id + AND t1.result != t2.expected +-- !query schema +struct +-- !query output +2 224790267919917440.0000000000 224790267919917955.1326161858 +4 7405685069595001.0000000000 7405685069594999.0773399947 +5 5068226527.3212630000 5068226527.3212726541 +6 281839893606.9936500000 281839893606.9937234336 +8 167361463828.0749000000 167361463828.0749132007 +9 107511333880051872.0000000000 107511333880052007.0414112467 + + +-- !query +SELECT AVG(val) FROM num_data +-- !query schema +struct +-- !query output +-13430913.59224232070000 + + +-- !query +CREATE TABLE fract_only (id int, val decimal(4,4)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (1, 0.0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (2, 0.1) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (4, -0.9999) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (5, 0.99994) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (7, 0.00001) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO fract_only VALUES (8, 0.00017) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM fract_only +-- !query schema +struct +-- !query output +1 0.0000 +2 0.1000 +4 -0.9999 +5 0.9999 +7 0.0000 +8 0.0002 + + +-- !query +DROP TABLE fract_only +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT decimal(double('NaN')) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT decimal(double('Infinity')) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT decimal(double('-Infinity')) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT decimal(float('NaN')) +-- !query schema +struct +-- !query output +NULL + + +-- !query +SELECT decimal(float('Infinity')) +-- !query schema +struct +-- 
!query output +NULL + + +-- !query +SELECT decimal(float('-Infinity')) +-- !query schema +struct +-- !query output +NULL + + +-- !query +CREATE TABLE ceil_floor_round (a decimal(38, 18)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (-5.5) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (-5.499999) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (9.5) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (9.4999999) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (0.0) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (0.0000001) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO ceil_floor_round VALUES (-0.000001) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT a, ceil(a), ceiling(a), floor(a), round(a) FROM ceil_floor_round +-- !query schema +struct +-- !query output +-0.000001000000000000 0 0 -1 0 +-5.499999000000000000 -5 -5 -6 -5 +-5.500000000000000000 -5 -5 -6 -6 +0.000000000000000000 0 0 0 0 +0.000000100000000000 1 1 0 0 +9.499999900000000000 10 10 9 9 +9.500000000000000000 10 10 9 10 + + +-- !query +DROP TABLE ceil_floor_round +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE num_input_test (n1 decimal(38, 18)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_input_test VALUES (double(trim(' 123'))) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_input_test VALUES (double(trim(' 3245874 '))) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_input_test VALUES (double(trim(' -93853'))) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO 
num_input_test VALUES (555.50) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO num_input_test VALUES (-555.50) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM num_input_test +-- !query schema +struct +-- !query output +-555.500000000000000000 +-93853.000000000000000000 +123.000000000000000000 +3245874.000000000000000000 +555.500000000000000000 + + +-- !query +select cast(999999999999999999999 as decimal(38, 0))/1000000000000000000000 +-- !query schema +struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) / CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,6)> +-- !query output +1.000000 + + +-- !query +select div(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000) +-- !query schema +struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> +-- !query output +0 + + +-- !query +select mod(cast(999999999999999999999 as decimal(38, 0)),1000000000000000000000) +-- !query schema +struct<(CAST(CAST(999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(22,0)> +-- !query output +999999999999999999999 + + +-- !query +select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) +-- !query schema +struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(38,0)> +-- !query output +-9 + + +-- !query +select mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) +-- !query schema +struct<(CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))):decimal(22,0)> +-- !query output +-999999999999999999999 + + +-- !query +select div(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000)*1000000000000000000000 + 
mod(cast(-9999999999999999999999 as decimal(38, 0)),1000000000000000000000) +-- !query schema +struct<(CAST((CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) div CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) * CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0)) + CAST((CAST(CAST(-9999999999999999999999 AS DECIMAL(38,0)) AS DECIMAL(38,0)) % CAST(1000000000000000000000 AS DECIMAL(38,0))) AS DECIMAL(38,0))):decimal(38,0)> +-- !query output +-9999999999999999999999 + + +-- !query +select mod (70.0,70) +-- !query schema +struct<(CAST(70.0 AS DECIMAL(3,1)) % CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(3,1)> +-- !query output +0.0 + + +-- !query +select div (70.0,70) +-- !query schema +struct<(CAST(70.0 AS DECIMAL(3,1)) div CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(2,0)> +-- !query output +1 + + +-- !query +select 70.0 / 70 +-- !query schema +struct<(CAST(70.0 AS DECIMAL(3,1)) / CAST(CAST(70 AS DECIMAL(2,0)) AS DECIMAL(3,1))):decimal(8,6)> +-- !query output +1.000000 + + +-- !query +select 12345678901234567890 % 123 +-- !query schema +struct<(CAST(12345678901234567890 AS DECIMAL(20,0)) % CAST(CAST(123 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> +-- !query output +78 + + +-- !query +select exp(0.0) +-- !query schema +struct +-- !query output +1.0 + + +-- !query +select exp(1.0) +-- !query schema +struct +-- !query output +2.7182818284590455 + + +-- !query +select exp(32.999) +-- !query schema +struct +-- !query output +2.1442904349215556E14 + + +-- !query +select exp(-32.999) +-- !query schema +struct +-- !query output +4.663547361468238E-15 + + +-- !query +select exp(123.456) +-- !query schema +struct +-- !query output +4.132944352778106E53 + + +-- !query +select exp(-123.456) +-- !query schema +struct +-- !query output +2.4195825412645934E-54 + + +-- !query +select exp(1234.5678) +-- !query schema +struct +-- !query output +Infinity + + +-- !query +select * from range(cast(0.0 as 
decimal(38, 18)), cast(4.0 as decimal(38, 18))) +-- !query schema +struct +-- !query output +0 +1 +2 +3 + + +-- !query +select * from range(cast(0.1 as decimal(38, 18)), cast(4.0 as decimal(38, 18)), cast(1.3 as decimal(38, 18))) +-- !query schema +struct +-- !query output +0 +1 +2 +3 + + +-- !query +select * from range(cast(4.0 as decimal(38, 18)), cast(-1.5 as decimal(38, 18)), cast(-2.2 as decimal(38, 18))) +-- !query schema +struct +-- !query output +0 +2 +4 + + +-- !query +select ln(1.2345678e-28) +-- !query schema +struct +-- !query output +-64.26166165451762 + + +-- !query +select ln(0.0456789) +-- !query schema +struct +-- !query output +-3.0861187944847437 + + +-- !query +select ln(0.99949452) +-- !query schema +struct +-- !query output +-5.056077980832118E-4 + + +-- !query +select ln(1.00049687395) +-- !query schema +struct +-- !query output +4.967505490136803E-4 + + +-- !query +select ln(1234.567890123456789) +-- !query schema +struct +-- !query output +7.11847630129779 + + +-- !query +select ln(5.80397490724e5) +-- !query schema +struct +-- !query output +13.271468476626518 + + +-- !query +select ln(9.342536355e34) +-- !query schema +struct +-- !query output +80.52247093552418 + + +-- !query +select log(3.4634998359873254962349856073435545) +-- !query schema +struct +-- !query output +1.2422795911259166 + + +-- !query +select log(9.999999999999999999) +-- !query schema +struct +-- !query output +2.302585092994046 + + +-- !query +select log(10.00000000000000000) +-- !query schema +struct +-- !query output +2.302585092994046 + + +-- !query +select log(10.00000000000000001) +-- !query schema +struct +-- !query output +2.302585092994046 + + +-- !query +select log(590489.45235237) +-- !query schema +struct +-- !query output +13.288707052228641 + + +-- !query +select log(0.99923, 4.58934e34) +-- !query schema +struct +-- !query output +-103611.55579543479 + + +-- !query +select log(1.000016, 8.452010e18) +-- !query schema +struct +-- !query output 
+2723830.287707013 + + +-- !query +SELECT SUM(decimal(9999)) FROM range(1, 100001) +-- !query schema +struct +-- !query output +999900000 + + +-- !query +SELECT SUM(decimal(-9999)) FROM range(1, 100001) +-- !query schema +struct +-- !query output +-999900000 + + +-- !query +DROP TABLE num_data +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_add +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_sub +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_div +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_mul +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_sqrt +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_ln +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_log10 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_exp_power_10_ln +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_result +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE num_input_test +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/select.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select.sql.out similarity index 79% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/select.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/select.sql.out index e54de1d6fdbdc..1e59036b979b4 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/select.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select.sql.out @@ -2,15 +2,15 @@ -- Number of queries: 37 --- !query 0 +-- !query create or replace temporary view onek2 as select * from onek --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create 
or replace temporary view INT8_TBL as select * from values (cast(trim(' 123 ') as bigint), cast(trim(' 456') as bigint)), (cast(trim('123 ') as bigint),cast('4567890123456789' as bigint)), @@ -18,19 +18,19 @@ create or replace temporary view INT8_TBL as select * from values (cast(+4567890123456789 as bigint),cast('4567890123456789' as bigint)), (cast('+4567890123456789' as bigint),cast('-4567890123456789' as bigint)) as INT8_TBL(q1, q2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM onek WHERE onek.unique1 < 10 ORDER BY onek.unique1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 0 998 0 0 0 0 0 0 0 0 0 0 1 AAAAAA KMBAAA OOOOxx 1 214 1 1 1 1 1 1 1 1 1 2 3 BAAAAA GIAAAA OOOOxx 2 326 0 2 2 2 2 2 2 2 2 4 5 CAAAAA OMAAAA OOOOxx @@ -43,13 +43,13 @@ struct --- !query 3 output +-- !query output 19 TAAAAA 18 SAAAAA 17 RAAAAA @@ -72,13 +72,13 @@ struct 0 AAAAAA --- !query 4 +-- !query SELECT onek.unique1, onek.stringu1 FROM onek WHERE onek.unique1 > 980 ORDER BY stringu1 ASC --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 988 AMAAAA 989 BMAAAA 990 CMAAAA @@ -100,13 +100,13 @@ struct 987 ZLAAAA --- !query 5 +-- !query SELECT onek.unique1, onek.string4 FROM onek WHERE onek.unique1 > 980 ORDER BY string4 ASC, unique1 DESC --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 999 AAAAxx 995 AAAAxx 983 AAAAxx @@ -128,13 +128,13 @@ struct 984 VVVVxx --- !query 6 +-- !query SELECT onek.unique1, onek.string4 FROM onek WHERE onek.unique1 > 980 ORDER BY string4 DESC, unique1 ASC --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 984 VVVVxx 989 VVVVxx 992 VVVVxx @@ -156,13 +156,13 @@ struct 999 AAAAxx --- !query 7 +-- !query SELECT onek.unique1, onek.string4 FROM onek WHERE onek.unique1 < 20 ORDER BY unique1 DESC, string4 ASC --- !query 7 schema +-- !query schema struct --- !query 
7 output +-- !query output 19 OOOOxx 18 VVVVxx 17 HHHHxx @@ -185,13 +185,13 @@ struct 0 OOOOxx --- !query 8 +-- !query SELECT onek.unique1, onek.string4 FROM onek WHERE onek.unique1 < 20 ORDER BY unique1 ASC, string4 DESC --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 0 OOOOxx 1 OOOOxx 2 OOOOxx @@ -214,11 +214,11 @@ struct 19 OOOOxx --- !query 9 +-- !query SELECT onek2.* FROM onek2 WHERE onek2.unique1 < 10 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 0 998 0 0 0 0 0 0 0 0 0 0 1 AAAAAA KMBAAA OOOOxx 1 214 1 1 1 1 1 1 1 1 1 2 3 BAAAAA GIAAAA OOOOxx 2 326 0 2 2 2 2 2 2 2 2 4 5 CAAAAA OMAAAA OOOOxx @@ -231,13 +231,13 @@ struct --- !query 10 output +-- !query output 19 TAAAAA 18 SAAAAA 17 RAAAAA @@ -260,12 +260,12 @@ struct 0 AAAAAA --- !query 11 +-- !query SELECT onek2.unique1, onek2.stringu1 FROM onek2 WHERE onek2.unique1 > 980 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 981 TLAAAA 982 ULAAAA 983 VLAAAA @@ -287,94 +287,94 @@ struct 999 LMAAAA --- !query 12 +-- !query CREATE TABLE tmp USING parquet AS SELECT two, stringu1, ten, string4 FROM onek --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query select foo.* from (select 1) as foo --- !query 13 schema +-- !query schema struct<1:int> --- !query 13 output +-- !query output 1 --- !query 14 +-- !query select foo.* from (select null) as foo --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output NULL --- !query 15 +-- !query select foo.* from (select 'xyzzy',1,null) as foo --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output xyzzy 1 NULL --- !query 16 +-- !query select * from onek, values(147, 'RFAAAA'), (931, 'VJAAAA') as v (i, j) WHERE onek.unique1 = v.i and onek.stringu1 = v.j --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 147 0 1 3 7 7 7 47 147 147 
147 14 15 RFAAAA AAAAAA AAAAxx 147 RFAAAA 931 1 1 3 1 11 1 31 131 431 931 2 3 VJAAAA BAAAAA HHHHxx 931 VJAAAA --- !query 17 +-- !query VALUES (1,2), (3,4+4), (7,77.7) --- !query 17 schema +-- !query schema struct --- !query 17 output -1 2 -3 8 +-- !query output +1 2.0 +3 8.0 7 77.7 --- !query 18 +-- !query VALUES (1,2), (3,4+4), (7,77.7) UNION ALL SELECT 2+2, 57 UNION ALL TABLE int8_tbl --- !query 18 schema +-- !query schema struct --- !query 18 output -1 2 -123 456 -123 4567890123456789 -3 8 -4 57 -4567890123456789 -4567890123456789 -4567890123456789 123 -4567890123456789 4567890123456789 +-- !query output +1 2.0 +123 456.0 +123 4567890123456789.0 +3 8.0 +4 57.0 +4567890123456789 -4567890123456789.0 +4567890123456789 123.0 +4567890123456789 4567890123456789.0 7 77.7 --- !query 19 +-- !query CREATE OR REPLACE TEMPORARY VIEW foo AS SELECT * FROM (values(42),(3),(10),(7),(null),(null),(1)) as foo (f1) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT * FROM foo ORDER BY f1 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL NULL 1 @@ -384,11 +384,11 @@ NULL 42 --- !query 21 +-- !query SELECT * FROM foo ORDER BY f1 ASC --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL NULL 1 @@ -398,11 +398,11 @@ NULL 42 --- !query 22 +-- !query SELECT * FROM foo ORDER BY f1 NULLS FIRST --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL NULL 1 @@ -412,11 +412,11 @@ NULL 42 --- !query 23 +-- !query SELECT * FROM foo ORDER BY f1 DESC --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 42 10 7 @@ -426,11 +426,11 @@ NULL NULL --- !query 24 +-- !query SELECT * FROM foo ORDER BY f1 DESC NULLS LAST --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 42 10 7 @@ -440,103 +440,103 @@ NULL NULL --- !query 25 +-- !query select * from onek2 where 
unique2 = 11 and stringu1 = 'ATAAAA' --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 494 11 0 2 4 14 4 94 94 494 494 8 9 ATAAAA LAAAAA VVVVxx --- !query 26 +-- !query select unique2 from onek2 where unique2 = 11 and stringu1 = 'ATAAAA' --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 11 --- !query 27 +-- !query select * from onek2 where unique2 = 11 and stringu1 < 'B' --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 494 11 0 2 4 14 4 94 94 494 494 8 9 ATAAAA LAAAAA VVVVxx --- !query 28 +-- !query select unique2 from onek2 where unique2 = 11 and stringu1 < 'B' --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 11 --- !query 29 +-- !query select unique2 from onek2 where unique2 = 11 and stringu1 < 'C' --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 11 --- !query 30 +-- !query select unique2 from onek2 where unique2 = 11 and stringu1 < 'B' --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 11 --- !query 31 +-- !query select unique1, unique2 from onek2 where (unique2 = 11 or unique1 = 0) and stringu1 < 'B' --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 0 998 494 11 --- !query 32 +-- !query select unique1, unique2 from onek2 where (unique2 = 11 and stringu1 < 'B') or unique1 = 0 --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 0 998 494 11 --- !query 33 +-- !query SELECT 1 AS x ORDER BY x --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 1 --- !query 34 +-- !query select * from (values (2),(null),(1)) v(k) where k = k order by k --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 1 2 --- !query 35 +-- !query select * from (values (2),(null),(1)) v(k) where k = k --- !query 35 schema +-- !query schema struct --- !query 35 
output +-- !query output 1 2 --- !query 36 +-- !query drop table tmp --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_distinct.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_distinct.sql.out similarity index 69% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/select_distinct.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/select_distinct.sql.out index 38eae1739f553..53003e70f289a 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_distinct.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_distinct.sql.out @@ -2,30 +2,30 @@ -- Number of queries: 19 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW tmp AS SELECT two, stringu1, ten, string4 FROM onek --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT DISTINCT two FROM tmp ORDER BY 1 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 0 1 --- !query 2 +-- !query SELECT DISTINCT ten FROM tmp ORDER BY 1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 0 1 2 @@ -38,24 +38,24 @@ struct 9 --- !query 3 +-- !query SELECT DISTINCT string4 FROM tmp ORDER BY 1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output AAAAxx HHHHxx OOOOxx VVVVxx --- !query 4 +-- !query SELECT DISTINCT two, string4, ten FROM tmp ORDER BY two ASC, string4 ASC, ten ASC --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 0 AAAAxx 0 0 AAAAxx 2 0 AAAAxx 4 @@ -98,128 +98,128 @@ struct 1 VVVVxx 9 --- !query 5 +-- !query SELECT count(*) FROM (SELECT DISTINCT two, four, two FROM tenk1) ss --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 4 --- !query 6 +-- !query CREATE OR REPLACE TEMPORARY VIEW disttable AS 
SELECT * FROM (VALUES (1), (2), (3), (NULL)) AS v(f1) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query SELECT f1, f1 IS DISTINCT FROM 2 as `not 2` FROM disttable --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 true 2 false 3 true NULL true --- !query 8 +-- !query SELECT f1, f1 IS DISTINCT FROM NULL as `not null` FROM disttable --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 true 2 true 3 true NULL false --- !query 9 +-- !query SELECT f1, f1 IS DISTINCT FROM f1 as `false` FROM disttable --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 false 2 false 3 false NULL false --- !query 10 +-- !query SELECT f1, f1 IS DISTINCT FROM f1+1 as `not null` FROM disttable --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 true 2 true 3 true NULL false --- !query 11 +-- !query SELECT 1 IS DISTINCT FROM 2 as `yes` --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output true --- !query 12 +-- !query SELECT 2 IS DISTINCT FROM 2 as `no` --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output false --- !query 13 +-- !query SELECT 2 IS DISTINCT FROM null as `yes` --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output true --- !query 14 +-- !query SELECT null IS DISTINCT FROM null as `no` --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output false --- !query 15 +-- !query SELECT 1 IS NOT DISTINCT FROM 2 as `no` --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output false --- !query 16 +-- !query SELECT 2 IS NOT DISTINCT FROM 2 as `yes` --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output true --- !query 17 +-- !query SELECT 2 IS NOT DISTINCT FROM null as `no` --- !query 17 schema +-- !query schema struct --- 
!query 17 output +-- !query output false --- !query 18 +-- !query SELECT null IS NOT DISTINCT FROM null as `yes` --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output true diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_having.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out similarity index 65% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/select_having.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out index 02536ebd8ebea..cbf4cfa58cdb9 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_having.sql.out @@ -2,186 +2,186 @@ -- Number of queries: 22 --- !query 0 +-- !query CREATE TABLE test_having (a int, b int, c string, d string) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO test_having VALUES (0, 1, 'XXXX', 'A') --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO test_having VALUES (1, 2, 'AAAA', 'b') --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO test_having VALUES (2, 2, 'AAAA', 'c') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO test_having VALUES (3, 3, 'BBBB', 'D') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO test_having VALUES (4, 3, 'BBBB', 'e') --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO test_having VALUES (5, 3, 'bbbb', 'F') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO 
test_having VALUES (6, 4, 'cccc', 'g') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO test_having VALUES (7, 4, 'cccc', 'h') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO test_having VALUES (8, 4, 'CCCC', 'I') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO test_having VALUES (9, 4, 'CCCC', 'j') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT b, c FROM test_having GROUP BY b, c HAVING count(*) = 1 ORDER BY b, c --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 XXXX 3 bbbb --- !query 12 +-- !query SELECT b, c FROM test_having GROUP BY b, c HAVING b = 3 ORDER BY b, c --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 3 BBBB 3 bbbb --- !query 13 +-- !query SELECT c, max(a) FROM test_having GROUP BY c HAVING count(*) > 2 OR min(a) = max(a) ORDER BY c --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output XXXX 0 bbbb 5 --- !query 14 +-- !query SELECT min(a), max(a) FROM test_having HAVING min(a) = max(a) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT min(a), max(a) FROM test_having HAVING min(a) < max(a) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 0 9 --- !query 16 +-- !query SELECT a FROM test_having HAVING min(a) < max(a) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and 'default.test_having.`a`' is not an aggregate function. 
Wrap '(min(default.test_having.`a`) AS `min(a#x)`, max(default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; --- !query 17 +-- !query SELECT 1 AS one FROM test_having HAVING a > 1 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a`' given input columns: [one]; line 1 pos 40 --- !query 18 +-- !query SELECT 1 AS one FROM test_having HAVING 1 > 2 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT 1 AS one FROM test_having HAVING 1 < 2 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 1 --- !query 20 +-- !query SELECT 1 AS one FROM test_having WHERE 1/a = 1 HAVING 1 < 2 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1 --- !query 21 +-- !query DROP TABLE test_having --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_implicit.sql.out old mode 100644 new mode 100755 similarity index 67% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/select_implicit.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/select_implicit.sql.out index 0675820b381da..4ecfabccdf414 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/select_implicit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/select_implicit.sql.out @@ -2,99 +2,99 @@ -- Number of queries: 38 --- !query 0 +-- !query CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO 
test_missing_target VALUES (0, 1, 'XXXX', 'A') --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO test_missing_target VALUES (1, 2, 'ABAB', 'b') --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO test_missing_target VALUES (2, 2, 'ABAB', 'c') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO test_missing_target VALUES (3, 3, 'BBBB', 'D') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO test_missing_target VALUES (4, 3, 'BBBB', 'e') --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO test_missing_target VALUES (5, 3, 'bbbb', 'F') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO test_missing_target VALUES (6, 4, 'cccc', 'g') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO test_missing_target VALUES (7, 4, 'cccc', 'h') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO test_missing_target VALUES (8, 4, 'CCCC', 'I') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT c, count(*) FROM test_missing_target GROUP BY test_missing_target.c ORDER BY c --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output ABAB 2 BBBB 2 CCCC 2 @@ -103,11 +103,11 @@ bbbb 1 cccc 2 --- !query 12 +-- !query SELECT count(*) FROM test_missing_target GROUP BY test_missing_target.c 
ORDER BY c --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 2 2 2 @@ -116,43 +116,43 @@ struct 2 --- !query 13 +-- !query SELECT count(*) FROM test_missing_target GROUP BY a ORDER BY b --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`b`' given input columns: [count(1)]; line 1 pos 61 --- !query 14 +-- !query SELECT count(*) FROM test_missing_target GROUP BY b ORDER BY b --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 2 3 4 --- !query 15 +-- !query SELECT test_missing_target.b, count(*) FROM test_missing_target GROUP BY b ORDER BY b --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 1 2 2 3 3 4 4 --- !query 16 +-- !query SELECT c FROM test_missing_target ORDER BY a --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output XXXX ABAB ABAB @@ -165,30 +165,30 @@ CCCC CCCC --- !query 17 +-- !query SELECT count(*) FROM test_missing_target GROUP BY b ORDER BY b desc --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 4 3 2 1 --- !query 18 +-- !query SELECT count(*) FROM test_missing_target ORDER BY 1 desc --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 10 --- !query 19 +-- !query SELECT c, count(*) FROM test_missing_target GROUP BY 1 ORDER BY 1 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output ABAB 2 BBBB 2 CCCC 2 @@ -197,32 +197,32 @@ bbbb 1 cccc 2 --- !query 20 +-- !query SELECT c, count(*) FROM test_missing_target GROUP BY 3 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 53 --- !query 21 +-- !query SELECT count(*) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP 
BY b ORDER BY b --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 --- !query 22 +-- !query SELECT a, a FROM test_missing_target ORDER BY a --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output 0 0 1 1 2 2 @@ -235,123 +235,129 @@ struct 9 9 --- !query 23 +-- !query SELECT a/2, a/2 FROM test_missing_target ORDER BY a/2 --- !query 23 schema -struct<(a div 2):int,(a div 2):int> --- !query 23 output -0 0 -0 0 -1 1 -1 1 -2 2 -2 2 -3 3 -3 3 -4 4 -4 4 - - --- !query 24 +-- !query schema +struct<(CAST(a AS DOUBLE) / CAST(2 AS DOUBLE)):double,(CAST(a AS DOUBLE) / CAST(2 AS DOUBLE)):double> +-- !query output +0.0 0.0 +0.5 0.5 +1.0 1.0 +1.5 1.5 +2.0 2.0 +2.5 2.5 +3.0 3.0 +3.5 3.5 +4.0 4.0 +4.5 4.5 + + +-- !query SELECT a/2, a/2 FROM test_missing_target GROUP BY a/2 ORDER BY a/2 --- !query 24 schema -struct<(a div 2):int,(a div 2):int> --- !query 24 output -0 0 -1 1 -2 2 -3 3 -4 4 - - --- !query 25 +-- !query schema +struct<(CAST(a AS DOUBLE) / CAST(2 AS DOUBLE)):double,(CAST(a AS DOUBLE) / CAST(2 AS DOUBLE)):double> +-- !query output +0.0 0.0 +0.5 0.5 +1.0 1.0 +1.5 1.5 +2.0 2.0 +2.5 2.5 +3.0 3.0 +3.5 3.5 +4.0 4.0 +4.5 4.5 + + +-- !query SELECT x.b, count(*) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP BY x.b ORDER BY x.b --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 1 2 2 3 3 4 4 --- !query 26 +-- !query SELECT count(*) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP BY x.b ORDER BY x.b --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1 2 3 4 --- !query 27 +-- !query SELECT a%2, count(b) FROM test_missing_target GROUP BY test_missing_target.a%2 ORDER BY test_missing_target.a%2 --- !query 27 schema +-- !query schema struct<(a % 2):int,count(b):bigint> --- !query 27 output +-- !query 
output 0 5 1 5 --- !query 28 +-- !query SELECT count(c) FROM test_missing_target GROUP BY lower(test_missing_target.c) ORDER BY lower(test_missing_target.c) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 2 3 4 1 --- !query 29 +-- !query SELECT count(a) FROM test_missing_target GROUP BY a ORDER BY b --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`b`' given input columns: [count(a)]; line 1 pos 61 --- !query 30 +-- !query SELECT count(b) FROM test_missing_target GROUP BY b/2 ORDER BY b/2 --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1 -5 +2 +3 4 --- !query 31 +-- !query SELECT lower(test_missing_target.c), count(c) FROM test_missing_target GROUP BY lower(c) ORDER BY lower(c) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output abab 2 bbbb 3 cccc 4 xxxx 1 --- !query 32 +-- !query SELECT a FROM test_missing_target ORDER BY upper(d) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 0 1 2 @@ -364,53 +370,56 @@ struct 9 --- !query 33 +-- !query SELECT count(b) FROM test_missing_target GROUP BY (b + 1) / 2 ORDER BY (b + 1) / 2 desc --- !query 33 schema +-- !query schema struct --- !query 33 output -7 +-- !query output +4 3 +2 +1 --- !query 34 +-- !query SELECT count(x.a) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP BY b/2 ORDER BY b/2 --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 10 --- !query 35 +-- !query SELECT x.b/2, count(x.b) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP BY x.b/2 ORDER BY x.b/2 --- !query 35 schema -struct<(b div 2):int,count(b):bigint> --- !query 35 output -0 1 -1 5 -2 4 +-- !query schema +struct<(CAST(b AS DOUBLE) / CAST(2 
AS DOUBLE)):double,count(b):bigint> +-- !query output +0.5 1 +1.0 2 +1.5 3 +2.0 4 --- !query 36 +-- !query SELECT count(b) FROM test_missing_target x, test_missing_target y WHERE x.a = y.a GROUP BY x.b/2 --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 1 pos 13 --- !query 37 +-- !query DROP TABLE test_missing_target --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out new file mode 100644 index 0000000000000..c30eea8ab689d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/strings.sql.out @@ -0,0 +1,998 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 122 + + +-- !query +SELECT 'first line' +' - next line' + ' - third line' + AS `Three lines to one` +-- !query schema +struct +-- !query output +first line - next line - third line + + +-- !query +SELECT 'first line' +' - next line' /* this comment is not allowed here */ +' - third line' + AS `Illegal comment within continuation` +-- !query schema +struct +-- !query output +first line - next line - third line + + +-- !query +SELECT binary('\\xDeAdBeEf') +-- !query schema +struct +-- !query output +\xDeAdBeEf + + +-- !query +SELECT binary('\\x De Ad Be Ef ') +-- !query schema +struct +-- !query output +\x De Ad Be Ef + + +-- !query +SELECT binary('\\xDe00BeEf') +-- !query schema +struct +-- !query output +\xDe00BeEf + + +-- !query +SELECT binary('DeAdBeEf') +-- !query schema +struct +-- !query output +DeAdBeEf + + +-- !query +SELECT binary('De\\000dBeEf') +-- !query schema +struct +-- !query output +De\000dBeEf + + +-- !query +SELECT binary('De\\123dBeEf') +-- !query schema +struct +-- !query output +De\123dBeEf + + +-- !query +SELECT TRIM(BOTH 
FROM ' bunch o blanks ') = 'bunch o blanks' AS `bunch o blanks` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT TRIM(LEADING FROM ' bunch o blanks ') = 'bunch o blanks ' AS `bunch o blanks ` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT TRIM(TRAILING FROM ' bunch o blanks ') = ' bunch o blanks' AS ` bunch o blanks` +-- !query schema +struct< bunch o blanks:boolean> +-- !query output +true + + +-- !query +SELECT TRIM(BOTH 'x' FROM 'xxxxxsome Xsxxxxx') = 'some Xs' AS `some Xs` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT SUBSTRING('1234567890' FROM 3) = '34567890' AS `34567890` +-- !query schema +struct<34567890:boolean> +-- !query output +true + + +-- !query +SELECT SUBSTRING('1234567890' FROM 4 FOR 3) = '456' AS `456` +-- !query schema +struct<456:boolean> +-- !query output +true + + +-- !query +SELECT POSITION('4' IN '1234567890') = '4' AS `4` +-- !query schema +struct<4:boolean> +-- !query output +true + + +-- !query +SELECT POSITION('5' IN '1234567890') = '5' AS `5` +-- !query schema +struct<5:boolean> +-- !query output +true + + +-- !query +SELECT OVERLAY('abcdef' PLACING '45' FROM 4) AS `abc45f` +-- !query schema +struct +-- !query output +abc45f + + +-- !query +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5) AS `yabadaba` +-- !query schema +struct +-- !query output +yabadaba + + +-- !query +SELECT OVERLAY('yabadoo' PLACING 'daba' FROM 5 FOR 0) AS `yabadabadoo` +-- !query schema +struct +-- !query output +yabadabadoo + + +-- !query +SELECT OVERLAY('babosa' PLACING 'ubb' FROM 2 FOR 4) AS `bubba` +-- !query schema +struct +-- !query output +bubba + + +-- !query +SELECT 'hawkeye' LIKE 'h%' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' NOT LIKE 'h%' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'hawkeye' LIKE 'H%' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 
'hawkeye' NOT LIKE 'H%' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' LIKE 'indio%' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'hawkeye' NOT LIKE 'indio%' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' LIKE 'h%eye' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' NOT LIKE 'h%eye' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' LIKE '_ndio' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'indio' NOT LIKE '_ndio' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' LIKE 'in__o' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'indio' NOT LIKE 'in__o' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' LIKE 'in_o' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' NOT LIKE 'in_o' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' LIKE 'h%' ESCAPE '#' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'hawkeye' NOT LIKE 'h%' ESCAPE '#' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' LIKE 'ind_o' ESCAPE '$' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'indio' NOT LIKE 'ind_o' ESCAPE '$' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'h%' LIKE 'h#%' ESCAPE '#' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'h%' NOT LIKE 'h#%' ESCAPE '#' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'h%wkeye' LIKE 'h#%' ESCAPE '#' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 
'h%wkeye' NOT LIKE 'h#%' ESCAPE '#' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'h%wkeye' LIKE 'h#%%' ESCAPE '#' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'h%wkeye' NOT LIKE 'h#%%' ESCAPE '#' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'h%awkeye' LIKE 'h#%a%k%e' ESCAPE '#' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'h%awkeye' NOT LIKE 'h#%a%k%e' ESCAPE '#' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'indio' LIKE '_ndio' ESCAPE '$' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'indio' NOT LIKE '_ndio' ESCAPE '$' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'i_dio' LIKE 'i$_d_o' ESCAPE '$' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'i_dio' NOT LIKE 'i$_d_o' ESCAPE '$' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'i_dio' LIKE 'i$_nd_o' ESCAPE '$' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'i_dio' NOT LIKE 'i$_nd_o' ESCAPE '$' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'i_dio' LIKE 'i$_d%o' ESCAPE '$' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'i_dio' NOT LIKE 'i$_d%o' ESCAPE '$' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'maca' LIKE 'm%aca' ESCAPE '%' AS `true` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'm%aca' is invalid, the escape character is not allowed to precede 'a'; + + +-- !query +SELECT 'maca' NOT LIKE 'm%aca' ESCAPE '%' AS `false` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'm%aca' is invalid, the escape character is not allowed to 
precede 'a'; + + +-- !query +SELECT 'ma%a' LIKE 'm%a%%a' ESCAPE '%' AS `true` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'm%a%%a' is invalid, the escape character is not allowed to precede 'a'; + + +-- !query +SELECT 'ma%a' NOT LIKE 'm%a%%a' ESCAPE '%' AS `false` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'm%a%%a' is invalid, the escape character is not allowed to precede 'a'; + + +-- !query +SELECT 'bear' LIKE 'b_ear' ESCAPE '_' AS `true` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'b_ear' is invalid, the escape character is not allowed to precede 'e'; + + +-- !query +SELECT 'bear' NOT LIKE 'b_ear' ESCAPE '_' AS `false` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'b_ear' is invalid, the escape character is not allowed to precede 'e'; + + +-- !query +SELECT 'be_r' LIKE 'b_e__r' ESCAPE '_' AS `true` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'b_e__r' is invalid, the escape character is not allowed to precede 'e'; + + +-- !query +SELECT 'be_r' NOT LIKE 'b_e__r' ESCAPE '_' AS `false` +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +the pattern 'b_e__r' is invalid, the escape character is not allowed to precede 'e'; + + +-- !query +SELECT 'be_r' LIKE '__e__r' ESCAPE '_' AS `false` +-- !query schema +struct +-- !query output +false + + +-- !query +SELECT 'be_r' NOT LIKE '__e__r' ESCAPE '_' AS `true` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'foo' LIKE '_%' as t, 'f' LIKE '_%' as t, '' LIKE '_%' as f +-- !query schema +struct +-- !query output +true true false + + +-- !query +SELECT 'foo' LIKE '%_' as t, 'f' LIKE '%_' as t, '' LIKE '%_' as f +-- !query schema +struct +-- !query output +true true false + + +-- !query +SELECT 'foo' 
LIKE '__%' as t, 'foo' LIKE '___%' as t, 'foo' LIKE '____%' as f +-- !query schema +struct +-- !query output +true true false + + +-- !query +SELECT 'foo' LIKE '%__' as t, 'foo' LIKE '%___' as t, 'foo' LIKE '%____' as f +-- !query schema +struct +-- !query output +true true false + + +-- !query +SELECT 'jack' LIKE '%____%' AS t +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT 'unknown' || ' and unknown' AS `Concat unknown types` +-- !query schema +struct +-- !query output +unknown and unknown + + +-- !query +SELECT string('text') || ' and unknown' AS `Concat text to unknown type` +-- !query schema +struct +-- !query output +text and unknown + + +-- !query +CREATE TABLE toasttest(f1 string) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into toasttest values(repeat('1234567890',10000)) +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into toasttest values(repeat('1234567890',10000)) +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into toasttest values(repeat('1234567890',10000)) +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into toasttest values(repeat('1234567890',10000)) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT substr(f1, 99995) from toasttest +-- !query schema +struct +-- !query output +567890 +567890 +567890 +567890 + + +-- !query +SELECT substr(f1, 99995, 10) from toasttest +-- !query schema +struct +-- !query output +567890 +567890 +567890 +567890 + + +-- !query +SELECT length('abcdef') AS `length_6` +-- !query schema +struct +-- !query output +6 + + +-- !query +SELECT position('cd', 'abcdef') AS `pos_3` +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT position('xy', 'abcdef') AS `pos_0` +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT replace('abcdef', 'de', '45') AS `abc45f` +-- !query schema +struct +-- !query output +abc45f + + +-- !query +SELECT 
replace('yabadabadoo', 'ba', '123') AS `ya123da123doo` +-- !query schema +struct +-- !query output +ya123da123doo + + +-- !query +SELECT replace('yabadoo', 'bad', '') AS `yaoo` +-- !query schema +struct +-- !query output +yaoo + + +-- !query +select hex(256*256*256 - 1) AS `ffffff` +-- !query schema +struct +-- !query output +FFFFFF + + +-- !query +select hex(bigint(bigint(bigint(bigint(256)*256)*256)*256) - 1) AS `ffffffff` +-- !query schema +struct +-- !query output +FFFFFFFF + + +-- !query +select md5('') = 'd41d8cd98f00b204e9800998ecf8427e' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('a') = '0cc175b9c0f1b6a831c399e269772661' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('abc') = '900150983cd24fb0d6963f7d28e17f72' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('message digest') = 'f96b697d7cb7938d525a2f31aaf161d0' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('abcdefghijklmnopqrstuvwxyz') = 'c3fcd3d76192e4007dfb496cca67e13b' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789') = 'd174ab98d277d9f5a5611c2c9f419d9f' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5('12345678901234567890123456789012345678901234567890123456789012345678901234567890') = '57edf4a22be3c955ac49da2e2107b67a' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('')) = 'd41d8cd98f00b204e9800998ecf8427e' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('a')) = '0cc175b9c0f1b6a831c399e269772661' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('abc')) = '900150983cd24fb0d6963f7d28e17f72' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select 
md5(binary('message digest')) = 'f96b697d7cb7938d525a2f31aaf161d0' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('abcdefghijklmnopqrstuvwxyz')) = 'c3fcd3d76192e4007dfb496cca67e13b' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')) = 'd174ab98d277d9f5a5611c2c9f419d9f' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +select md5(binary('12345678901234567890123456789012345678901234567890123456789012345678901234567890')) = '57edf4a22be3c955ac49da2e2107b67a' AS `TRUE` +-- !query schema +struct +-- !query output +true + + +-- !query +SELECT initcap('hi THOMAS') +-- !query schema +struct +-- !query output +Hi Thomas + + +-- !query +SELECT lpad('hi', 5, 'xy') +-- !query schema +struct +-- !query output +xyxhi + + +-- !query +SELECT lpad('hi', 5) +-- !query schema +struct +-- !query output + hi + + +-- !query +SELECT lpad('hi', -5, 'xy') +-- !query schema +struct +-- !query output + + + +-- !query +SELECT lpad('hello', 2) +-- !query schema +struct +-- !query output +he + + +-- !query +SELECT lpad('hi', 5, '') +-- !query schema +struct +-- !query output +hi + + +-- !query +SELECT rpad('hi', 5, 'xy') +-- !query schema +struct +-- !query output +hixyx + + +-- !query +SELECT rpad('hi', 5) +-- !query schema +struct +-- !query output +hi + + +-- !query +SELECT rpad('hi', -5, 'xy') +-- !query schema +struct +-- !query output + + + +-- !query +SELECT rpad('hello', 2) +-- !query schema +struct +-- !query output +he + + +-- !query +SELECT rpad('hi', 5, '') +-- !query schema +struct +-- !query output +hi + + +-- !query +SELECT ltrim('zzzytrim', 'xyz') +-- !query schema +struct +-- !query output +trim + + +-- !query +SELECT translate('', '14', 'ax') +-- !query schema +struct +-- !query output + + + +-- !query +SELECT translate('12345', '14', 'ax') +-- !query schema +struct +-- !query output +a23x5 + + 
+-- !query +SELECT ascii('x') +-- !query schema +struct +-- !query output +120 + + +-- !query +SELECT ascii('') +-- !query schema +struct +-- !query output +0 + + +-- !query +SELECT chr(65) +-- !query schema +struct +-- !query output +A + + +-- !query +SELECT chr(0) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT repeat('Pg', 4) +-- !query schema +struct +-- !query output +PgPgPgPg + + +-- !query +SELECT repeat('Pg', -4) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT trim(binary('\\000') from binary('\\000Tom\\000')) +-- !query schema +struct +-- !query output +Tom + + +-- !query +DROP TABLE toasttest +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/text.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/text.sql.out old mode 100644 new mode 100755 similarity index 63% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/text.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/text.sql.out index 352b0232e8945..ccca1ba8cd8b4 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/text.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/text.sql.out @@ -1,168 +1,162 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 44 +-- Number of queries: 42 --- !query 0 +-- !query SELECT string('this is a text string') = string('this is a text string') AS true --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output true --- !query 1 +-- !query SELECT string('this is a text string') = string('this is a text strin') AS `false` --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output false --- !query 2 +-- !query CREATE TABLE TEXT_TBL (f1 string) USING parquet --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO TEXT_TBL VALUES ('doh!') --- !query 3 schema +-- !query 
schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO TEXT_TBL VALUES ('hi de ho neighbor') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT '' AS two, * FROM TEXT_TBL --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output doh! hi de ho neighbor --- !query 6 +-- !query select length(42) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 2 --- !query 7 +-- !query select string('four: ') || 2+2 --- !query 7 schema -struct<(CAST(concat(CAST(four: AS STRING), CAST(2 AS STRING)) AS DOUBLE) + CAST(2 AS DOUBLE)):double> --- !query 7 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: four: 2 --- !query 8 +-- !query select 'four: ' || 2+2 --- !query 8 schema -struct<(CAST(concat(four: , CAST(2 AS STRING)) AS DOUBLE) + CAST(2 AS DOUBLE)):double> --- !query 8 output -NULL +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax for type numeric: four: 2 --- !query 9 +-- !query select 3 || 4.0 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 34.0 --- !query 10 +-- !query /* * various string functions */ select concat('one') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output one --- !query 11 +-- !query select concat(1,2,3,'hello',true, false, to_date('20100309','yyyyMMdd')) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 123hellotruefalse2010-03-09 --- !query 12 +-- !query select concat_ws('#','one') --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output one --- !query 13 +-- !query select concat_ws('#',1,2,3,'hello',true, false, to_date('20100309','yyyyMMdd')) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 
1#x#x#hello#true#false#x-03-09 --- !query 14 +-- !query select concat_ws(',',10,20,null,30) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 10,20,30 --- !query 15 +-- !query select concat_ws('',10,20,null,30) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 102030 --- !query 16 +-- !query select concat_ws(NULL,10,20,null,30) is null --- !query 16 schema +-- !query schema struct<(concat_ws(CAST(NULL AS STRING), CAST(10 AS STRING), CAST(20 AS STRING), NULL, CAST(30 AS STRING)) IS NULL):boolean> --- !query 16 output +-- !query output true --- !query 17 +-- !query select reverse('abcde') --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output edcba --- !query 18 -set spark.sql.parser.ansi.enabled=false --- !query 18 schema -struct --- !query 18 output -spark.sql.parser.ansi.enabled false - - --- !query 19 +-- !query select i, left('ahoj', i), right('ahoj', i) from range(-5, 6) t(i) order by i --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output -5 -4 -3 @@ -176,200 +170,192 @@ struct 5 ahoj ahoj --- !query 20 -set spark.sql.parser.ansi.enabled=true --- !query 20 schema -struct --- !query 20 output -spark.sql.parser.ansi.enabled true - - --- !query 21 +-- !query /* * format */ select format_string(NULL) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL --- !query 22 +-- !query select format_string('Hello') --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output Hello --- !query 23 +-- !query select format_string('Hello %s', 'World') --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output Hello World --- !query 24 +-- !query select format_string('Hello %%') --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output Hello % --- !query 25 +-- !query select format_string('Hello %%%%') --- !query 25 schema +-- !query 
schema struct --- !query 25 output +-- !query output Hello %% --- !query 26 +-- !query select format_string('Hello %s %s', 'World') --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output java.util.MissingFormatArgumentException Format specifier '%s' --- !query 27 +-- !query select format_string('Hello %s') --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output java.util.MissingFormatArgumentException Format specifier '%s' --- !query 28 +-- !query select format_string('Hello %x', 20) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output Hello 14 --- !query 29 +-- !query select format_string('%1$s %3$s', 1, 2, 3) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1 3 --- !query 30 +-- !query select format_string('%1$s %12$s', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1 12 --- !query 31 +-- !query select format_string('%1$s %4$s', 1, 2, 3) --- !query 31 schema +-- !query schema struct<> --- !query 31 output +-- !query output java.util.MissingFormatArgumentException Format specifier '%4$s' --- !query 32 +-- !query select format_string('%1$s %13$s', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) --- !query 32 schema +-- !query schema struct<> --- !query 32 output +-- !query output java.util.MissingFormatArgumentException Format specifier '%13$s' --- !query 33 +-- !query select format_string('%0$s', 'Hello') --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output Hello --- !query 34 +-- !query select format_string('Hello %s %1$s %s', 'World', 'Hello again') --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output Hello World World Hello again --- !query 35 +-- !query select format_string('Hello %s %s, %2$s %2$s', 'World', 'Hello again') --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query 
output Hello World Hello again, Hello again Hello again --- !query 36 +-- !query select format_string('>>%10s<<', 'Hello') --- !query 36 schema +-- !query schema struct>%10s<<, Hello):string> --- !query 36 output +-- !query output >> Hello<< --- !query 37 +-- !query select format_string('>>%10s<<', NULL) --- !query 37 schema +-- !query schema struct>%10s<<, NULL):string> --- !query 37 output +-- !query output >> null<< --- !query 38 +-- !query select format_string('>>%10s<<', '') --- !query 38 schema +-- !query schema struct>%10s<<, ):string> --- !query 38 output +-- !query output >> << --- !query 39 +-- !query select format_string('>>%-10s<<', '') --- !query 39 schema +-- !query schema struct>%-10s<<, ):string> --- !query 39 output +-- !query output >> << --- !query 40 +-- !query select format_string('>>%-10s<<', 'Hello') --- !query 40 schema +-- !query schema struct>%-10s<<, Hello):string> --- !query 40 output +-- !query output >>Hello << --- !query 41 +-- !query select format_string('>>%-10s<<', NULL) --- !query 41 schema +-- !query schema struct>%-10s<<, NULL):string> --- !query 41 output +-- !query output >>null << --- !query 42 +-- !query select format_string('>>%1$10s<<', 'Hello') --- !query 42 schema +-- !query schema struct>%1$10s<<, Hello):string> --- !query 42 output +-- !query output >> Hello<< --- !query 43 +-- !query DROP TABLE TEXT_TBL --- !query 43 schema +-- !query schema struct<> --- !query 43 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out new file mode 100644 index 0000000000000..75ea3f3c42932 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out @@ -0,0 +1,311 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 32 + + +-- !query +CREATE TABLE TIMESTAMP_TBL (d1 timestamp) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- 
!query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('now')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('now')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('today')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('yesterday')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow EST')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('tomorrow Zulu')) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'today' +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT count(*) AS Three FROM TIMESTAMP_TBL WHERE d1 = timestamp 'tomorrow' +-- !query schema +struct +-- !query output +3 + + +-- !query +SELECT count(*) AS One FROM TIMESTAMP_TBL WHERE d1 = timestamp 'yesterday' +-- !query schema +struct +-- !query output +1 + + +-- !query +TRUNCATE TABLE TIMESTAMP_TBL +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('epoch')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-01-02')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-01-02 03:04:05')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('1997-02-10 17:32:01-08')) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO TIMESTAMP_TBL VALUES (timestamp('2001-09-22T18:19:20')) +-- !query schema +struct<> +-- !query output + + + +-- !query 
+SELECT '' AS `64`, d1 FROM TIMESTAMP_TBL +-- !query schema +struct<64:string,d1:timestamp> +-- !query output + 1969-12-31 16:00:00 + 1997-01-02 00:00:00 + 1997-01-02 03:04:05 + 1997-02-10 17:32:01 + 2001-09-22 18:19:20 + + +-- !query +SELECT '' AS `48`, d1 FROM TIMESTAMP_TBL + WHERE d1 > timestamp '1997-01-02' +-- !query schema +struct<48:string,d1:timestamp> +-- !query output + 1997-01-02 03:04:05 + 1997-02-10 17:32:01 + 2001-09-22 18:19:20 + + +-- !query +SELECT '' AS `15`, d1 FROM TIMESTAMP_TBL + WHERE d1 < timestamp '1997-01-02' +-- !query schema +struct<15:string,d1:timestamp> +-- !query output + 1969-12-31 16:00:00 + + +-- !query +SELECT '' AS one, d1 FROM TIMESTAMP_TBL + WHERE d1 = timestamp '1997-01-02' +-- !query schema +struct +-- !query output + 1997-01-02 00:00:00 + + +-- !query +SELECT '' AS `63`, d1 FROM TIMESTAMP_TBL + WHERE d1 != timestamp '1997-01-02' +-- !query schema +struct<63:string,d1:timestamp> +-- !query output + 1969-12-31 16:00:00 + 1997-01-02 03:04:05 + 1997-02-10 17:32:01 + 2001-09-22 18:19:20 + + +-- !query +SELECT '' AS `16`, d1 FROM TIMESTAMP_TBL + WHERE d1 <= timestamp '1997-01-02' +-- !query schema +struct<16:string,d1:timestamp> +-- !query output + 1969-12-31 16:00:00 + 1997-01-02 00:00:00 + + +-- !query +SELECT '' AS `49`, d1 FROM TIMESTAMP_TBL + WHERE d1 >= timestamp '1997-01-02' +-- !query schema +struct<49:string,d1:timestamp> +-- !query output + 1997-01-02 00:00:00 + 1997-01-02 03:04:05 + 1997-02-10 17:32:01 + 2001-09-22 18:19:20 + + +-- !query +SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' +-- !query schema +struct<54:string,diff:interval> +-- !query output + -236720 hours + 0 seconds + 3 hours 4 minutes 5 seconds + 41393 hours 19 minutes 20 seconds + 953 hours 32 minutes 1 seconds + + +-- !query +SELECT '' AS date_trunc_week, date_trunc( 'week', timestamp '2004-02-29 15:44:17.71393' ) AS week_trunc +-- !query schema +struct +-- !query output + 
2004-02-23 00:00:00 + + +-- !query +SELECT '' AS `54`, d1 - timestamp '1997-01-02' AS diff + FROM TIMESTAMP_TBL + WHERE d1 BETWEEN timestamp '1902-01-01' + AND timestamp '2038-01-01' +-- !query schema +struct<54:string,diff:interval> +-- !query output + -236720 hours + 0 seconds + 3 hours 4 minutes 5 seconds + 41393 hours 19 minutes 20 seconds + 953 hours 32 minutes 1 seconds + + +-- !query +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'year', d1) AS `year`, date_part( 'month', d1) AS `month`, + date_part( 'day', d1) AS `day`, date_part( 'hour', d1) AS `hour`, + date_part( 'minute', d1) AS `minute`, date_part( 'second', d1) AS `second` + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' +-- !query schema +struct<54:string,timestamp:timestamp,year:int,month:int,day:int,hour:int,minute:int,second:decimal(8,6)> +-- !query output + 1969-12-31 16:00:00 1969 12 31 16 0 0.000000 + 1997-01-02 00:00:00 1997 1 2 0 0 0.000000 + 1997-01-02 03:04:05 1997 1 2 3 4 5.000000 + 1997-02-10 17:32:01 1997 2 10 17 32 1.000000 + 2001-09-22 18:19:20 2001 9 22 18 19 20.000000 + + +-- !query +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'quarter', d1) AS quarter, date_part( 'msec', d1) AS msec, + date_part( 'usec', d1) AS usec + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' +-- !query schema +struct<54:string,timestamp:timestamp,quarter:int,msec:decimal(8,3),usec:int> +-- !query output + 1969-12-31 16:00:00 4 0.000 0 + 1997-01-02 00:00:00 1 0.000 0 + 1997-01-02 03:04:05 1 5000.000 5000000 + 1997-02-10 17:32:01 1 1000.000 1000000 + 2001-09-22 18:19:20 3 20000.000 20000000 + + +-- !query +SELECT '' AS `54`, d1 as `timestamp`, + date_part( 'isoyear', d1) AS isoyear, date_part( 'week', d1) AS week, + date_part( 'dow', d1) AS dow + FROM TIMESTAMP_TBL WHERE d1 BETWEEN '1902-01-01' AND '2038-01-01' +-- !query schema +struct<54:string,timestamp:timestamp,isoyear:int,week:int,dow:int> +-- !query output + 1969-12-31 16:00:00 1970 1 3 + 1997-01-02 
00:00:00 1997 1 4 + 1997-01-02 03:04:05 1997 1 4 + 1997-02-10 17:32:01 1997 7 1 + 2001-09-22 18:19:20 2001 38 6 + + +-- !query +SELECT make_timestamp(2014,12,28,6,30,45.887) +-- !query schema +struct +-- !query output +2014-12-28 06:30:45.887 + + +-- !query +DROP TABLE TIMESTAMP_TBL +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/union.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out similarity index 70% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/union.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out index 05dedc547086e..2fe53055cf656 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/union.sql.out @@ -2,17 +2,17 @@ -- Number of queries: 72 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) AS v(f1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (VALUES (123, 456), @@ -21,186 +21,186 @@ CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) AS v(q1, q2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM (VALUES (0.0), (-34.84), (-1004.30), (CAST('-1.2345678901234e+200' AS DOUBLE)), (CAST('-1.2345678901234e-200' AS DOUBLE))) AS v(f1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT 1 AS two UNION SELECT 2 ORDER BY 1 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 --- !query 4 +-- !query SELECT 1 AS one UNION 
SELECT 1 ORDER BY 1 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 --- !query 5 +-- !query SELECT 1 AS two UNION ALL SELECT 2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 2 --- !query 6 +-- !query SELECT 1 AS two UNION ALL SELECT 1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 1 --- !query 7 +-- !query SELECT 1 AS three UNION SELECT 2 UNION SELECT 3 ORDER BY 1 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 2 3 --- !query 8 +-- !query SELECT 1 AS two UNION SELECT 2 UNION SELECT 2 ORDER BY 1 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 2 --- !query 9 +-- !query SELECT 1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 2 2 --- !query 10 +-- !query SELECT 1.1 AS two UNION SELECT 2.2 ORDER BY 1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1.1 2.2 --- !query 11 +-- !query SELECT 1.1 AS two UNION SELECT 2 ORDER BY 1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1.1 -2 +2.0 --- !query 12 +-- !query SELECT 1 AS two UNION SELECT 2.2 ORDER BY 1 --- !query 12 schema +-- !query schema struct --- !query 12 output -1 +-- !query output +1.0 2.2 --- !query 13 +-- !query SELECT 1 AS one UNION SELECT double(1.0) ORDER BY 1 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1.0 --- !query 14 +-- !query SELECT 1.1 AS two UNION ALL SELECT 2 ORDER BY 1 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1.1 -2 +2.0 --- !query 15 +-- !query SELECT double(1.0) AS two UNION ALL SELECT 1 ORDER BY 1 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1.0 1.0 --- !query 16 +-- !query SELECT 1.1 AS three UNION SELECT 2 UNION SELECT 3 ORDER 
BY 1 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1.1 -2 -3 +2.0 +3.0 --- !query 17 +-- !query SELECT double(1.1) AS two UNION SELECT 2 UNION SELECT double(2.0) ORDER BY 1 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1.1 2.0 --- !query 18 +-- !query SELECT 1.1 AS three UNION SELECT 2 UNION ALL SELECT 2 ORDER BY 1 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1.1 -2 -2 +2.0 +2.0 --- !query 19 +-- !query SELECT 1.1 AS two UNION (SELECT 2 UNION ALL SELECT 2) ORDER BY 1 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 1.1 -2 +2.0 --- !query 20 +-- !query SELECT f1 AS five FROM FLOAT8_TBL UNION SELECT f1 FROM FLOAT8_TBL ORDER BY 1 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output -1.2345678901234E200 -1004.3 -34.84 @@ -208,13 +208,13 @@ struct 0.0 --- !query 21 +-- !query SELECT f1 AS ten FROM FLOAT8_TBL UNION ALL SELECT f1 FROM FLOAT8_TBL --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output -1.2345678901234E-200 -1.2345678901234E-200 -1.2345678901234E200 @@ -227,14 +227,14 @@ struct 0.0 --- !query 22 +-- !query SELECT f1 AS nine FROM FLOAT8_TBL UNION SELECT f1 FROM INT4_TBL ORDER BY 1 --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output -1.2345678901234E200 -2.147483647E9 -123456.0 @@ -246,13 +246,13 @@ struct 2.147483647E9 --- !query 23 +-- !query SELECT f1 AS ten FROM FLOAT8_TBL UNION ALL SELECT f1 FROM INT4_TBL --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output -1.2345678901234E-200 -1.2345678901234E200 -1004.3 @@ -265,16 +265,16 @@ struct 2.147483647E9 --- !query 24 +-- !query SELECT f1 AS five FROM FLOAT8_TBL WHERE f1 BETWEEN -1e6 AND 1e6 UNION SELECT f1 FROM INT4_TBL WHERE f1 BETWEEN 0 AND 1000000 ORDER BY 1 --- !query 24 schema +-- !query schema struct --- !query 24 output +-- 
!query output -1004.3 -34.84 -1.2345678901234E-200 @@ -282,170 +282,170 @@ struct 123456.0 --- !query 25 +-- !query SELECT q2 FROM int8_tbl INTERSECT SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 123 4567890123456789 --- !query 26 +-- !query SELECT q2 FROM int8_tbl INTERSECT ALL SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 123 4567890123456789 4567890123456789 --- !query 27 +-- !query SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output -4567890123456789 456 --- !query 28 +-- !query SELECT q2 FROM int8_tbl EXCEPT ALL SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output -4567890123456789 456 --- !query 29 +-- !query SELECT q2 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q1 FROM int8_tbl ORDER BY 1 --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output -4567890123456789 456 4567890123456789 --- !query 30 +-- !query SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY 1 --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output --- !query 31 +-- !query SELECT q1 FROM int8_tbl EXCEPT ALL SELECT q2 FROM int8_tbl ORDER BY 1 --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 123 4567890123456789 --- !query 32 +-- !query SELECT q1 FROM int8_tbl EXCEPT ALL SELECT DISTINCT q2 FROM int8_tbl ORDER BY 1 --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 123 4567890123456789 4567890123456789 --- !query 33 +-- !query (SELECT 1,2,3 UNION SELECT 4,5,6) INTERSECT SELECT 4,5,6 --- !query 33 schema +-- !query schema struct<1:int,2:int,3:int> --- !query 33 output +-- !query output 4 5 6 --- !query 34 +-- !query (SELECT 1,2,3 UNION SELECT 4,5,6 ORDER 
BY 1,2) INTERSECT SELECT 4,5,6 --- !query 34 schema +-- !query schema struct<1:int,2:int,3:int> --- !query 34 output +-- !query output 4 5 6 --- !query 35 +-- !query (SELECT 1,2,3 UNION SELECT 4,5,6) EXCEPT SELECT 4,5,6 --- !query 35 schema +-- !query schema struct<1:int,2:int,3:int> --- !query 35 output +-- !query output 1 2 3 --- !query 36 +-- !query (SELECT 1,2,3 UNION SELECT 4,5,6 ORDER BY 1,2) EXCEPT SELECT 4,5,6 --- !query 36 schema +-- !query schema struct<1:int,2:int,3:int> --- !query 36 output +-- !query output 1 2 3 --- !query 37 +-- !query select count(*) from ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 5000 --- !query 38 +-- !query select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10 --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 10 --- !query 39 +-- !query select count(*) from ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 5000 --- !query 40 +-- !query select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10 --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 10 --- !query 41 +-- !query SELECT f1 FROM float8_tbl INTERSECT SELECT f1 FROM int4_tbl ORDER BY 1 --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 0.0 --- !query 42 +-- !query SELECT f1 FROM float8_tbl EXCEPT SELECT f1 FROM int4_tbl ORDER BY 1 --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output -1.2345678901234E200 -1004.3 -34.84 -1.2345678901234E-200 --- !query 43 +-- !query SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl ORDER BY 1 --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output -4567890123456789 123 123 @@ 
-455,20 +455,20 @@ struct 4567890123456789 --- !query 44 +-- !query SELECT q1 FROM int8_tbl INTERSECT (((SELECT q2 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) ORDER BY 1 --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 123 4567890123456789 --- !query 45 +-- !query (((SELECT q1 FROM int8_tbl INTERSECT SELECT q2 FROM int8_tbl ORDER BY 1))) UNION ALL SELECT q2 FROM int8_tbl --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 123 4567890123456789 456 @@ -478,20 +478,20 @@ struct -4567890123456789 --- !query 46 +-- !query SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output -4567890123456789 456 --- !query 47 +-- !query SELECT q1 FROM int8_tbl UNION ALL (((SELECT q2 FROM int8_tbl EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1))) --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 123 123 4567890123456789 @@ -501,48 +501,48 @@ struct 456 --- !query 48 +-- !query (((SELECT q1 FROM int8_tbl UNION ALL SELECT q2 FROM int8_tbl))) EXCEPT SELECT q1 FROM int8_tbl ORDER BY 1 --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output -4567890123456789 456 --- !query 49 +-- !query SELECT q1,q2 FROM int8_tbl EXCEPT SELECT q2,q1 FROM int8_tbl ORDER BY q2,q1 --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output 4567890123456789 -4567890123456789 123 456 --- !query 50 +-- !query SELECT q1 FROM int8_tbl EXCEPT SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1 --- !query 50 schema +-- !query schema struct<> --- !query 50 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`q2`' given input columns: [int8_tbl.q1]; line 1 pos 64 --- !query 51 +-- !query SELECT q1 FROM int8_tbl EXCEPT (((SELECT q2 FROM int8_tbl ORDER BY q2 LIMIT 1))) ORDER BY 1 --- !query 51 schema +-- !query 
schema struct --- !query 51 output +-- !query output 123 4567890123456789 --- !query 52 +-- !query (((((select * from int8_tbl))))) --- !query 52 schema +-- !query schema struct --- !query 52 output +-- !query output 123 456 123 4567890123456789 4567890123456789 -4567890123456789 @@ -550,22 +550,22 @@ struct 4567890123456789 4567890123456789 --- !query 53 +-- !query select * from range(1,5) union select * from range(1,3) --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 1 2 3 4 --- !query 54 +-- !query select * from range(1,6) union all select * from range(1,4) --- !query 54 schema +-- !query schema struct --- !query 54 output +-- !query output 1 1 2 @@ -576,49 +576,49 @@ struct 5 --- !query 55 +-- !query select * from range(1,6) intersect select * from range(1,4) --- !query 55 schema +-- !query schema struct --- !query 55 output +-- !query output 1 2 3 --- !query 56 +-- !query select * from range(1,6) intersect all select * from range(1,4) --- !query 56 schema +-- !query schema struct --- !query 56 output +-- !query output 1 2 3 --- !query 57 +-- !query select * from range(1,6) except select * from range(1,4) --- !query 57 schema +-- !query schema struct --- !query 57 output +-- !query output 4 5 --- !query 58 +-- !query select * from range(1,6) except all select * from range(1,4) --- !query 58 schema +-- !query schema struct --- !query 58 output +-- !query output 4 5 --- !query 59 +-- !query select * from range(1,6) union select * from range(1,4) --- !query 59 schema +-- !query schema struct --- !query 59 output +-- !query output 1 2 3 @@ -626,11 +626,11 @@ struct 5 --- !query 60 +-- !query select * from range(1,6) union all select * from range(1,4) --- !query 60 schema +-- !query schema struct --- !query 60 output +-- !query output 1 1 2 @@ -641,128 +641,128 @@ struct 5 --- !query 61 +-- !query select * from range(1,6) intersect select * from range(1,4) --- !query 61 schema +-- !query schema struct --- !query 61 output +-- 
!query output 1 2 3 --- !query 62 +-- !query select * from range(1,6) intersect all select * from range(1,4) --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output 1 2 3 --- !query 63 +-- !query select * from range(1,6) except select * from range(1,4) --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output 4 5 --- !query 64 +-- !query select * from range(1,6) except all select * from range(1,4) --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output 4 5 --- !query 65 +-- !query SELECT cast('3.4' as decimal(38, 18)) UNION SELECT 'foo' --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output 3.400000000000000000 foo --- !query 66 +-- !query SELECT * FROM (SELECT 1 AS t, 2 AS x UNION SELECT 2 AS t, 4 AS x) ss WHERE x < 4 ORDER BY x --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output 1 2 --- !query 67 +-- !query SELECT * FROM (SELECT 1 AS t, id as x from range(1,11) UNION SELECT 2 AS t, 4 AS x) ss WHERE x < 4 ORDER BY x --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output 1 1 1 2 1 3 --- !query 68 +-- !query SELECT * FROM (SELECT 1 AS t, int((random()*3)) AS x UNION SELECT 2 AS t, 4 AS x) ss WHERE x > 3 ORDER BY x --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output 2 4 --- !query 69 +-- !query select distinct q1 from (select distinct * from int8_tbl i81 union all select distinct * from int8_tbl i82) ss where q2 = q2 --- !query 69 schema +-- !query schema struct --- !query 69 output +-- !query output 123 4567890123456789 --- !query 70 +-- !query select distinct q1 from (select distinct * from int8_tbl i81 union all select distinct * from int8_tbl i82) ss where -q1 = q2 --- !query 70 schema +-- !query schema struct --- !query 70 output +-- !query output 4567890123456789 --- !query 71 +-- !query select * from (select *, 0 as x from int8_tbl a union all select *, 
1 as x from int8_tbl b) ss where (x = 0) or (q1 >= q2 and q1 <= q2) --- !query 71 schema +-- !query schema struct --- !query 71 output +-- !query output 123 456 0 123 4567890123456789 0 4567890123456789 -4567890123456789 0 diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out new file mode 100755 index 0000000000000..2b1de87a6be5e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part1.sql.out @@ -0,0 +1,725 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 43 + + +-- !query +CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT four, ten, SUM(SUM(four)) OVER (PARTITION BY four), AVG(ten) FROM tenk1 +GROUP BY four, ten ORDER BY four, ten +-- !query schema +struct +-- !query output +0 0 0 0.0 +0 2 0 2.0 +0 4 0 4.0 +0 6 0 6.0 +0 8 0 8.0 +1 1 2500 1.0 +1 3 2500 3.0 +1 5 2500 5.0 +1 7 2500 7.0 +1 9 2500 9.0 +2 0 5000 0.0 +2 2 5000 2.0 +2 4 5000 4.0 +2 6 5000 6.0 +2 8 5000 8.0 +3 1 7500 1.0 +3 3 7500 3.0 +3 5 7500 5.0 +3 7 7500 7.0 +3 9 7500 9.0 + + +-- !query +SELECT COUNT(*) OVER () FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 + + +-- !query +SELECT COUNT(*) OVER w FROM tenk1 WHERE unique2 < 10 WINDOW w AS () +-- !query schema +struct +-- !query output +10 +10 +10 +10 +10 +10 +10 +10 +10 +10 + + +-- !query +SELECT four FROM tenk1 WHERE FALSE WINDOW w AS (PARTITION BY ten) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT sum(four) OVER (PARTITION BY ten ORDER BY unique2) AS sum_1, ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 0 0 +0 0 0 +0 4 0 +1 7 1 +1 9 1 +2 0 2 +3 1 3 +3 3 3 +4 1 1 +5 1 1 + + +-- !query +SELECT row_number() OVER (ORDER BY unique2) FROM tenk1 WHERE unique2 < 10 +-- !query schema 
+struct +-- !query output +1 +10 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query +SELECT rank() OVER (PARTITION BY four ORDER BY ten) AS rank_1, ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +1 0 0 +1 0 0 +1 0 2 +1 1 1 +1 1 1 +1 1 3 +2 3 3 +3 4 0 +3 7 1 +4 9 1 + + +-- !query +SELECT dense_rank() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +1 0 0 +1 0 0 +1 0 2 +1 1 1 +1 1 1 +1 1 3 +2 3 3 +2 4 0 +2 7 1 +3 9 1 + + +-- !query +SELECT percent_rank() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0.0 0 0 +0.0 0 0 +0.0 0 2 +0.0 1 1 +0.0 1 1 +0.0 1 3 +0.6666666666666666 7 1 +1.0 3 3 +1.0 4 0 +1.0 9 1 + + +-- !query +SELECT cume_dist() OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0.5 1 1 +0.5 1 1 +0.5 1 3 +0.6666666666666666 0 0 +0.6666666666666666 0 0 +0.75 7 1 +1.0 0 2 +1.0 3 3 +1.0 4 0 +1.0 9 1 + + +-- !query +SELECT ntile(3) OVER (ORDER BY ten, four), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +1 0 0 +1 0 0 +1 0 2 +1 1 1 +2 1 1 +2 1 3 +2 3 3 +3 4 0 +3 7 1 +3 9 1 + + +-- !query +SELECT lag(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 0 0 +0 4 0 +1 1 1 +1 3 3 +1 7 1 +7 9 1 +NULL 0 0 +NULL 0 2 +NULL 1 1 +NULL 1 3 + + +-- !query +SELECT lead(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 0 0 +1 1 1 +3 1 3 +4 0 0 +7 1 1 +9 7 1 +NULL 0 2 +NULL 3 3 +NULL 4 0 +NULL 9 1 + + +-- !query +SELECT lead(ten * 2, 1) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 0 0 +14 1 1 +18 7 1 +2 1 1 +6 1 3 +8 0 0 +NULL 0 2 +NULL 3 3 +NULL 4 0 +NULL 9 1 
+ + +-- !query +SELECT lead(ten * 2, 1, -1) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +-1 0 2 +-1 3 3 +-1 4 0 +-1 9 1 +0 0 0 +14 1 1 +18 7 1 +2 1 1 +6 1 3 +8 0 0 + + +-- !query +SELECT first(ten) OVER (PARTITION BY four ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 0 0 +0 0 0 +0 0 2 +0 4 0 +1 1 1 +1 1 1 +1 1 3 +1 3 3 +1 7 1 +1 9 1 + + +-- !query +SELECT last(four) OVER (ORDER BY ten), ten, four FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0 4 0 +1 1 1 +1 1 1 +1 1 3 +1 7 1 +1 9 1 +2 0 0 +2 0 0 +2 0 2 +3 3 3 + + +-- !query +SELECT last(ten) OVER (PARTITION BY four), ten, four FROM +(SELECT * FROM tenk1 WHERE unique2 < 10 ORDER BY four, ten)s +ORDER BY four, ten +-- !query schema +struct +-- !query output +4 0 0 +4 0 0 +4 4 0 +9 1 1 +9 1 1 +9 7 1 +9 9 1 +0 0 2 +3 1 3 +3 3 3 + + +-- !query +SELECT ten, two, sum(hundred) AS gsum, sum(sum(hundred)) OVER (PARTITION BY two ORDER BY ten) AS wsum +FROM tenk1 GROUP BY ten, two +-- !query schema +struct +-- !query output +0 0 45000 45000 +1 1 46000 46000 +2 0 47000 92000 +3 1 48000 94000 +4 0 49000 141000 +5 1 50000 144000 +6 0 51000 192000 +7 1 52000 196000 +8 0 53000 245000 +9 1 54000 250000 + + +-- !query +SELECT count(*) OVER (PARTITION BY four), four FROM (SELECT * FROM tenk1 WHERE two = 1)s WHERE unique2 < 10 +-- !query schema +struct +-- !query output +2 3 +2 3 +4 1 +4 1 +4 1 +4 1 + + +-- !query +SELECT (count(*) OVER (PARTITION BY four ORDER BY ten) + + sum(hundred) OVER (PARTITION BY four ORDER BY ten)) AS cntsum + FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +136 +22 +22 +24 +24 +51 +82 +87 +92 +92 + + +-- !query +SELECT * FROM( + SELECT count(*) OVER (PARTITION BY four ORDER BY ten) + + sum(hundred) OVER (PARTITION BY two ORDER BY ten) AS total, + count(*) OVER (PARTITION BY four ORDER BY ten) AS fourcount, + sum(hundred) 
OVER (PARTITION BY two ORDER BY ten) AS twosum + FROM tenk1 +)sub WHERE total <> fourcount + twosum +-- !query schema +struct +-- !query output + + + +-- !query +SELECT avg(four) OVER (PARTITION BY four ORDER BY thousand / 100) FROM tenk1 WHERE unique2 < 10 +-- !query schema +struct +-- !query output +0.0 +0.0 +0.0 +1.0 +1.0 +1.0 +1.0 +2.0 +3.0 +3.0 + + +-- !query +SELECT ten, two, sum(hundred) AS gsum, sum(sum(hundred)) OVER win AS wsum +FROM tenk1 GROUP BY ten, two WINDOW win AS (PARTITION BY two ORDER BY ten) +-- !query schema +struct +-- !query output +0 0 45000 45000 +1 1 46000 46000 +2 0 47000 92000 +3 1 48000 94000 +4 0 49000 141000 +5 1 50000 144000 +6 0 51000 192000 +7 1 52000 196000 +8 0 53000 245000 +9 1 54000 250000 + + +-- !query +SELECT count(*) OVER (PARTITION BY four) FROM (SELECT * FROM tenk1 WHERE FALSE)s +-- !query schema +struct +-- !query output + + + +-- !query +create temporary view int4_tbl as select * from values + (0), + (123456), + (-123456), + (2147483647), + (-2147483647) + as int4_tbl(f1) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT SUM(COUNT(f1)) OVER () FROM int4_tbl WHERE f1=42 +-- !query schema +struct +-- !query output +0 + + +-- !query +select ten, + sum(unique1) + sum(unique2) as res, + rank() over (order by sum(unique1) + sum(unique2)) as rank +from tenk1 +group by ten order by ten +-- !query schema +struct +-- !query output +0 9976146 4 +1 10114187 9 +2 10059554 8 +3 9878541 1 +4 9881005 2 +5 9981670 5 +6 9947099 3 +7 10120309 10 +8 9991305 6 +9 10040184 7 + + +-- !query +SELECT four, ten, +sum(ten) over (partition by four order by ten), +last(ten) over (partition by four order by ten) +FROM (select distinct ten, four from tenk1) ss +-- !query schema +struct +-- !query output +0 0 0 0 +0 2 2 2 +0 4 6 4 +0 6 12 6 +0 8 20 8 +1 1 1 1 +1 3 4 3 +1 5 9 5 +1 7 16 7 +1 9 25 9 +2 0 0 0 +2 2 2 2 +2 4 6 4 +2 6 12 6 +2 8 20 8 +3 1 1 1 +3 3 4 3 +3 5 9 5 +3 7 16 7 +3 9 25 9 + + +-- !query +SELECT four, ten, 
+sum(ten) over (partition by four order by ten range between unbounded preceding and current row), +last(ten) over (partition by four order by ten range between unbounded preceding and current row) +FROM (select distinct ten, four from tenk1) ss +-- !query schema +struct +-- !query output +0 0 0 0 +0 2 2 2 +0 4 6 4 +0 6 12 6 +0 8 20 8 +1 1 1 1 +1 3 4 3 +1 5 9 5 +1 7 16 7 +1 9 25 9 +2 0 0 0 +2 2 2 2 +2 4 6 4 +2 6 12 6 +2 8 20 8 +3 1 1 1 +3 3 4 3 +3 5 9 5 +3 7 16 7 +3 9 25 9 + + +-- !query +SELECT four, ten, +sum(ten) over (partition by four order by ten range between unbounded preceding and unbounded following), +last(ten) over (partition by four order by ten range between unbounded preceding and unbounded following) +FROM (select distinct ten, four from tenk1) ss +-- !query schema +struct +-- !query output +0 0 20 8 +0 2 20 8 +0 4 20 8 +0 6 20 8 +0 8 20 8 +1 1 25 9 +1 3 25 9 +1 5 25 9 +1 7 25 9 +1 9 25 9 +2 0 20 8 +2 2 20 8 +2 4 20 8 +2 6 20 8 +2 8 20 8 +3 1 25 9 +3 3 25 9 +3 5 25 9 +3 7 25 9 +3 9 25 9 + + +-- !query +SELECT sum(unique1) over (order by four range between current row and unbounded following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +10 3 3 +10 7 3 +18 2 2 +18 6 2 +33 1 1 +33 5 1 +33 9 1 +45 0 0 +45 4 0 +45 8 0 + + +-- !query +SELECT sum(unique1) over (rows between current row and unbounded following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +0 0 0 +10 3 3 +15 5 1 +23 8 0 +32 9 1 +38 6 2 +39 1 1 +41 2 2 +45 4 0 +7 7 3 + + +-- !query +SELECT sum(unique1) over (rows between 2 preceding and 2 following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +10 0 0 +13 2 2 +15 7 3 +22 1 1 +23 3 3 +26 6 2 +29 9 1 +31 8 0 +32 5 1 +7 4 0 + + +-- !query +SELECT sum(unique1) over (rows between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +10 0 0 +13 3 
3 +15 8 0 +17 5 1 +3 6 2 +4 2 2 +6 1 1 +7 9 1 +8 7 3 +NULL 4 0 + + +-- !query +SELECT sum(unique1) over (rows between 1 following and 3 following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +0 7 3 +10 5 1 +15 8 0 +16 2 2 +16 9 1 +22 6 2 +23 1 1 +7 3 3 +9 4 0 +NULL 0 0 + + +-- !query +SELECT sum(unique1) over (rows between unbounded preceding and 1 following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +13 1 1 +22 6 2 +30 9 1 +35 8 0 +38 5 1 +45 0 0 +45 3 3 +45 7 3 +6 4 0 +7 2 2 + + +-- !query +CREATE TEMP VIEW v_window AS +SELECT i.id, sum(i.id) over (order by i.id rows between 1 preceding and 1 following) as sum_rows +FROM range(1, 11) i +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * FROM v_window +-- !query schema +struct +-- !query output +1 3 +10 19 +2 6 +3 9 +4 12 +5 15 +6 18 +7 21 +8 24 +9 27 + + +-- !query +DROP VIEW v_window +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW tenk2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW int4_tbl +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out new file mode 100644 index 0000000000000..f41659a196ae1 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part2.sql.out @@ -0,0 +1,481 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 31 + + +-- !query +CREATE TABLE empsalary ( + depname string, + empno integer, + salary int, + enroll_date date +) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + 
('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date '2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15') +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT sum(unique1) over (order by four range between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +12 1 1 +12 5 1 +12 9 1 +23 3 3 +23 7 3 +27 2 2 +27 6 2 +NULL 0 0 +NULL 4 0 +NULL 8 0 + + +-- !query +SELECT sum(unique1) over (order by four desc range between 2 preceding and 1 preceding), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +10 2 2 +10 6 2 +18 1 1 +18 5 1 +18 9 1 +23 0 0 +23 4 0 +23 8 0 +NULL 3 3 +NULL 7 3 + + +-- !query +SELECT sum(unique1) over (partition by four order by unique1 range between 5 preceding and 6 following), +unique1, four +FROM tenk1 WHERE unique1 < 10 +-- !query schema +struct +-- !query output +10 3 3 +10 7 3 +12 4 0 +12 8 0 +14 9 1 +15 5 1 +4 0 0 +6 1 1 +8 2 2 +8 6 2 + + +-- !query +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id asc nulls first range between 2 preceding and 2 following) +-- !query schema +struct +-- !query output +1 1 1 3 +2 2 1 4 +3 3 1 5 +4 4 2 5 +5 5 3 5 +NULL 42 42 43 +NULL 43 42 43 + + +-- !query +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id asc nulls last range between 2 preceding and 2 following) +-- !query schema +struct +-- !query output +1 1 1 3 +2 2 1 4 +3 3 1 5 +4 4 2 5 +5 5 3 5 +NULL 42 42 43 +NULL 43 42 43 + + +-- !query +select ss.id, ss.y, + first(ss.y) over 
w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id desc nulls first range between 2 preceding and 2 following) +-- !query schema +struct +-- !query output +1 1 3 1 +2 2 4 1 +3 3 5 1 +4 4 5 2 +5 5 5 3 +NULL 42 42 43 +NULL 43 42 43 + + +-- !query +select ss.id, ss.y, + first(ss.y) over w, + last(ss.y) over w +from + (select x.id, x.id as y from range(1,6) as x + union all select null, 42 + union all select null, 43) ss +window w as + (order by ss.id desc nulls last range between 2 preceding and 2 following) +-- !query schema +struct +-- !query output +1 1 3 1 +2 2 4 1 +3 3 5 1 +4 4 5 2 +5 5 5 3 +NULL 42 42 43 +NULL 43 42 43 + + +-- !query +select x.id, last(x.id) over (order by x.id range between current row and 2147450884 following) +from range(32764, 32767) x +-- !query schema +struct +-- !query output +32764 32766 +32765 32766 +32766 32766 + + +-- !query +select x.id, last(x.id) over (order by x.id desc range between current row and 2147450885 following) +from range(-32766, -32765) x +-- !query schema +struct +-- !query output +-32766 -32766 + + +-- !query +select x.id, last(x.id) over (order by x.id range between current row and 4 following) +from range(2147483644, 2147483647) x +-- !query schema +struct +-- !query output +2147483644 2147483646 +2147483645 2147483646 +2147483646 2147483646 + + +-- !query +select x.id, last(x.id) over (order by x.id desc range between current row and 5 following) +from range(-2147483646, -2147483645) x +-- !query schema +struct +-- !query output +-2147483646 -2147483646 + + +-- !query +select x.id, last(x.id) over (order by x.id range between current row and 4 following) +from range(9223372036854775804, 9223372036854775807) x +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow + + +-- !query +select x.id, last(x.id) over (order by x.id desc range between current row and 5 
following) +from range(-9223372036854775806, -9223372036854775805) x +-- !query schema +struct<> +-- !query output +java.lang.ArithmeticException +long overflow + + +-- !query +create table numerics ( + id int, + f_float4 float, + f_float8 float, + f_numeric int +) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into numerics values +(1, -3, -3, -3), +(2, -1, -1, -1), +(3, 0, 0, 0), +(4, 1.1, 1.1, 1.1), +(5, 1.12, 1.12, 1.12), +(6, 2, 2, 2), +(7, 100, 100, 100) +-- !query schema +struct<> +-- !query output + + + +-- !query +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 -3.0 1 1 +2 -1.0 2 3 +3 0.0 2 3 +4 1.1 4 6 +5 1.12 4 6 +6 2.0 4 6 +7 100.0 7 7 + + +-- !query +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1 preceding and 1.1 following) +-- !query schema +struct +-- !query output +1 -3.0 1 1 +2 -1.0 2 3 +3 0.0 2 4 +4 1.1 4 6 +5 1.12 4 6 +6 2.0 4 6 +7 100.0 7 7 + + +-- !query +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 'inf' preceding and 'inf' following) +-- !query schema +struct +-- !query output +1 -3.0 1 7 +2 -1.0 1 7 +3 0.0 1 7 +4 1.1 1 7 +5 1.12 1 7 +6 2.0 1 7 +7 100.0 1 7 + + +-- !query +select id, f_float4, first(id) over w, last(id) over w +from numerics +window w as (order by f_float4 range between + 1.1 preceding and 'NaN' following) +-- !query schema +struct +-- !query output +1 -3.0 1 7 +2 -1.0 2 7 +3 0.0 2 7 +4 1.1 3 7 +5 1.12 4 7 +6 2.0 4 7 +7 100.0 7 7 + + +-- !query +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 -3.0 1 1 +2 -1.0 2 3 +3 0.0 2 3 +4 1.1 4 6 +5 1.12 4 6 +6 2.0 4 6 
+7 100.0 7 7 + + +-- !query +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1 preceding and 1.1 following) +-- !query schema +struct +-- !query output +1 -3.0 1 1 +2 -1.0 2 3 +3 0.0 2 4 +4 1.1 4 6 +5 1.12 4 6 +6 2.0 4 6 +7 100.0 7 7 + + +-- !query +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 'inf' preceding and 'inf' following) +-- !query schema +struct +-- !query output +1 -3.0 1 7 +2 -1.0 1 7 +3 0.0 1 7 +4 1.1 1 7 +5 1.12 1 7 +6 2.0 1 7 +7 100.0 1 7 + + +-- !query +select id, f_float8, first(id) over w, last(id) over w +from numerics +window w as (order by f_float8 range between + 1.1 preceding and 'NaN' following) +-- !query schema +struct +-- !query output +1 -3.0 1 7 +2 -1.0 2 7 +3 0.0 2 7 +4 1.1 3 7 +5 1.12 4 7 +6 2.0 4 7 +7 100.0 7 7 + + +-- !query +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 -3 1 1 +2 -1 2 3 +3 0 2 5 +4 1 3 6 +5 1 3 6 +6 2 4 6 +7 100 7 7 + + +-- !query +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1.1 following) +-- !query schema +struct +-- !query output +1 -3 1 1 +2 -1 2 3 +3 0 2 5 +4 1 3 6 +5 1 3 6 +6 2 4 6 +7 100 7 7 + + +-- !query +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1 preceding and 1.1 following) +-- !query schema +struct +-- !query output +1 -3 1 1 +2 -1 2 3 +3 0 2 5 +4 1 3 6 +5 1 3 6 +6 2 4 6 +7 100 7 7 + + +-- !query +select id, f_numeric, first(id) over w, last(id) over w +from numerics +window w as (order by f_numeric range between + 1.1 preceding and 'NaN' following) +-- !query schema +struct<> +-- !query output +java.lang.NumberFormatException +invalid input syntax 
for type numeric: NaN + + +-- !query +drop table empsalary +-- !query schema +struct<> +-- !query output + + + +-- !query +drop table numerics +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out new file mode 100644 index 0000000000000..5a52358fe1c53 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part3.sql.out @@ -0,0 +1,409 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 29 + + +-- !query +CREATE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE empsalary ( + depname string, + empno integer, + salary int, + enroll_date date +) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO empsalary VALUES + ('develop', 10, 5200, date '2007-08-01'), + ('sales', 1, 5000, date '2006-10-01'), + ('personnel', 5, 3500, date '2007-12-10'), + ('sales', 4, 4800, date '2007-08-08'), + ('personnel', 2, 3900, date '2006-12-23'), + ('develop', 7, 4200, date '2008-01-01'), + ('develop', 9, 4500, date '2008-01-01'), + ('sales', 3, 4800, date '2007-08-01'), + ('develop', 8, 6000, date '2006-10-01'), + ('develop', 11, 5200, date '2007-08-15') +-- !query schema +struct<> +-- !query output + + + +-- !query +create table datetimes ( + id int, + f_time timestamp, + f_timetz timestamp, + f_interval timestamp, + f_timestamptz timestamp, + f_timestamp timestamp +) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into datetimes values +(1, timestamp '11:00', cast ('11:00 BST' as timestamp), cast ('1 year' as timestamp), cast ('2000-10-19 10:23:54+01' as timestamp), timestamp '2000-10-19 10:23:54'), +(2, timestamp '12:00', cast ('12:00 BST' as timestamp), cast ('2 years' as timestamp), cast ('2001-10-19 10:23:54+01' as timestamp), 
timestamp '2001-10-19 10:23:54'), +(3, timestamp '13:00', cast ('13:00 BST' as timestamp), cast ('3 years' as timestamp), cast ('2001-10-19 10:23:54+01' as timestamp), timestamp '2001-10-19 10:23:54'), +(4, timestamp '14:00', cast ('14:00 BST' as timestamp), cast ('4 years' as timestamp), cast ('2002-10-19 10:23:54+01' as timestamp), timestamp '2002-10-19 10:23:54'), +(5, timestamp '15:00', cast ('15:00 BST' as timestamp), cast ('5 years' as timestamp), cast ('2003-10-19 10:23:54+01' as timestamp), timestamp '2003-10-19 10:23:54'), +(6, timestamp '15:00', cast ('15:00 BST' as timestamp), cast ('5 years' as timestamp), cast ('2004-10-19 10:23:54+01' as timestamp), timestamp '2004-10-19 10:23:54'), +(7, timestamp '17:00', cast ('17:00 BST' as timestamp), cast ('7 years' as timestamp), cast ('2005-10-19 10:23:54+01' as timestamp), timestamp '2005-10-19 10:23:54'), +(8, timestamp '18:00', cast ('18:00 BST' as timestamp), cast ('8 years' as timestamp), cast ('2006-10-19 10:23:54+01' as timestamp), timestamp '2006-10-19 10:23:54'), +(9, timestamp '19:00', cast ('19:00 BST' as timestamp), cast ('9 years' as timestamp), cast ('2007-10-19 10:23:54+01' as timestamp), timestamp '2007-10-19 10:23:54'), +(10, timestamp '20:00', cast ('20:00 BST' as timestamp), cast ('10 years' as timestamp), cast ('2008-10-19 10:23:54+01' as timestamp), timestamp '2008-10-19 10:23:54') +-- !query schema +struct<> +-- !query output + + + +-- !query +WITH cte (x) AS ( + SELECT * FROM range(1, 36, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x rows between 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 4 +11 33 +13 39 +15 45 +17 51 +19 57 +21 63 +23 69 +25 75 +27 81 +29 87 +3 9 +31 93 +33 99 +35 68 +5 15 +7 21 +9 27 + + +-- !query +WITH cte (x) AS ( + SELECT * FROM range(1, 36, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x range between 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 1 +11 11 +13 13 
+15 15 +17 17 +19 19 +21 21 +23 23 +25 25 +27 27 +29 29 +3 3 +31 31 +33 33 +35 35 +5 5 +7 7 +9 9 + + +-- !query +WITH cte (x) AS ( + select 1 union all select 1 union all select 1 union all + SELECT * FROM range(5, 50, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x rows between 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 2 +1 3 +1 7 +11 33 +13 39 +15 45 +17 51 +19 57 +21 63 +23 69 +25 75 +27 81 +29 87 +31 93 +33 99 +35 105 +37 111 +39 117 +41 123 +43 129 +45 135 +47 141 +49 96 +5 13 +7 21 +9 27 + + +-- !query +WITH cte (x) AS ( + select 1 union all select 1 union all select 1 union all + SELECT * FROM range(5, 50, 2) +) +SELECT x, (sum(x) over w) +FROM cte +WINDOW w AS (ORDER BY x range between 1 preceding and 1 following) +-- !query schema +struct +-- !query output +1 3 +1 3 +1 3 +11 11 +13 13 +15 15 +17 17 +19 19 +21 21 +23 23 +25 25 +27 27 +29 29 +31 31 +33 33 +35 35 +37 37 +39 39 +41 41 +43 43 +45 45 +47 47 +49 49 +5 5 +7 7 +9 9 + + +-- !query +SELECT count(*) OVER (PARTITION BY four) FROM (SELECT * FROM tenk1 UNION ALL SELECT * FROM tenk2)s LIMIT 0 +-- !query schema +struct +-- !query output + + + +-- !query +create table t1 (f1 int, f2 int) using parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +insert into t1 values (1,1),(1,2),(2,2) +-- !query schema +struct<> +-- !query output + + + +-- !query +select f1, sum(f1) over (partition by f1 + range between 1 preceding and 1 following) +from t1 where f1 = f2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve '(PARTITION BY default.t1.`f1` RANGE BETWEEN 1 PRECEDING AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 24 + + +-- !query +select f1, sum(f1) over (partition by f1 order by f2 +range between 1 preceding and 1 following) +from t1 where f1 = f2 +-- !query schema +struct +-- !query output +1 1 +2 2 + + +-- 
!query +select f1, sum(f1) over (partition by f1, f1 order by f2 +range between 2 preceding and 1 preceding) +from t1 where f1 = f2 +-- !query schema +struct +-- !query output +1 NULL +2 NULL + + +-- !query +select f1, sum(f1) over (partition by f1, f2 order by f2 +range between 1 following and 2 following) +from t1 where f1 = f2 +-- !query schema +struct +-- !query output +1 NULL +2 NULL + + +-- !query +SELECT rank() OVER (ORDER BY length('abc')) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT * FROM empsalary WHERE row_number() OVER (ORDER BY salary) < 10 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +It is not allowed to use window functions inside WHERE and HAVING clauses; + + +-- !query +SELECT * FROM empsalary INNER JOIN tenk1 ON row_number() OVER (ORDER BY salary) < 10 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +The query operator `Join` contains one or more unsupported +expression types Aggregate, Window or Generate. +Invalid expressions: [row_number() OVER (ORDER BY default.empsalary.`salary` ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)]; + + +-- !query +SELECT rank() OVER (ORDER BY 1), count(*) FROM empsalary GROUP BY 1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException + +The query operator `Aggregate` contains one or more unsupported +expression types Aggregate, Window or Generate. 
+Invalid expressions: [RANK() OVER (ORDER BY 1 ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)]; + + +-- !query +SELECT * FROM rank() OVER (ORDER BY random()) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'ORDER'(line 1, pos 27) + +== SQL == +SELECT * FROM rank() OVER (ORDER BY random()) +---------------------------^^^ + + +-- !query +SELECT * FROM empsalary WHERE (rank() OVER (ORDER BY random())) > 10 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +It is not allowed to use window functions inside WHERE and HAVING clauses; + + +-- !query +SELECT * FROM empsalary WHERE rank() OVER (ORDER BY random()) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +It is not allowed to use window functions inside WHERE and HAVING clauses; + + +-- !query +select rank() OVER (PARTITION BY four, ORDER BY ten) FROM tenk1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'ORDER'(line 1, pos 39) + +== SQL == +select rank() OVER (PARTITION BY four, ORDER BY ten) FROM tenk1 +---------------------------------------^^^ + + +-- !query +SELECT range(1, 100) OVER () FROM empsalary +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Undefined function: 'range'. 
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7 + + +-- !query +SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM tenk1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'ntile(0)' due to data type mismatch: Buckets expression must be positive, but got: 0; line 1 pos 7 + + +-- !query +DROP TABLE empsalary +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE datetimes +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE t1 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part4.sql.out new file mode 100644 index 0000000000000..4dd4712345a89 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/window_part4.sql.out @@ -0,0 +1,504 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 39 + + +-- !query +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 1.5 +2 2.0 +3 NULL +4 NULL + + +-- !query +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 1.5 +2 2.0 +3 NULL +4 NULL + + +-- !query +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 1.5 +2 2.0 +3 NULL +4 NULL + + +-- !query +SELECT i,AVG(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.5),(2,2.5),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 2.00000 +2 2.50000 +3 NULL +4 NULL + + +-- 
!query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 3 +2 2 +3 NULL +4 NULL + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 3 +2 2 +3 NULL +4 NULL + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 3 +2 2 +3 NULL +4 NULL + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.1),(2,2.2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 3.3 +2 2.2 +3 NULL +4 NULL + + +-- !query +SELECT SUM(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1.01),(2,2),(3,3)) v(i,n) +-- !query schema +struct +-- !query output +3.00 +5.00 +6.01 + + +-- !query +SELECT i,COUNT(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 2 +2 1 +3 0 +4 0 + + +-- !query +SELECT i,COUNT(*) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 4 +2 3 +3 2 +4 1 + + +-- !query +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +11266.666666666666 +13868.750000000002 +21703.999999999996 +4225.0 + + +-- !query +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query 
output +0.0 +11266.666666666666 +13868.750000000002 +21703.999999999996 +4225.0 + + +-- !query +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +11266.666666666666 +13868.750000000002 +21703.999999999996 +4225.0 + + +-- !query +SELECT VAR_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +11266.666666666666 +13868.750000000002 +21703.999999999996 +4225.0 + + +-- !query +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VAR_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- 
!query +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT VARIANCE(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +16900.0 +18491.666666666668 +27129.999999999996 +8450.0 +NaN + + +-- !query +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +106.14455552060438 +117.76565713313879 +147.32277488562315 +147.32277488562315 +65.0 + + +-- !query +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +106.14455552060438 +117.76565713313879 +147.32277488562315 +147.32277488562315 +65.0 + + +-- !query +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +106.14455552060438 +117.76565713313879 +147.32277488562315 +147.32277488562315 +65.0 + + +-- !query +SELECT STDDEV_POP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +0.0 +106.14455552060438 +117.76565713313879 
+147.32277488562315 +147.32277488562315 +65.0 + + +-- !query +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV_SAMP(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(1,NULL),(2,600),(3,470),(4,170),(5,430),(6,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM 
(VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT STDDEV(n) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) + FROM (VALUES(0,NULL),(1,600),(2,470),(3,170),(4,430),(5,300)) r(i,n) +-- !query schema +struct +-- !query output +130.0 +135.9840676942217 +164.7118696390761 +164.7118696390761 +91.92388155425118 +NaN + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND CURRENT ROW) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 1 +2 2 +3 NULL +4 NULL + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,NULL),(4,NULL)) t(i,v) +-- !query schema +struct +-- !query output +1 3 +2 2 +3 NULL +4 NULL + + +-- !query +SELECT i,SUM(v) OVER (ORDER BY i ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) + FROM (VALUES(1,1),(2,2),(3,3),(4,4)) t(i,v) +-- !query schema +struct +-- !query output +1 3 +2 6 +3 9 +4 7 + + +-- !query +SELECT a, b, + SUM(b) OVER(ORDER BY A ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) +FROM (VALUES(1,1),(2,2),(3,(cast('nan' as int))),(4,3),(5,4)) t(a,b) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +failed to evaluate expression CAST('nan' AS INT): invalid input syntax for type numeric: nan; line 3 pos 6 diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out similarity index 60% rename from sql/core/src/test/resources/sql-tests/results/pgSQL/with.sql.out rename to sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out index 91b0ff20b6ab0..badafc9e659e2 100644 --- a/sql/core/src/test/resources/sql-tests/results/pgSQL/with.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out @@ -2,134 +2,134 @@ -- Number of queries: 51 --- !query 0 +-- !query WITH q1(x,y) AS (SELECT 1,2) SELECT * FROM q1, q1 AS q2 --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 2 1 2 --- !query 1 +-- !query SELECT count(*) FROM ( WITH q1(x) AS (SELECT rand() FROM (SELECT EXPLODE(SEQUENCE(1, 5)))) SELECT * FROM q1 UNION SELECT * FROM q1 ) ss --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 10 --- !query 2 +-- !query CREATE TABLE department ( id INTEGER, -- department ID parent_department INTEGER, -- upper department ID name string -- department name ) USING parquet --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO department VALUES (0, NULL, 'ROOT') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO department VALUES (1, 0, 'A') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO department VALUES (2, 1, 'B') --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO department VALUES (3, 2, 'C') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO department VALUES (4, 2, 'D') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO department VALUES (5, 0, 'E') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO department VALUES (6, 4, 'F') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO department VALUES (7, 5, 'G') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- 
!query output --- !query 11 +-- !query CREATE TABLE tree( id INTEGER, parent_id INTEGER ) USING parquet --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query INSERT INTO tree VALUES (1, NULL), (2, 1), (3,1), (4,2), (5,2), (6,2), (7,3), (8,3), (9,4), (10,4), (11,7), (12,7), (13,7), (14, 9), (15,11), (16,11) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query create table graph( f int, t int, label string ) USING parquet --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query insert into graph values (1, 2, 'arc 1 -> 2'), (1, 3, 'arc 1 -> 3'), @@ -137,61 +137,61 @@ insert into graph values (1, 4, 'arc 1 -> 4'), (4, 5, 'arc 4 -> 5'), (5, 1, 'arc 5 -> 1') --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query CREATE TABLE y (a INTEGER) USING parquet --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query INSERT INTO y SELECT EXPLODE(SEQUENCE(1, 10)) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output --- !query 17 +-- !query DROP TABLE y --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query CREATE TABLE y (a INTEGER) USING parquet --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query INSERT INTO y SELECT EXPLODE(SEQUENCE(1, 10)) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query with cte(foo) as ( select 42 ) select * from ((select foo from cte)) q --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 42 --- !query 21 +-- !query WITH outermost(x) AS ( SELECT 1 UNION (WITH innermost as (SELECT 2) @@ -199,15 +199,15 @@ WITH outermost(x) AS ( UNION 
SELECT 3) ) SELECT * FROM outermost ORDER BY 1 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 2 3 --- !query 22 +-- !query WITH outermost(x) AS ( SELECT 1 UNION (WITH innermost as (SELECT 2) @@ -215,26 +215,26 @@ WITH outermost(x) AS ( UNION SELECT * FROM innermost) ) SELECT * FROM outermost ORDER BY 1 --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException Table or view not found: outermost; line 4 pos 23 --- !query 23 +-- !query CREATE TABLE withz USING parquet AS SELECT i AS k, CAST(i || ' v' AS string) v FROM (SELECT EXPLODE(SEQUENCE(1, 16, 3)) i) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query SELECT * FROM withz ORDER BY k --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1 1 v 4 4 v 7 7 v @@ -243,111 +243,111 @@ struct 16 16 v --- !query 25 +-- !query DROP TABLE withz --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query TRUNCATE TABLE y --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output --- !query 27 +-- !query INSERT INTO y SELECT EXPLODE(SEQUENCE(1, 3)) --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output --- !query 28 +-- !query CREATE TABLE yy (a INTEGER) USING parquet --- !query 28 schema +-- !query schema struct<> --- !query 28 output +-- !query output --- !query 29 +-- !query SELECT * FROM y --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1 2 3 --- !query 30 +-- !query SELECT * FROM yy --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output --- !query 31 +-- !query SELECT * FROM y --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 1 2 3 --- !query 32 +-- !query SELECT * FROM yy --- !query 32 schema +-- !query schema 
struct --- !query 32 output +-- !query output --- !query 33 +-- !query CREATE TABLE parent ( id int, val string ) USING parquet --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output --- !query 34 +-- !query INSERT INTO parent VALUES ( 1, 'p1' ) --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output --- !query 35 +-- !query SELECT * FROM parent --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 1 p1 --- !query 36 +-- !query SELECT * FROM parent --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 1 p1 --- !query 37 +-- !query create table foo (with baz) --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'with'(line 1, pos 18) @@ -357,11 +357,11 @@ create table foo (with baz) ------------------^^^ --- !query 38 +-- !query create table foo (with ordinality) --- !query 38 schema +-- !query schema struct<> --- !query 38 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException no viable alternative at input 'with'(line 1, pos 18) @@ -371,98 +371,98 @@ create table foo (with ordinality) ------------------^^^ --- !query 39 +-- !query with ordinality as (select 1 as x) select * from ordinality --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 1 --- !query 40 +-- !query WITH test AS (SELECT 42) INSERT INTO test VALUES (1) --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output org.apache.spark.sql.AnalysisException Table not found: test; --- !query 41 +-- !query create table test (i int) USING parquet --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output --- !query 42 +-- !query with test as (select 42) insert into test select * from test --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- 
!query output --- !query 43 +-- !query select * from test --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 42 --- !query 44 +-- !query drop table test --- !query 44 schema +-- !query schema struct<> --- !query 44 output +-- !query output --- !query 45 +-- !query DROP TABLE department --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output --- !query 46 +-- !query DROP TABLE tree --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output --- !query 47 +-- !query DROP TABLE graph --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output --- !query 48 +-- !query DROP TABLE y --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output --- !query 49 +-- !query DROP TABLE yy --- !query 49 schema +-- !query schema struct<> --- !query 49 output +-- !query output --- !query 50 +-- !query DROP TABLE parent --- !query 50 schema +-- !query schema struct<> --- !query 50 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/pred-pushdown.sql.out b/sql/core/src/test/resources/sql-tests/results/pred-pushdown.sql.out index 1b8ddbe4c7211..a64b8d3f6632d 100644 --- a/sql/core/src/test/resources/sql-tests/results/pred-pushdown.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/pred-pushdown.sql.out @@ -2,39 +2,39 @@ -- Number of queries: 4 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW tbl_a AS VALUES (1, 1), (2, 1), (3, 6) AS T(c1, c2) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW tbl_b AS VALUES 1 AS T(c1) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM tbl_a LEFT ANTI JOIN tbl_b ON ((tbl_a.c1 = tbl_a.c2) IS NULL OR tbl_a.c1 = tbl_a.c2) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 2 
1 3 6 --- !query 3 +-- !query SELECT l.c1, l.c2 FROM tbl_a l WHERE EXISTS (SELECT 1 FROM tbl_b r WHERE l.c1 = l.c2) OR l.c2 < 2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 1 2 1 diff --git a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out index d38cab8fa7862..08cc6fa993e0b 100644 --- a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out @@ -2,297 +2,297 @@ -- Number of queries: 37 --- !query 0 +-- !query select 1 = 1 --- !query 0 schema +-- !query schema struct<(1 = 1):boolean> --- !query 0 output +-- !query output true --- !query 1 +-- !query select 1 = '1' --- !query 1 schema +-- !query schema struct<(1 = CAST(1 AS INT)):boolean> --- !query 1 output +-- !query output true --- !query 2 +-- !query select 1.0 = '1' --- !query 2 schema +-- !query schema struct<(CAST(1.0 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 2 output +-- !query output true --- !query 3 +-- !query select 1.5 = '1.51' --- !query 3 schema +-- !query schema struct<(CAST(1.5 AS DOUBLE) = CAST(1.51 AS DOUBLE)):boolean> --- !query 3 output +-- !query output false --- !query 4 +-- !query select 1 > '1' --- !query 4 schema +-- !query schema struct<(1 > CAST(1 AS INT)):boolean> --- !query 4 output +-- !query output false --- !query 5 +-- !query select 2 > '1.0' --- !query 5 schema +-- !query schema struct<(2 > CAST(1.0 AS INT)):boolean> --- !query 5 output +-- !query output true --- !query 6 +-- !query select 2 > '2.0' --- !query 6 schema +-- !query schema struct<(2 > CAST(2.0 AS INT)):boolean> --- !query 6 output +-- !query output false --- !query 7 +-- !query select 2 > '2.2' --- !query 7 schema +-- !query schema struct<(2 > CAST(2.2 AS INT)):boolean> --- !query 7 output +-- !query output false --- !query 8 +-- !query select '1.5' > 0.5 --- !query 8 
schema +-- !query schema struct<(CAST(1.5 AS DOUBLE) > CAST(0.5 AS DOUBLE)):boolean> --- !query 8 output +-- !query output true --- !query 9 +-- !query select to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52') --- !query 9 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52')):boolean> --- !query 9 output +-- !query output false --- !query 10 +-- !query select to_date('2009-07-30 04:17:52') > '2009-07-30 04:17:52' --- !query 10 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') > CAST(2009-07-30 04:17:52 AS DATE)):boolean> --- !query 10 output +-- !query output false --- !query 11 +-- !query select 1 >= '1' --- !query 11 schema +-- !query schema struct<(1 >= CAST(1 AS INT)):boolean> --- !query 11 output +-- !query output true --- !query 12 +-- !query select 2 >= '1.0' --- !query 12 schema +-- !query schema struct<(2 >= CAST(1.0 AS INT)):boolean> --- !query 12 output +-- !query output true --- !query 13 +-- !query select 2 >= '2.0' --- !query 13 schema +-- !query schema struct<(2 >= CAST(2.0 AS INT)):boolean> --- !query 13 output +-- !query output true --- !query 14 +-- !query select 2.0 >= '2.2' --- !query 14 schema +-- !query schema struct<(CAST(2.0 AS DOUBLE) >= CAST(2.2 AS DOUBLE)):boolean> --- !query 14 output +-- !query output false --- !query 15 +-- !query select '1.5' >= 0.5 --- !query 15 schema +-- !query schema struct<(CAST(1.5 AS DOUBLE) >= CAST(0.5 AS DOUBLE)):boolean> --- !query 15 output +-- !query output true --- !query 16 +-- !query select to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52') --- !query 16 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52')):boolean> --- !query 16 output +-- !query output true --- !query 17 +-- !query select to_date('2009-07-30 04:17:52') >= '2009-07-30 04:17:52' --- !query 17 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') >= CAST(2009-07-30 04:17:52 AS DATE)):boolean> --- 
!query 17 output +-- !query output true --- !query 18 +-- !query select 1 < '1' --- !query 18 schema +-- !query schema struct<(1 < CAST(1 AS INT)):boolean> --- !query 18 output +-- !query output false --- !query 19 +-- !query select 2 < '1.0' --- !query 19 schema +-- !query schema struct<(2 < CAST(1.0 AS INT)):boolean> --- !query 19 output +-- !query output false --- !query 20 +-- !query select 2 < '2.0' --- !query 20 schema +-- !query schema struct<(2 < CAST(2.0 AS INT)):boolean> --- !query 20 output +-- !query output false --- !query 21 +-- !query select 2.0 < '2.2' --- !query 21 schema +-- !query schema struct<(CAST(2.0 AS DOUBLE) < CAST(2.2 AS DOUBLE)):boolean> --- !query 21 output +-- !query output true --- !query 22 +-- !query select 0.5 < '1.5' --- !query 22 schema +-- !query schema struct<(CAST(0.5 AS DOUBLE) < CAST(1.5 AS DOUBLE)):boolean> --- !query 22 output +-- !query output true --- !query 23 +-- !query select to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52') --- !query 23 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52')):boolean> --- !query 23 output +-- !query output false --- !query 24 +-- !query select to_date('2009-07-30 04:17:52') < '2009-07-30 04:17:52' --- !query 24 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') < CAST(2009-07-30 04:17:52 AS DATE)):boolean> --- !query 24 output +-- !query output false --- !query 25 +-- !query select 1 <= '1' --- !query 25 schema +-- !query schema struct<(1 <= CAST(1 AS INT)):boolean> --- !query 25 output +-- !query output true --- !query 26 +-- !query select 2 <= '1.0' --- !query 26 schema +-- !query schema struct<(2 <= CAST(1.0 AS INT)):boolean> --- !query 26 output +-- !query output false --- !query 27 +-- !query select 2 <= '2.0' --- !query 27 schema +-- !query schema struct<(2 <= CAST(2.0 AS INT)):boolean> --- !query 27 output +-- !query output true --- !query 28 +-- !query select 2.0 <= '2.2' --- !query 28 schema +-- !query 
schema struct<(CAST(2.0 AS DOUBLE) <= CAST(2.2 AS DOUBLE)):boolean> --- !query 28 output +-- !query output true --- !query 29 +-- !query select 0.5 <= '1.5' --- !query 29 schema +-- !query schema struct<(CAST(0.5 AS DOUBLE) <= CAST(1.5 AS DOUBLE)):boolean> --- !query 29 output +-- !query output true --- !query 30 +-- !query select to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52') --- !query 30 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52')):boolean> --- !query 30 output +-- !query output true --- !query 31 +-- !query select to_date('2009-07-30 04:17:52') <= '2009-07-30 04:17:52' --- !query 31 schema +-- !query schema struct<(to_date('2009-07-30 04:17:52') <= CAST(2009-07-30 04:17:52 AS DATE)):boolean> --- !query 31 output +-- !query output true --- !query 32 +-- !query select to_date('2017-03-01') = to_timestamp('2017-03-01 00:00:00') --- !query 32 schema +-- !query schema struct<(CAST(to_date('2017-03-01') AS TIMESTAMP) = to_timestamp('2017-03-01 00:00:00')):boolean> --- !query 32 output +-- !query output true --- !query 33 +-- !query select to_timestamp('2017-03-01 00:00:01') > to_date('2017-03-01') --- !query 33 schema +-- !query schema struct<(to_timestamp('2017-03-01 00:00:01') > CAST(to_date('2017-03-01') AS TIMESTAMP)):boolean> --- !query 33 output +-- !query output true --- !query 34 +-- !query select to_timestamp('2017-03-01 00:00:01') >= to_date('2017-03-01') --- !query 34 schema +-- !query schema struct<(to_timestamp('2017-03-01 00:00:01') >= CAST(to_date('2017-03-01') AS TIMESTAMP)):boolean> --- !query 34 output +-- !query output true --- !query 35 +-- !query select to_date('2017-03-01') < to_timestamp('2017-03-01 00:00:01') --- !query 35 schema +-- !query schema struct<(CAST(to_date('2017-03-01') AS TIMESTAMP) < to_timestamp('2017-03-01 00:00:01')):boolean> --- !query 35 output +-- !query output true --- !query 36 +-- !query select to_date('2017-03-01') <= to_timestamp('2017-03-01 
00:00:01') --- !query 36 schema +-- !query schema struct<(CAST(to_date('2017-03-01') AS TIMESTAMP) <= to_timestamp('2017-03-01 00:00:01')):boolean> --- !query 36 output +-- !query output true diff --git a/sql/core/src/test/resources/sql-tests/results/query_regex_column.sql.out b/sql/core/src/test/resources/sql-tests/results/query_regex_column.sql.out index 2dade86f35df9..2e93ee286fd47 100644 --- a/sql/core/src/test/resources/sql-tests/results/query_regex_column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/query_regex_column.sql.out @@ -2,312 +2,312 @@ -- Number of queries: 34 --- !query 0 +-- !query set spark.sql.parser.quotedRegexColumnNames=false --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output spark.sql.parser.quotedRegexColumnNames false --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, "1", "11"), (2, "2", "22"), (3, "3", "33"), (4, "4", "44"), (5, "5", "55"), (6, "6", "66") AS testData(key, value1, value2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData2 AS SELECT * FROM VALUES (1, 1, 1, 2), (1, 2, 1, 2), (2, 1, 2, 3), (2, 2, 2, 3), (3, 1, 3, 4), (3, 2, 3, 4) AS testData2(A, B, c, d) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT `(a)?+.+` FROM testData2 WHERE a = 1 --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a)?+.+`' given input columns: [testdata2.A, testdata2.B, testdata2.c, testdata2.d]; line 1 pos 7 --- !query 4 +-- !query SELECT t.`(a)?+.+` FROM testData2 t WHERE a = 1 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 't.`(a)?+.+`' given input columns: [t.A, t.B, t.c, t.d]; line 1 pos 7 --- !query 5 +-- !query 
SELECT `(a|b)` FROM testData2 WHERE a = 2 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a|b)`' given input columns: [testdata2.A, testdata2.B, testdata2.c, testdata2.d]; line 1 pos 7 --- !query 6 +-- !query SELECT `(a|b)?+.+` FROM testData2 WHERE a = 2 --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a|b)?+.+`' given input columns: [testdata2.A, testdata2.B, testdata2.c, testdata2.d]; line 1 pos 7 --- !query 7 +-- !query SELECT SUM(`(a|b)?+.+`) FROM testData2 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a|b)?+.+`' given input columns: [testdata2.A, testdata2.B, testdata2.c, testdata2.d]; line 1 pos 11 --- !query 8 +-- !query SELECT SUM(`(a)`) FROM testData2 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a)`' given input columns: [testdata2.A, testdata2.B, testdata2.c, testdata2.d]; line 1 pos 11 --- !query 9 +-- !query set spark.sql.parser.quotedRegexColumnNames=true --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output spark.sql.parser.quotedRegexColumnNames true --- !query 10 +-- !query SELECT `(a)?+.+` FROM testData2 WHERE a = 1 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 1 2 2 1 2 --- !query 11 +-- !query SELECT `(A)?+.+` FROM testData2 WHERE a = 1 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 1 2 2 1 2 --- !query 12 +-- !query SELECT t.`(a)?+.+` FROM testData2 t WHERE a = 1 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 1 2 2 1 2 --- !query 13 +-- !query SELECT t.`(A)?+.+` FROM testData2 t WHERE a = 1 --- !query 13 schema +-- !query 
schema struct --- !query 13 output +-- !query output 1 1 2 2 1 2 --- !query 14 +-- !query SELECT `(a|B)` FROM testData2 WHERE a = 2 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 2 1 2 2 --- !query 15 +-- !query SELECT `(A|b)` FROM testData2 WHERE a = 2 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 2 1 2 2 --- !query 16 +-- !query SELECT `(a|B)?+.+` FROM testData2 WHERE a = 2 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2 3 2 3 --- !query 17 +-- !query SELECT `(A|b)?+.+` FROM testData2 WHERE a = 2 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 2 3 2 3 --- !query 18 +-- !query SELECT `(e|f)` FROM testData2 --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT t.`(e|f)` FROM testData2 t --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT p.`(KEY)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1 11 1 1 1 2 1 11 2 1 1 2 2 22 1 2 2 3 2 22 2 2 2 3 --- !query 21 +-- !query SELECT p.`(key)?+.+`, b, testdata2.`(b)?+.+` FROM testData p join testData2 ON p.key = testData2.a WHERE key < 3 --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 11 1 1 1 2 1 11 2 1 1 2 2 22 1 2 2 3 2 22 2 2 2 3 --- !query 22 +-- !query set spark.sql.caseSensitive=true --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output spark.sql.caseSensitive true --- !query 23 +-- !query CREATE OR REPLACE TEMPORARY VIEW testdata3 AS SELECT * FROM VALUES (0, 1), (1, 2), (2, 3), (3, 4) AS testdata3(a, b) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query SELECT `(A)?+.+` FROM 
testdata3 --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 0 1 1 2 2 3 3 4 --- !query 25 +-- !query SELECT `(a)?+.+` FROM testdata3 --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 2 3 4 --- !query 26 +-- !query SELECT `(A)?+.+` FROM testdata3 WHERE a > 1 --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 2 3 3 4 --- !query 27 +-- !query SELECT `(a)?+.+` FROM testdata3 where `a` > 1 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 3 4 --- !query 28 +-- !query SELECT SUM(`a`) FROM testdata3 --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 6 --- !query 29 +-- !query SELECT SUM(`(a)`) FROM testdata3 --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 6 --- !query 30 +-- !query SELECT SUM(`(a)?+.+`) FROM testdata3 --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 10 --- !query 31 +-- !query SELECT SUM(a) FROM testdata3 GROUP BY `a` --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 0 1 2 3 --- !query 32 +-- !query SELECT SUM(a) FROM testdata3 GROUP BY `(a)` --- !query 32 schema +-- !query schema struct<> --- !query 32 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a)`' given input columns: [testdata3.a, testdata3.b]; line 1 pos 38 --- !query 33 +-- !query SELECT SUM(a) FROM testdata3 GROUP BY `(a)?+.+` --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`(a)?+.+`' given input columns: [testdata3.a, testdata3.b]; line 1 pos 38 diff --git a/sql/core/src/test/resources/sql-tests/results/random.sql.out b/sql/core/src/test/resources/sql-tests/results/random.sql.out index acd0609aabb16..9d00a82b76780 100644 --- a/sql/core/src/test/resources/sql-tests/results/random.sql.out 
+++ b/sql/core/src/test/resources/sql-tests/results/random.sql.out @@ -2,83 +2,83 @@ -- Number of queries: 10 --- !query 0 +-- !query SELECT rand(0) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 0.7604953758285915 --- !query 1 +-- !query SELECT rand(cast(3 / 7 AS int)) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 0.7604953758285915 --- !query 2 +-- !query SELECT rand(NULL) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 0.7604953758285915 --- !query 3 +-- !query SELECT rand(cast(NULL AS int)) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 0.7604953758285915 --- !query 4 +-- !query SELECT rand(1.0) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'rand(1.0BD)' due to data type mismatch: argument 1 requires (int or bigint) type, however, '1.0BD' is of decimal(2,1) type.; line 1 pos 7 --- !query 5 +-- !query SELECT randn(0L) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1.6034991609278433 --- !query 6 +-- !query SELECT randn(cast(3 / 7 AS long)) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1.6034991609278433 --- !query 7 +-- !query SELECT randn(NULL) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1.6034991609278433 --- !query 8 +-- !query SELECT randn(cast(NULL AS long)) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1.6034991609278433 --- !query 9 +-- !query SELECT rand('1') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'rand('1')' due to data type mismatch: argument 1 requires (int or bigint) type, however, ''1'' is of string type.; line 1 pos 7 diff --git 
a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out new file mode 100644 index 0000000000000..c92c1ddca774f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -0,0 +1,69 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 0) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 2) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 2 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)') +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0) +-- !query schema +struct +-- !query output +1a + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2) +-- !query schema +struct +-- !query output +a diff --git a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out index 1faf16cc30509..e8ee07171651d 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-create-table.sql.out @@ -1,222 +1,405 @@ -- Automatically generated by 
SQLQueryTestSuite --- Number of queries: 24 +-- Number of queries: 41 --- !query 0 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SHOW CREATE TABLE tbl --- !query 1 schema +-- !query schema struct --- !query 1 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet --- !query 2 +-- !query DROP TABLE tbl --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet OPTIONS ('a' 1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query SHOW CREATE TABLE tbl --- !query 4 schema +-- !query schema struct --- !query 4 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet OPTIONS ( - `a` '1' -) + `a` '1') --- !query 5 +-- !query DROP TABLE tbl --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet OPTIONS ('path' '/path/to/table') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query SHOW CREATE TABLE tbl --- !query 7 schema +-- !query schema struct --- !query 7 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet LOCATION 'file:/path/to/table' --- !query 8 +-- !query DROP TABLE tbl --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet LOCATION '/path/to/table' --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output 
--- !query 10 +-- !query SHOW CREATE TABLE tbl --- !query 10 schema +-- !query schema struct --- !query 10 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet LOCATION 'file:/path/to/table' --- !query 11 +-- !query DROP TABLE tbl --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet PARTITIONED BY (a) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query SHOW CREATE TABLE tbl --- !query 13 schema +-- !query schema struct --- !query 13 output -CREATE TABLE `tbl` (`b` STRING, `c` INT, `a` INT) +-- !query output +CREATE TABLE `tbl` ( + `b` STRING, + `c` INT, + `a` INT) USING parquet PARTITIONED BY (a) --- !query 14 +-- !query DROP TABLE tbl --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet CLUSTERED BY (a) SORTED BY (b ASC) INTO 2 BUCKETS --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query SHOW CREATE TABLE tbl --- !query 16 schema +-- !query schema struct --- !query 16 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet CLUSTERED BY (a) SORTED BY (b) INTO 2 BUCKETS --- !query 17 +-- !query DROP TABLE tbl --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet COMMENT 'This is a comment' --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query SHOW CREATE TABLE tbl --- !query 19 schema +-- !query schema struct --- !query 19 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, 
`c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet COMMENT 'This is a comment' --- !query 20 +-- !query DROP TABLE tbl --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet TBLPROPERTIES ('a' = '1') --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query SHOW CREATE TABLE tbl --- !query 22 schema +-- !query schema struct --- !query 22 output -CREATE TABLE `tbl` (`a` INT, `b` STRING, `c` INT) +-- !query output +CREATE TABLE `tbl` ( + `a` INT, + `b` STRING, + `c` INT) USING parquet TBLPROPERTIES ( - 'a' = '1' -) + 'a' = '1') --- !query 23 +-- !query DROP TABLE tbl --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output + + + +-- !query +CREATE TABLE tbl (a REAL, b NUMERIC, c NUMERIC(10), d NUMERIC(10,1)) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW CREATE TABLE tbl +-- !query schema +struct +-- !query output +CREATE TABLE `tbl` ( + `a` FLOAT, + `b` DECIMAL(10,0), + `c` DECIMAL(10,0), + `d` DECIMAL(10,1)) +USING parquet + + +-- !query +DROP TABLE tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE tbl (a INT, b STRING, c INT) USING parquet +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_SPARK_30302 (aaa, bbb) +AS SELECT a, b FROM tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 AS SERDE +-- !query schema +struct +-- !query output +CREATE VIEW `view_SPARK_30302`( + `aaa`, + `bbb`) +AS SELECT a, b FROM tbl + + +-- !query +DROP VIEW view_SPARK_30302 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_SPARK_30302 (aaa COMMENT 'comment with \'quoted text\' for aaa', bbb) +COMMENT 'This is a comment with \'quoted text\' for view' +AS SELECT a, b FROM 
tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 AS SERDE +-- !query schema +struct +-- !query output +CREATE VIEW `view_SPARK_30302`( + `aaa` COMMENT 'comment with \'quoted text\' for aaa', + `bbb`) +COMMENT 'This is a comment with \'quoted text\' for view' +AS SELECT a, b FROM tbl + + +-- !query +DROP VIEW view_SPARK_30302 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_SPARK_30302 (aaa, bbb) +TBLPROPERTIES ('a' = '1', 'b' = '2') +AS SELECT a, b FROM tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 AS SERDE +-- !query schema +struct +-- !query output +CREATE VIEW `view_SPARK_30302`( + `aaa`, + `bbb`) +TBLPROPERTIES ( + 'a' = '1', + 'b' = '2') +AS SELECT a, b FROM tbl + + +-- !query +DROP VIEW view_SPARK_30302 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE VIEW view_SPARK_30302 (aaa, bbb) +AS SELECT a, b FROM tbl +-- !query schema +struct<> +-- !query output + + + +-- !query +SHOW CREATE TABLE view_SPARK_30302 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Hive view isn't supported by SHOW CREATE TABLE; + + +-- !query +DROP VIEW view_SPARK_30302 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE tbl +-- !query schema +struct<> +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out index f22cb7e200e6c..501e185b07f7a 100644 --- a/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show-tables.sql.out @@ -2,67 +2,67 @@ -- Number of queries: 26 --- !query 0 +-- !query CREATE DATABASE showdb --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query USE showdb --- !query 1 schema +-- !query schema struct<> --- !query 1 
output +-- !query output --- !query 2 +-- !query CREATE TABLE show_t1(a String, b Int, c String, d String) USING parquet PARTITIONED BY (c, d) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query ALTER TABLE show_t1 ADD PARTITION (c='Us', d=1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE TABLE show_t2(b String, d Int) USING parquet --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TEMPORARY VIEW show_t3(e int) USING parquet --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE GLOBAL TEMP VIEW show_t4 AS SELECT 1 as col1 --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query SHOW TABLES --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output aggtest arraydata mapdata @@ -74,11 +74,11 @@ tenk1 testdata --- !query 8 +-- !query SHOW TABLES IN showdb --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output aggtest arraydata mapdata @@ -90,40 +90,40 @@ tenk1 testdata --- !query 9 +-- !query SHOW TABLES 'show_t*' --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output show_t1 show_t2 show_t3 --- !query 10 +-- !query SHOW TABLES LIKE 'show_t1*|show_t2*' --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output show_t1 show_t2 --- !query 11 +-- !query SHOW TABLES IN showdb 'show_t*' --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output show_t1 show_t2 show_t3 --- !query 12 +-- !query SHOW TABLE EXTENDED LIKE 'show_t*' --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output show_t3 true Table: show_t3 Created Time [not included in comparison] Last Access [not included in comparison] @@ -140,7 +140,7 @@ Last 
Access [not included in comparison] Created By [not included in comparison] Type: MANAGED Provider: parquet -Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t1 +Location [not included in comparison]/{warehouse_dir}/showdb.db/show_t1 Partition Provider: Catalog Partition Columns: [`c`, `d`] Schema: root @@ -157,17 +157,17 @@ Last Access [not included in comparison] Created By [not included in comparison] Type: MANAGED Provider: parquet -Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t2 +Location [not included in comparison]/{warehouse_dir}/showdb.db/show_t2 Schema: root |-- b: string (nullable = true) |-- d: integer (nullable = true) --- !query 13 +-- !query SHOW TABLE EXTENDED --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException mismatched input '' expecting {'FROM', 'IN', 'LIKE'}(line 1, pos 19) @@ -177,22 +177,22 @@ SHOW TABLE EXTENDED -------------------^^^ --- !query 14 +-- !query SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(c='Us', d=1) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output showdb show_t1 false Partition Values: [c=Us, d=1] -Location [not included in comparison]sql/core/spark-warehouse/showdb.db/show_t1/c=Us/d=1 +Location [not included in comparison]/{warehouse_dir}/showdb.db/show_t1/c=Us/d=1 Created Time [not included in comparison] Last Access [not included in comparison] --- !query 15 +-- !query SHOW TABLE EXTENDED PARTITION(c='Us', d=1) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException mismatched input 'PARTITION' expecting {'FROM', 'IN', 'LIKE'}(line 1, pos 20) @@ -202,87 +202,87 @@ SHOW TABLE EXTENDED PARTITION(c='Us', d=1) --------------------^^^ --- !query 16 +-- !query SHOW TABLE EXTENDED LIKE 'show_t*' PARTITION(c='Us', d=1) --- !query 16 schema +-- !query schema struct<> 
--- !query 16 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchTableException Table or view 'show_t*' not found in database 'showdb'; --- !query 17 +-- !query SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(c='Us') --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException Partition spec is invalid. The spec (c) must match the partition spec (c, d) defined in table '`showdb`.`show_t1`'; --- !query 18 +-- !query SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(a='Us', d=1) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.AnalysisException Partition spec is invalid. The spec (a, d) must match the partition spec (c, d) defined in table '`showdb`.`show_t1`'; --- !query 19 +-- !query SHOW TABLE EXTENDED LIKE 'show_t1' PARTITION(c='Ch', d=1) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchPartitionException Partition not found in table 'show_t1' database 'showdb': c -> Ch d -> 1; --- !query 20 +-- !query DROP TABLE show_t1 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query DROP TABLE show_t2 --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query DROP VIEW show_t3 --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output --- !query 23 +-- !query DROP VIEW global_temp.show_t4 --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query USE default --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 +-- !query DROP DATABASE showdb --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out 
b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out index 71d6e120e8943..4f5db7f6c6b2f 100644 --- a/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/show_columns.sql.out @@ -2,216 +2,216 @@ -- Number of queries: 25 --- !query 0 +-- !query CREATE DATABASE showdb --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query USE showdb --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TABLE showcolumn1 (col1 int, `col 2` int) USING json --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TABLE showcolumn2 (price int, qty int, year int, month int) USING parquet partitioned by (year, month) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE TEMPORARY VIEW showColumn3 (col3 int, `col 4` int) USING json --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE GLOBAL TEMP VIEW showColumn4 AS SELECT 1 as col1, 'abc' as `col 5` --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SHOW COLUMNS IN showcolumn1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output col 2 col1 --- !query 7 +-- !query SHOW COLUMNS IN showdb.showcolumn1 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output col 2 col1 --- !query 8 +-- !query SHOW COLUMNS IN showcolumn1 FROM showdb --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output col 2 col1 --- !query 9 +-- !query SHOW COLUMNS IN showcolumn2 IN showdb --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output month price qty year --- !query 10 +-- !query SHOW COLUMNS IN badtable FROM showdb --- !query 
10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchTableException Table or view 'badtable' not found in database 'showdb'; --- !query 11 +-- !query SHOW COLUMNS IN showdb.showcolumn1 from SHOWDB --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output col 2 col1 --- !query 12 +-- !query SHOW COLUMNS IN showdb.showcolumn1 FROM baddb --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException SHOW COLUMNS with conflicting databases: 'baddb' != 'showdb'; --- !query 13 +-- !query SHOW COLUMNS IN showcolumn3 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output col 4 col3 --- !query 14 +-- !query SHOW COLUMNS IN showdb.showcolumn3 --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchTableException Table or view 'showcolumn3' not found in database 'showdb'; --- !query 15 +-- !query SHOW COLUMNS IN showcolumn3 FROM showdb --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchTableException Table or view 'showcolumn3' not found in database 'showdb'; --- !query 16 +-- !query SHOW COLUMNS IN showcolumn4 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.catalyst.analysis.NoSuchTableException Table or view 'showcolumn4' not found in database 'showdb'; --- !query 17 +-- !query SHOW COLUMNS IN global_temp.showcolumn4 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output col 5 col1 --- !query 18 +-- !query SHOW COLUMNS IN showcolumn4 FROM global_temp --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output col 5 col1 --- !query 19 +-- !query DROP TABLE showcolumn1 --- !query 19 schema +-- !query schema struct<> --- 
!query 19 output +-- !query output --- !query 20 +-- !query DROP TABLE showColumn2 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query DROP VIEW showcolumn3 --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query DROP VIEW global_temp.showcolumn4 --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output --- !query 23 +-- !query use default --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query DROP DATABASE showdb --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out index 69a8e958000db..6f1bbd03bc223 100644 --- a/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/sql-compatibility-functions.sql.out @@ -2,114 +2,114 @@ -- Number of queries: 14 --- !query 0 +-- !query SELECT ifnull(null, 'x'), ifnull('y', 'x'), ifnull(null, null) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output x y NULL --- !query 1 +-- !query SELECT nullif('x', 'x'), nullif('x', 'y') --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL x --- !query 2 +-- !query SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output x y NULL --- !query 3 +-- !query SELECT nvl2(null, 'x', 'y'), nvl2('n', 'x', 'y'), nvl2(null, null, null) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output y x NULL --- !query 4 +-- !query SELECT ifnull(1, 2.1d), ifnull(null, 2.1d) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1.0 
2.1 --- !query 5 +-- !query SELECT nullif(1, 2.1d), nullif(1, 1.0d) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 NULL --- !query 6 +-- !query SELECT nvl(1, 2.1d), nvl(null, 2.1d) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1.0 2.1 --- !query 7 +-- !query SELECT nvl2(null, 1, 2.1d), nvl2('n', 1, 2.1d) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 2.1 1.0 --- !query 8 +-- !query SELECT boolean(1), tinyint(1), smallint(1), int(1), bigint(1) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output true 1 1 1 1 --- !query 9 +-- !query SELECT float(1), double(1), decimal(1) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1.0 1.0 1 --- !query 10 +-- !query SELECT date("2014-04-04"), timestamp(date("2014-04-04")) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 2014-04-04 2014-04-04 00:00:00 --- !query 11 +-- !query SELECT string(1, 2) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException Function string accepts only one argument; line 1 pos 7 --- !query 12 +-- !query CREATE TEMPORARY VIEW tempView1 AS VALUES (1, NAMED_STRUCT('col1', 'gamma', 'col2', 'delta')) AS T(id, st) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query SELECT nvl(st.col1, "value"), count(*) FROM from tempView1 GROUP BY nvl(st.col1, "value") --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output gamma 1 diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 87c3e04017643..33d1b25aee483 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -2,267 +2,267 @@ -- Number of queries: 33 --- !query 0 +-- !query select concat_ws() --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output org.apache.spark.sql.AnalysisException requirement failed: concat_ws requires at least one argument.; line 1 pos 7 --- !query 1 +-- !query select format_string() --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output org.apache.spark.sql.AnalysisException requirement failed: format_string() should take at least 1 argument; line 1 pos 7 --- !query 2 +-- !query select 'a' || 'b' || 'c' --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output abc --- !query 3 +-- !query select replace('abc', 'b', '123') --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output a123c --- !query 4 +-- !query select replace('abc', 'b') --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output ac --- !query 5 +-- !query select length(uuid()), (uuid() <> uuid()) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 36 true --- !query 6 +-- !query select position('bar' in 'foobarbar'), position(null, 'foobarbar'), position('aaads', null) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 4 NULL NULL --- !query 7 +-- !query select left("abcd", 2), left("abcd", 5), left("abcd", '2'), left("abcd", null) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output ab abcd ab NULL --- !query 8 +-- !query select left(null, -2), left("abcd", -2), left("abcd", 0), left("abcd", 'a') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL NULL --- !query 9 +-- !query select right("abcd", 2), right("abcd", 5), right("abcd", '2'), right("abcd", null) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output cd abcd cd 
NULL --- !query 10 +-- !query select right(null, -2), right("abcd", -2), right("abcd", 0), right("abcd", 'a') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL NULL --- !query 11 +-- !query SELECT split('aa1cc2ee3', '[1-9]+') --- !query 11 schema +-- !query schema struct> --- !query 11 output +-- !query output ["aa","cc","ee",""] --- !query 12 +-- !query SELECT split('aa1cc2ee3', '[1-9]+', 2) --- !query 12 schema +-- !query schema struct> --- !query 12 output +-- !query output ["aa","cc2ee3"] --- !query 13 +-- !query SELECT substr('Spark SQL', 5) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output k SQL --- !query 14 +-- !query SELECT substr('Spark SQL', -3) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output SQL --- !query 15 +-- !query SELECT substr('Spark SQL', 5, 1) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output k --- !query 16 +-- !query SELECT substr('Spark SQL' from 5) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output k SQL --- !query 17 +-- !query SELECT substr('Spark SQL' from -3) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output SQL --- !query 18 +-- !query SELECT substr('Spark SQL' from 5 for 1) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output k --- !query 19 +-- !query SELECT substring('Spark SQL', 5) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output k SQL --- !query 20 +-- !query SELECT substring('Spark SQL', -3) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output SQL --- !query 21 +-- !query SELECT substring('Spark SQL', 5, 1) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output k --- !query 22 +-- !query SELECT substring('Spark SQL' from 5) --- !query 22 schema +-- !query schema struct --- !query 22 output 
+-- !query output k SQL --- !query 23 +-- !query SELECT substring('Spark SQL' from -3) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output SQL --- !query 24 +-- !query SELECT substring('Spark SQL' from 5 for 1) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output k --- !query 25 +-- !query SELECT trim('yxTomxx', 'xyz'), trim(BOTH 'xyz' FROM 'yxTomxx'), trim('xyz' FROM 'yxTomxx') --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output Tom Tom Tom --- !query 26 +-- !query SELECT trim('xxxbarxxx', 'x'), trim(BOTH 'x' FROM 'xxxbarxxx'), trim('x' FROM 'xxxbarxxx') --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output bar bar bar --- !query 27 +-- !query SELECT ltrim('zzzytest', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytest') --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output test test --- !query 28 +-- !query SELECT ltrim('zzzytestxyz', 'xyz'), trim(LEADING 'xyz' FROM 'zzzytestxyz') --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output testxyz testxyz --- !query 29 +-- !query SELECT ltrim('xyxXxyLAST WORD', 'xy'), trim(LEADING 'xy' FROM 'xyxXxyLAST WORD') --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output XxyLAST WORD XxyLAST WORD --- !query 30 +-- !query SELECT rtrim('testxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'testxxzx') --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output test test --- !query 31 +-- !query SELECT rtrim('xyztestxxzx', 'xyz'), trim(TRAILING 'xyz' FROM 'xyztestxxzx') --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output xyztest xyztest --- !query 32 +-- !query SELECT rtrim('TURNERyxXxy', 'xy'), trim(TRAILING 'xy' FROM 'TURNERyxXxy') --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output TURNERyxX TURNERyxX diff --git 
a/sql/core/src/test/resources/sql-tests/results/struct.sql.out b/sql/core/src/test/resources/sql-tests/results/struct.sql.out index 1da33bc736f0b..f294c5213d319 100644 --- a/sql/core/src/test/resources/sql-tests/results/struct.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/struct.sql.out @@ -2,89 +2,89 @@ -- Number of queries: 9 --- !query 0 +-- !query CREATE TEMPORARY VIEW tbl_x AS VALUES (1, NAMED_STRUCT('C', 'gamma', 'D', 'delta')), (2, NAMED_STRUCT('C', 'epsilon', 'D', 'eta')), (3, NAMED_STRUCT('C', 'theta', 'D', 'iota')) AS T(ID, ST) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT STRUCT('alpha', 'beta') ST --- !query 1 schema +-- !query schema struct> --- !query 1 output +-- !query output {"col1":"alpha","col2":"beta"} --- !query 2 +-- !query SELECT STRUCT('alpha' AS A, 'beta' AS B) ST --- !query 2 schema +-- !query schema struct> --- !query 2 output +-- !query output {"A":"alpha","B":"beta"} --- !query 3 +-- !query SELECT ID, STRUCT(ST.*) NST FROM tbl_x --- !query 3 schema +-- !query schema struct> --- !query 3 output +-- !query output 1 {"C":"gamma","D":"delta"} 2 {"C":"epsilon","D":"eta"} 3 {"C":"theta","D":"iota"} --- !query 4 +-- !query SELECT ID, STRUCT(ST.*,CAST(ID AS STRING) AS E) NST FROM tbl_x --- !query 4 schema +-- !query schema struct> --- !query 4 output +-- !query output 1 {"C":"gamma","D":"delta","E":"1"} 2 {"C":"epsilon","D":"eta","E":"2"} 3 {"C":"theta","D":"iota","E":"3"} --- !query 5 +-- !query SELECT ID, STRUCT(CAST(ID AS STRING) AS AA, ST.*) NST FROM tbl_x --- !query 5 schema +-- !query schema struct> --- !query 5 output +-- !query output 1 {"AA":"1","C":"gamma","D":"delta"} 2 {"AA":"2","C":"epsilon","D":"eta"} 3 {"AA":"3","C":"theta","D":"iota"} --- !query 6 +-- !query SELECT ID, STRUCT(ST.*).C NST FROM tbl_x --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 gamma 2 epsilon 3 theta --- !query 7 +-- !query SELECT ID, 
STRUCT(ST.C, ST.D).D NST FROM tbl_x --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 delta 2 eta 3 iota --- !query 8 +-- !query SELECT ID, STRUCT(ST.C as STC, ST.D as STD).STD FROM tbl_x --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 delta 2 eta 3 iota diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out index 97f494cc05063..9f11b46d4088b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-aggregate.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 11 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,13 +46,13 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query 
output --- !query 3 +-- !query SELECT emp.dept_id, avg(salary), sum(salary) @@ -61,25 +61,25 @@ WHERE EXISTS (SELECT state FROM dept WHERE dept.dept_id = emp.dept_id) GROUP BY dept_id --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 10 133.33333333333334 400.0 20 300.0 300.0 30 400.0 400.0 70 150.0 150.0 --- !query 4 +-- !query SELECT emp_name FROM emp WHERE EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept.dept_id) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output emp 1 emp 1 emp 2 @@ -88,20 +88,20 @@ emp 4 emp 8 --- !query 5 +-- !query SELECT count(*) FROM emp WHERE EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept.dept_id) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 6 --- !query 6 +-- !query SELECT * FROM bonus WHERE EXISTS (SELECT 1 @@ -111,9 +111,9 @@ WHERE EXISTS (SELECT 1 FROM dept WHERE emp.dept_id = dept.dept_id GROUP BY dept.dept_id)) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -122,7 +122,7 @@ emp 3 300.0 emp 4 100.0 --- !query 7 +-- !query SELECT emp.dept_id, Avg(salary), Sum(salary) @@ -131,42 +131,42 @@ WHERE NOT EXISTS (SELECT state FROM dept WHERE dept.dept_id = emp.dept_id) GROUP BY dept_id --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 100 400.0 800.0 NULL 400.0 400.0 --- !query 8 +-- !query SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept.dept_id) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output emp 5 emp 6 - no dept emp 7 --- !query 9 +-- !query SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a FROM dept WHERE dept.dept_id = emp.dept_id GROUP BY dept.dept_id) --- !query 9 schema +-- !query schema struct --- !query 9 
output +-- !query output 3 --- !query 10 +-- !query SELECT * FROM bonus WHERE NOT EXISTS (SELECT 1 @@ -176,8 +176,8 @@ WHERE NOT EXISTS (SELECT 1 FROM dept WHERE emp.dept_id = dept.dept_id GROUP BY dept.dept_id)) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output emp 5 1000.0 emp 6 - no dept 500.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-basic.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-basic.sql.out index 900e4d573bef1..a54fb47fe34f8 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-basic.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 13 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,22 +46,22 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output 
--- !query 3 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT 1 FROM dept WHERE dept.dept_id > 10 AND dept.dept_id < 30) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -73,15 +73,15 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 4 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE emp.dept_id = dept.dept_id) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -90,16 +90,16 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 5 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE emp.dept_id = dept.dept_id OR emp.dept_id IS NULL) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -109,92 +109,92 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 6 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE emp.dept_id = dept.dept_id) AND emp.id > 200 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 300 emp 3 2002-01-01 300.0 20 400 emp 4 2005-01-01 400.0 30 800 emp 8 2016-01-01 150.0 70 --- !query 7 +-- !query SELECT emp.emp_name FROM emp WHERE EXISTS (SELECT dept.state FROM dept WHERE emp.dept_id = dept.dept_id) AND emp.id > 200 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output emp 3 emp 4 emp 8 --- !query 8 +-- !query SELECT * FROM dept WHERE NOT EXISTS (SELECT emp_name FROM emp WHERE emp.dept_id = dept.dept_id) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 40 dept 4 - unassigned OR 50 dept 5 - unassigned NJ --- !query 9 +-- !query SELECT * FROM dept WHERE NOT EXISTS (SELECT emp_name FROM emp WHERE emp.dept_id 
= dept.dept_id OR state = 'NJ') --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 40 dept 4 - unassigned OR --- !query 10 +-- !query SELECT * FROM bonus WHERE NOT EXISTS (SELECT * FROM emp WHERE emp.emp_name = emp_name AND bonus_amt > emp.salary) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 emp 4 100.0 --- !query 11 +-- !query SELECT emp.* FROM emp WHERE NOT EXISTS (SELECT NULL FROM bonus WHERE bonus.emp_name = emp.emp_name) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 700 emp 7 2010-01-01 400.0 100 800 emp 8 2016-01-01 150.0 70 --- !query 12 +-- !query SELECT * FROM bonus WHERE EXISTS (SELECT emp_name @@ -203,9 +203,9 @@ WHERE EXISTS (SELECT emp_name AND EXISTS (SELECT state FROM dept WHERE dept.dept_id = emp.dept_id)) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-cte.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-cte.sql.out index c6c1c04e1c73d..3c8a19998a786 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-cte.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * 
FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,13 +46,13 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query WITH bonus_cte AS (SELECT * FROM bonus @@ -73,16 +73,16 @@ WHERE a.bonus_amt > 30 AND EXISTS (SELECT 1 FROM bonus_cte b WHERE a.emp_name = b.emp_name) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output emp 2 100.0 emp 2 300.0 emp 3 300.0 emp 4 100.0 --- !query 4 +-- !query WITH emp_cte AS (SELECT * FROM emp @@ -99,16 +99,16 @@ WHERE EXISTS (SELECT * JOIN dept_cte b ON a.dept_id = b.dept_id WHERE bonus.emp_name = a.emp_name) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 emp 2 300.0 --- !query 5 +-- !query WITH emp_cte AS (SELECT * FROM emp @@ -130,9 +130,9 @@ WHERE e.dept_id = d.dept_id LEFT JOIN dept_cte b ON a.dept_id = b.dept_id WHERE e.emp_name = a.emp_name) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -140,7 +140,7 @@ emp 2 300.0 emp 3 300.0 --- !query 6 +-- !query WITH empdept AS (SELECT id, salary, @@ -159,9 +159,9 @@ WHERE EXISTS (SELECT dept_id, GROUP BY dept_id HAVING count(*) > 1) GROUP BY emp_name --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output emp 1 30.0 emp 2 400.0 emp 3 300.0 @@ -170,7 +170,7 @@ emp 5 1000.0 emp 6 - no dept 500.0 --- !query 7 +-- !query WITH empdept AS (SELECT 
id, salary, @@ -189,9 +189,9 @@ WHERE NOT EXISTS (SELECT dept_id, GROUP BY dept_id HAVING count(*) < 1) GROUP BY emp_name --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output emp 1 30.0 emp 2 400.0 emp 3 300.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-having.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-having.sql.out index de90f5e260e1b..aa4d2ab7e4133 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-having.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,22 +46,22 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT dept_id, count(*) FROM emp GROUP BY dept_id HAVING EXISTS 
(SELECT 1 FROM bonus WHERE bonus_amt < min(emp.salary)) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 10 3 100 2 20 1 @@ -70,7 +70,7 @@ struct NULL 1 --- !query 4 +-- !query SELECT * FROM dept WHERE EXISTS (SELECT dept_id, @@ -80,9 +80,9 @@ WHERE EXISTS (SELECT dept_id, HAVING EXISTS (SELECT 1 FROM bonus WHERE bonus_amt < Min(emp.salary))) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 10 dept 1 CA 20 dept 2 NY 30 dept 3 TX @@ -91,7 +91,7 @@ struct 70 dept 7 FL --- !query 5 +-- !query SELECT dept_id, Max(salary) FROM emp gp @@ -103,9 +103,9 @@ WHERE EXISTS (SELECT dept_id, FROM bonus WHERE bonus_amt < Min(p.salary))) GROUP BY gp.dept_id --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 10 200.0 100 400.0 20 300.0 @@ -114,7 +114,7 @@ struct NULL 400.0 --- !query 6 +-- !query SELECT * FROM dept WHERE EXISTS (SELECT dept_id, @@ -124,9 +124,9 @@ WHERE EXISTS (SELECT dept_id, HAVING EXISTS (SELECT 1 FROM bonus WHERE bonus_amt > Min(emp.salary))) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 10 dept 1 CA 20 dept 2 NY 30 dept 3 TX @@ -135,7 +135,7 @@ struct 70 dept 7 FL --- !query 7 +-- !query SELECT * FROM dept WHERE EXISTS (SELECT dept_id, @@ -147,7 +147,7 @@ WHERE EXISTS (SELECT dept_id, FROM bonus WHERE ( bonus_amt > min(emp.salary) AND count(emp.dept_id) > 1 ))) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 10 dept 1 CA diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out index c488cba01d4d0..1a5294930422a 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-joins-and-set-ops.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 17 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,13 +46,13 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM emp, dept @@ -60,9 +60,9 @@ WHERE emp.dept_id = dept.dept_id AND EXISTS (SELECT * FROM bonus WHERE bonus.emp_name = emp.emp_name) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 100 emp 1 2005-01-01 100.0 10 10 dept 1 CA 100 emp 1 2005-01-01 100.0 10 10 dept 1 CA 200 emp 2 2003-01-01 200.0 10 10 dept 1 CA @@ -70,7 +70,7 @@ struct --- !query 4 output +-- !query output 100 emp 1 2005-01-01 100.0 10 10 dept 1 CA 100 emp 1 2005-01-01 100.0 10 10 dept 1 CA 200 emp 2 2003-01-01 200.0 10 10 dept 1 CA @@ -88,7 +88,7 @@ struct --- !query 5 output +-- !query output 100 emp 1 2005-01-01 100.0 10 
10 dept 1 CA 100 emp 1 2005-01-01 100.0 10 10 dept 1 CA 200 emp 2 2003-01-01 200.0 10 10 dept 1 CA @@ -108,7 +108,7 @@ struct --- !query 6 output +-- !query output 800 emp 8 2016-01-01 150.0 70 70 dept 7 FL --- !query 7 +-- !query SELECT * FROM bonus WHERE EXISTS (SELECT * @@ -130,9 +130,9 @@ WHERE EXISTS (SELECT * JOIN dept ON dept.dept_id = emp.dept_id WHERE bonus.emp_name = emp.emp_name) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -141,7 +141,7 @@ emp 3 300.0 emp 4 100.0 --- !query 8 +-- !query SELECT * FROM bonus WHERE EXISTS (SELECT * @@ -149,9 +149,9 @@ WHERE EXISTS (SELECT * RIGHT JOIN dept ON dept.dept_id = emp.dept_id WHERE bonus.emp_name = emp.emp_name) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -160,7 +160,7 @@ emp 3 300.0 emp 4 100.0 --- !query 9 +-- !query SELECT * FROM bonus WHERE EXISTS (SELECT dept.dept_id, @@ -174,9 +174,9 @@ WHERE EXISTS (SELECT dept.dept_id, GROUP BY dept.dept_id, emp.emp_name ORDER BY emp.emp_name) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -185,7 +185,7 @@ emp 3 300.0 emp 4 100.0 --- !query 10 +-- !query SELECT emp_name, Sum(bonus_amt) FROM bonus @@ -199,13 +199,13 @@ WHERE EXISTS (SELECT emp_name, HAVING Count(*) > 1 ORDER BY emp_name) GROUP BY emp_name --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output emp 1 30.0 --- !query 11 +-- !query SELECT emp_name, Sum(bonus_amt) FROM bonus @@ -219,9 +219,9 @@ WHERE NOT EXISTS (SELECT emp_name, HAVING Count(*) > 1 ORDER BY emp_name) GROUP BY emp_name --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output emp 2 400.0 emp 3 300.0 emp 4 100.0 @@ -229,7 +229,7 @@ emp 5 1000.0 emp 6 - no dept 500.0 --- !query 12 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT * @@ -240,9 +240,9 @@ WHERE EXISTS (SELECT 
* FROM dept WHERE dept_id >= 30 AND dept_id <= 50) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -254,7 +254,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 13 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT * @@ -265,13 +265,13 @@ WHERE EXISTS (SELECT * FROM dept WHERE dept_id >= 30 AND dept_id <= 50) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output --- !query 14 +-- !query SELECT * FROM emp WHERE NOT EXISTS (SELECT * @@ -282,9 +282,9 @@ WHERE NOT EXISTS (SELECT * FROM dept WHERE dept_id >= 30 AND dept_id <= 50) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -296,7 +296,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 15 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT * @@ -316,9 +316,9 @@ WHERE EXISTS (SELECT * FROM dept WHERE dept_id >= 30 AND dept_id <= 50) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -330,7 +330,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 16 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT * @@ -350,9 +350,9 @@ WHERE EXISTS (SELECT * FROM dept WHERE dept_id >= 30 AND dept_id <= 50) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 300 emp 3 2002-01-01 300.0 20 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-orderby-limit.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-orderby-limit.sql.out index ee13ff2c4f38d..ebd4da6ccbd5d 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-orderby-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-orderby-limit.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 12 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,13 +46,13 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_id @@ -60,9 +60,9 @@ WHERE EXISTS (SELECT dept.dept_id WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 300 emp 3 2002-01-01 300.0 20 200 emp 2 2003-01-01 200.0 10 100 emp 1 2005-01-01 100.0 10 @@ -71,7 +71,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 4 +-- !query SELECT id, hiredate FROM emp @@ -80,9 +80,9 @@ WHERE EXISTS (SELECT dept.dept_id WHERE emp.dept_id = dept.dept_id 
ORDER BY state) ORDER BY hiredate DESC --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 800 2016-01-01 100 2005-01-01 100 2005-01-01 @@ -91,7 +91,7 @@ struct 300 2002-01-01 --- !query 5 +-- !query SELECT * FROM emp WHERE NOT EXISTS (SELECT dept.dept_id @@ -99,15 +99,15 @@ WHERE NOT EXISTS (SELECT dept.dept_id WHERE emp.dept_id = dept.dept_id ORDER BY state) ORDER BY hiredate --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 500 emp 5 2001-01-01 400.0 NULL 600 emp 6 - no dept 2001-01-01 400.0 100 700 emp 7 2010-01-01 400.0 100 --- !query 6 +-- !query SELECT emp_name FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a @@ -115,15 +115,15 @@ WHERE NOT EXISTS (SELECT max(dept.dept_id) a WHERE dept.dept_id = emp.dept_id GROUP BY state ORDER BY state) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output emp 5 emp 6 - no dept emp 7 --- !query 7 +-- !query SELECT count(*) FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) a @@ -131,22 +131,22 @@ WHERE NOT EXISTS (SELECT max(dept.dept_id) a WHERE dept.dept_id = emp.dept_id GROUP BY dept_id ORDER BY dept_id) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 3 --- !query 8 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 10 LIMIT 1) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -158,16 +158,16 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 9 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT max(dept.dept_id) FROM dept GROUP BY state LIMIT 1) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -179,16 +179,16 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 10 +-- !query SELECT * FROM emp WHERE 
NOT EXISTS (SELECT dept.dept_name FROM dept WHERE dept.dept_id > 100 LIMIT 1) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -200,7 +200,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 11 +-- !query SELECT * FROM emp WHERE NOT EXISTS (SELECT max(dept.dept_id) @@ -208,9 +208,9 @@ WHERE NOT EXISTS (SELECT max(dept.dept_id) WHERE dept.dept_id > 100 GROUP BY state LIMIT 1) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-within-and-or.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-within-and-or.sql.out index 865e4ed14e4ab..6a17c2fc86d40 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-within-and-or.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/exists-subquery/exists-within-and-or.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (100, "emp 1", date "2005-01-01", 100.00D, 10), (100, "emp 1", date "2005-01-01", 100.00D, 10), @@ -14,13 +14,13 @@ CREATE TEMPORARY VIEW EMP AS SELECT * FROM VALUES (700, "emp 7", date "2010-01-01", 400.00D, 100), (800, "emp 8", date "2016-01-01", 150.00D, 70) AS EMP(id, emp_name, hiredate, salary, dept_id) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (10, "dept 1", "CA"), (20, "dept 2", "NY"), @@ -29,13 +29,13 @@ CREATE TEMPORARY VIEW DEPT AS SELECT * FROM VALUES (50, "dept 5 - unassigned", "NJ"), (70, "dept 7", "FL") AS DEPT(dept_id, dept_name, state) --- !query 1 schema +-- !query schema struct<> --- !query 
1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 1", 10.00D), ("emp 1", 20.00D), @@ -46,22 +46,22 @@ CREATE TEMPORARY VIEW BONUS AS SELECT * FROM VALUES ("emp 5", 1000.00D), ("emp 6 - no dept", 500.00D) AS BONUS(emp_name, bonus_amt) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT emp.emp_name FROM emp WHERE EXISTS (SELECT dept.state FROM dept WHERE emp.dept_id = dept.dept_id) OR emp.id > 200 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output emp 1 emp 1 emp 2 @@ -73,16 +73,16 @@ emp 7 emp 8 --- !query 4 +-- !query SELECT * FROM emp WHERE EXISTS (SELECT dept.dept_name FROM dept WHERE emp.dept_id = dept.dept_id) OR emp.dept_id IS NULL --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 100 emp 1 2005-01-01 100.0 10 100 emp 1 2005-01-01 100.0 10 200 emp 2 2003-01-01 200.0 10 @@ -92,7 +92,7 @@ struct 800 emp 8 2016-01-01 150.0 70 --- !query 5 +-- !query SELECT emp.emp_name FROM emp WHERE EXISTS (SELECT dept.state @@ -103,14 +103,14 @@ WHERE EXISTS (SELECT dept.state FROM dept WHERE emp.dept_id = dept.dept_id AND dept.dept_id = 30) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output emp 3 emp 4 --- !query 6 +-- !query SELECT * FROM bonus WHERE ( NOT EXISTS (SELECT * @@ -121,9 +121,9 @@ WHERE ( NOT EXISTS (SELECT * FROM emp WHERE emp.emp_name = emp_name OR bonus_amt < emp.salary) ) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 @@ -134,7 +134,7 @@ emp 5 1000.0 emp 6 - no dept 500.0 --- !query 7 +-- !query SELECT * FROM bonus WHERE NOT EXISTS ( SELECT * @@ -147,9 +147,9 @@ emp_name IN SELECT emp_name FROM emp WHERE bonus_amt < emp.salary) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output emp 1 10.0 emp 1 20.0 emp 2 100.0 diff --git 
a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out index 686fe4975379b..a33f78abf27f9 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-basic.sql.out @@ -2,44 +2,44 @@ -- Number of queries: 7 --- !query 0 +-- !query create temporary view tab_a as select * from values (1, 1) as tab_a(a1, b1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view tab_b as select * from values (1, 1) as tab_b(a2, b2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view struct_tab as select struct(col1 as a, col2 as b) as record from values (1, 1), (1, 2), (2, 1), (2, 2) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query select 1 from tab_a where (a1, b1) not in (select a2, b2 from tab_b) --- !query 3 schema +-- !query schema struct<1:int> --- !query 3 output +-- !query output --- !query 4 +-- !query select 1 from tab_a where (a1, b1) not in (select (a2, b2) from tab_b) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(named_struct('a1', tab_a.`a1`, 'b1', tab_a.`b1`) IN (listquery()))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the @@ -52,19 +52,19 @@ Right side columns: [`named_struct(a2, a2, b2, b2)`].; --- !query 5 +-- !query select count(*) from struct_tab where record in (select (a2 as a, b2 as b) from tab_b) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 --- !query 6 +-- !query select count(*) from struct_tab where record not in (select (a2 as a, 
b2 as b) from tab_b) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 3 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out index a159aa81eff1c..f378664014fdb 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-group-by.sql.out @@ -2,86 +2,86 @@ -- Number of queries: 19 --- !query 0 +-- !query create temporary view t1 as select * from values - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("t1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("t1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("t1b", 8S, 16, 
19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("t1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("t1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values - ("t2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("t1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("t2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1f", 19S, null, 519L, float(17), 25D, 
26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("t1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("t1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("t2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("t1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("t2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 
01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values - ("t3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("t3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("t3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("t3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("t1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("t3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("t3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("t3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, 
float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("t3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("t3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("t1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t1a, Avg(t1b) FROM t1 WHERE t1a IN (SELECT t2a FROM t2) GROUP BY t1a --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output t1b 8.0 t1c 8.0 t1e 10.0 --- !query 4 +-- !query SELECT t1a, Max(t1b) FROM t1 @@ -90,13 +90,13 @@ WHERE t1b IN (SELECT t2b WHERE t1a = t2a) GROUP BY t1a, t1d --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output t1b 8 --- !query 5 +-- !query SELECT t1a, t1b FROM t1 @@ -105,14 +105,14 @@ WHERE t1c IN (SELECT t2c WHERE t1a = t2a) GROUP BY t1a, t1b --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output t1b 8 t1c 8 --- !query 6 +-- !query SELECT t1a, Sum(DISTINCT( t1b )) FROM t1 @@ -124,14 +124,14 @@ WHERE t1c IN (SELECT t2c WHERE t1a = t3a) GROUP BY t1a, t1c --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output t1b 8 t1c 8 --- !query 7 +-- !query SELECT t1a, Sum(DISTINCT( t1b )) FROM t1 @@ -143,13 +143,13 @@ WHERE t1c IN (SELECT t2c WHERE t1a = t3a) GROUP BY t1a, t1c --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output t1b 8 --- !query 8 +-- !query SELECT t1a, 
Count(DISTINCT( t1b )) FROM t1 @@ -159,21 +159,21 @@ WHERE t1c IN (SELECT t2c GROUP BY t1a, t1c HAVING t1a = "t1b" --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output t1b 1 --- !query 9 +-- !query SELECT * FROM t1 WHERE t1b IN (SELECT Max(t2b) FROM t2 GROUP BY t2a) --- !query 9 schema -struct --- !query 9 output +-- !query schema +struct +-- !query output t1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 t1a 6 8 10 15.0 20.0 2000 2014-04-04 01:02:00.001 2014-04-04 t1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 @@ -184,7 +184,7 @@ t1e 10 NULL 19 17.0 25.0 2600 2014-09-04 01:02:00.001 2014-09-04 t1e 10 NULL 25 17.0 25.0 2600 2014-08-04 01:01:00 2014-08-04 --- !query 10 +-- !query SELECT * FROM (SELECT t2a, t2b @@ -194,13 +194,13 @@ FROM (SELECT t2a, WHERE t1b = t2b) GROUP BY t2a, t2b) t2 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output t1b 8 --- !query 11 +-- !query SELECT Count(DISTINCT( * )) FROM t1 WHERE t1b IN (SELECT Min(t2b) @@ -208,13 +208,13 @@ WHERE t1b IN (SELECT Min(t2b) WHERE t1a = t2a AND t1c = t2c GROUP BY t2a) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 --- !query 12 +-- !query SELECT t1a, t1b FROM t1 @@ -224,14 +224,14 @@ WHERE t1c IN (SELECT Max(t2c) GROUP BY t2a, t2c HAVING t2c > 8) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output t1b 8 t1c 8 --- !query 13 +-- !query SELECT t1a, t1b FROM t1 @@ -242,9 +242,9 @@ WHERE t1c IN (SELECT t2c WHERE t3a = t2a GROUP BY t3b) GROUP BY t2c) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output t1a 16 t1a 16 t1b 8 @@ -253,7 +253,7 @@ t1d NULL t1d NULL --- !query 14 +-- !query SELECT t1a, Min(t1b) FROM t1 @@ -262,14 +262,14 @@ WHERE t1c IN (SELECT Min(t2c) WHERE t2b = t1b GROUP BY t2a) GROUP BY t1a --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output t1b 8 t1c 8 --- !query 15 +-- !query 
SELECT t1a, Min(t1b) FROM t1 @@ -282,16 +282,16 @@ WHERE t1c IN (SELECT Min(t2c) GROUP BY t2c) GROUP BY t1a, t1d --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output t1b 8 t1c 8 t1d NULL t1d NULL --- !query 16 +-- !query SELECT t1a, Min(t1b) FROM t1 @@ -304,14 +304,14 @@ WHERE t1c IN (SELECT Min(t2c) WHERE t1c = t3c GROUP BY t3d) GROUP BY t1a --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output t1b 8 t1c 8 --- !query 17 +-- !query SELECT t1a, Min(t1b) FROM t1 @@ -324,16 +324,16 @@ WHERE t1c IN (SELECT Min(t2c) WHERE t1c = t3c GROUP BY t3d) GROUP BY t1a --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output t1a 16 t1b 8 t1c 8 t1d NULL --- !query 18 +-- !query SELECT t1a, Min(t1b) FROM t1 @@ -349,9 +349,9 @@ WHERE t1c IN (SELECT Min(t2c) HAVING t3d = t1d) GROUP BY t1a HAVING Min(t1b) IS NOT NULL --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output t1a 16 t1b 8 t1c 8 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-having.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-having.sql.out index b90ebf57e739b..09b6adbe62b36 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-having.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 12 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date 
'2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,13 +60,13 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t1a, t1b, t1h @@ -75,16 +75,16 @@ WHERE t1b IN (SELECT t2b FROM t2 GROUP BY t2b HAVING t2b < 10) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1a 6 2014-04-04 01:00:00 val1a 6 2014-04-04 01:02:00.001 val1b 8 2014-05-04 01:01:00 val1c 8 2014-05-04 01:02:00.001 --- !query 4 +-- !query SELECT t1a, t1b, t1c @@ -94,13 +94,13 @@ WHERE t1b IN (SELECT Min(t2b) WHERE t1a = t2a GROUP BY t2b HAVING t2b > 1) --- 
!query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1b 8 16 --- !query 5 +-- !query SELECT t1a, t1b, t1c FROM t1 WHERE t1b IN (SELECT t2b @@ -108,13 +108,13 @@ WHERE t1b IN (SELECT t2b WHERE t1c < t2c) GROUP BY t1a, t1b, t1c HAVING t1b < 10 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1a 6 8 --- !query 6 +-- !query SELECT t1a, t1b, t1c FROM t1 WHERE t1b IN (SELECT t2b @@ -122,14 +122,14 @@ WHERE t1b IN (SELECT t2b WHERE t1c = t2c) GROUP BY t1a, t1b, t1c HAVING COUNT (DISTINCT t1b) < 10 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output val1b 8 16 val1c 8 16 --- !query 7 +-- !query SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -140,13 +140,13 @@ WHERE t1c IN (SELECT t2c HAVING t2c > 10) GROUP BY t1b HAVING t1b >= 8 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 2 8 --- !query 8 +-- !query SELECT t1a, Max(t1b) FROM t1 @@ -158,13 +158,13 @@ HAVING t1a IN (SELECT t2a FROM t3 WHERE t2c = t3c) ) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output val1b 8 --- !query 9 +-- !query SELECT t1a, t1c, Min(t1d) @@ -175,16 +175,16 @@ WHERE t1a NOT IN (SELECT t2a HAVING t2a > 'val2a') GROUP BY t1a, t1c HAVING Min(t1d) > t1c --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output val1a 8 10 val1b 16 19 val1c 16 19 val1d 16 19 --- !query 10 +-- !query SELECT t1a, t1b FROM t1 @@ -195,13 +195,13 @@ WHERE t1d NOT IN (SELECT t2d HAVING t2c > 8) GROUP BY t1a, t1b HAVING t1b < 10 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output val1a 6 --- !query 11 +-- !query SELECT t1a, Max(t1b) FROM t1 @@ -210,8 +210,8 @@ GROUP BY t1a HAVING t1a NOT IN (SELECT t2a FROM t2 WHERE t2b > 3) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output val1a 16 val1d 10 diff --git 
a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-joins.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-joins.sql.out index ab6a11a2b7efa..615b67f629e55 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-joins.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-joins.sql.out @@ -1,8 +1,8 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 14 +-- Number of queries: 34 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 
20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,13 +60,43 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query +create temporary view s1 as select * from values + (1), (3), (5), (7), (9) + as s1(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view s2 as select * from values + (1), (3), (4), (6), (9) + as s2(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view s3 as select * from values + (3), (4), (6), (9) + as s3(id) +-- !query schema +struct<> +-- !query output + + + +-- !query SELECT t1a, t1b, t1c, t3a, t3b, t3c FROM t1 natural JOIN t3 WHERE t1a IN (SELECT t2a @@ -77,14 +107,14 @@ WHERE t1a IN (SELECT t2a ORDER BY t1a, t1b, t1c DESC nulls first --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1b 8 16 val1b 8 16 val1b 8 16 val1b 8 16 --- !query 4 +-- !query SELECT Count(DISTINCT(t1a)), t1b, t3a, @@ -102,10 +132,10 @@ GROUP BY t1a, t3a, t3b, t3c -ORDER BY t1a DESC, t3b DESC --- !query 4 schema +ORDER BY t1a DESC, t3b DESC, t3c ASC +-- !query schema struct --- !query 4 output +-- !query output 1 10 val3b 8 NULL 1 10 val1b 8 16 1 10 val3a 6 12 @@ -113,7 +143,7 @@ struct 1 8 val3a 6 12 --- !query 5 +-- !query SELECT Count(DISTINCT(t1a)) FROM t1 natural right JOIN t3 WHERE t1a IN @@ -129,13 +159,13 @@ AND t1d IN AND t1a = t3a GROUP BY t1a ORDER BY t1a --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 --- !query 6 +-- !query SELECT t1a, t1b, 
t1c, @@ -151,9 +181,9 @@ where t1a IN AND t1b != t3b AND t1a = 'val1b' ORDER BY t1a --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output val1b 8 16 val3a 6 12 val1b 8 16 val3a 6 12 val1b 8 16 val1b 10 12 @@ -162,7 +192,7 @@ val1b 8 16 val3c 17 16 val1b 8 16 val3c 17 16 --- !query 7 +-- !query SELECT Count(DISTINCT(t1a)), t1b FROM t1 RIGHT JOIN t3 @@ -181,13 +211,13 @@ GROUP BY t1a, t1b HAVING t1b > 8 ORDER BY t1a --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 10 --- !query 8 +-- !query SELECT Count(DISTINCT(t1a)) FROM t1 LEFT OUTER JOIN t3 @@ -199,15 +229,15 @@ WHERE t1a IN WHERE t1h < t2h ) GROUP BY t1a ORDER BY t1a --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 1 1 --- !query 9 +-- !query SELECT Count(DISTINCT(t1a)), t1b FROM t1 INNER JOIN t2 @@ -224,14 +254,14 @@ OR t1a IN WHERE t2h < t1h) GROUP BY t1b HAVING t1b > 6 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 10 1 8 --- !query 10 +-- !query SELECT Count(DISTINCT(t1a)), t1b FROM t1 @@ -249,13 +279,13 @@ AND t1h IN where t2b = t3b) GROUP BY t1b HAVING t1b > 8 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 10 --- !query 11 +-- !query SELECT Count(DISTINCT(t1a)), t1b FROM t1 @@ -280,13 +310,13 @@ AND t1b IN GROUP BY t1b HAVING t1b > 8 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 10 --- !query 12 +-- !query SELECT Count(DISTINCT(t1a)), t1b FROM t1 @@ -314,13 +344,13 @@ AND t1b IN AND t1a = t2a GROUP BY t1b ORDER BY t1b DESC --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 8 --- !query 13 +-- !query SELECT t1a, t1b, t1c, @@ -345,9 +375,222 @@ and t1a = t2a Group By t1a, t1b, t1c, t2a, t2b, t2c HAVING t2c IS NOT NULL ORDER By t2b DESC nulls last --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output val1b 8 16 1 10 12 val1b 8 
16 1 8 16 val1b 8 16 1 NULL 16 + + +-- !query +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id IN (SELECT 9) +-- !query schema +struct +-- !query output +9 + + +-- !query +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id NOT IN (SELECT 9) +-- !query schema +struct +-- !query output +1 +3 + + +-- !query +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +3 +9 + + +-- !query +SELECT s1.id AS id2 FROM s1 +LEFT SEMI JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +3 +9 + + +-- !query +SELECT s1.id as id2 FROM s1 +LEFT ANTI JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 +5 +7 + + +-- !query +SELECT s1.id, s2.id as id2 FROM s1 +LEFT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 NULL +3 3 +5 NULL +7 NULL +9 9 + + +-- !query +SELECT s1.id, s2.id as id2 FROM s1 +RIGHT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +3 3 +9 9 +NULL 1 +NULL 4 +NULL 6 + + +-- !query +SELECT s1.id, s2.id AS id2 FROM s1 +FULL OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 NULL +3 3 +5 NULL +7 NULL +9 9 +NULL 1 +NULL 4 +NULL 6 + + +-- !query +SELECT s1.id FROM s1 +JOIN s2 ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT s1.id AS id2 FROM s1 +LEFT SEMI JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT s1.id AS id2 FROM s1 +LEFT ANTI JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +3 +5 +7 +9 + + +-- !query +SELECT s1.id, s2.id AS id2 FROM s1 +LEFT OUTER JOIN s2 +ON s1.id = s2.id +AND 
s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 1 +3 NULL +5 NULL +7 NULL +9 NULL + + +-- !query +SELECT s1.id, s2.id AS id2 FROM s1 +RIGHT OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 1 +NULL 3 +NULL 4 +NULL 6 +NULL 9 + + +-- !query +SELECT s1.id, s2.id AS id2 FROM s1 +FULL OUTER JOIN s2 +ON s1.id = s2.id +AND s1.id NOT IN (SELECT id FROM s3) +-- !query schema +struct +-- !query output +1 1 +3 NULL +5 NULL +7 NULL +9 NULL +NULL 3 +NULL 4 +NULL 6 +NULL 9 + + +-- !query +DROP VIEW s1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW s2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW s3 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out index 71ca1f8649475..1c335445114c7 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-limit.sql.out @@ -2,85 +2,85 @@ -- Number of queries: 8 --- !query 0 +-- !query create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, 
timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query 
output --- !query 1 +-- !query create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 
26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, 
timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM t1 WHERE t1a IN (SELECT t2a FROM t2 WHERE t1d = t2d) LIMIT 2 --- !query 3 schema -struct --- !query 3 output +-- !query schema +struct +-- !query output val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 --- !query 4 +-- !query 
SELECT * FROM t1 WHERE t1c IN (SELECT t2c @@ -88,16 +88,16 @@ WHERE t1c IN (SELECT t2c WHERE t2b >= 8 LIMIT 2) LIMIT 4 --- !query 4 schema -struct --- !query 4 output +-- !query schema +struct +-- !query output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 --- !query 5 +-- !query SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -108,29 +108,29 @@ WHERE t1d IN (SELECT t2d GROUP BY t1b ORDER BY t1b DESC NULLS FIRST LIMIT 1 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 NULL --- !query 6 +-- !query SELECT * FROM t1 WHERE t1b NOT IN (SELECT t2b FROM t2 WHERE t2b > 6 LIMIT 2) --- !query 6 schema -struct --- !query 6 output +-- !query schema +struct +-- !query output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:02:00.001 2014-04-04 --- !query 7 +-- !query SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -141,7 +141,7 @@ WHERE t1d NOT IN (SELECT t2d GROUP BY t1b ORDER BY t1b NULLS last LIMIT 1 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 6 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-multiple-columns.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-multiple-columns.sql.out index 7a96c4bc5a30b..c6e13715bd9fa 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-multiple-columns.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-multiple-columns.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, 
float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,13 +60,13 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT 
t1a, t1b, t1h @@ -77,16 +77,16 @@ WHERE ( t1a, t1h ) NOT IN (SELECT t2a, WHERE t2a = t1a ORDER BY t2a) AND t1a = 'val1a' --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1a 16 2014-06-04 01:02:00.001 val1a 16 2014-07-04 01:01:00 val1a 6 2014-04-04 01:00:00 val1a 6 2014-04-04 01:02:00.001 --- !query 4 +-- !query SELECT t1a, t1b, t1d @@ -97,14 +97,14 @@ WHERE ( t1b, t1d ) IN (SELECT t2b, WHERE t2i IN (SELECT t3i FROM t3 WHERE t2b > t3b)) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1e 10 19 val1e 10 19 --- !query 5 +-- !query SELECT t1a, t1b, t1d @@ -116,16 +116,16 @@ WHERE ( t1b, t1d ) NOT IN (SELECT t2b, FROM t3 WHERE t2b > t3b)) AND t1a = 'val1a' --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1a 16 10 val1a 16 21 val1a 6 10 val1a 6 10 --- !query 6 +-- !query SELECT t2a FROM (SELECT t2a FROM t2 @@ -144,13 +144,13 @@ FROM (SELECT t2a WHERE ( t2a, t2b ) IN (SELECT t3a, t3b FROM t3)) AS t4 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output val1b --- !query 7 +-- !query WITH cte1 AS ( SELECT t1a, @@ -169,9 +169,9 @@ FROM ( FROM cte1 JOIN cte1 cte2 on cte1.t1b = cte2.t1b) s --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output val1b 8 val1b 8 val1b 8 val1c 8 val1c 8 val1b 8 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-order-by.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-order-by.sql.out index 4bebd9622c3c5..96b418c54bf5b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-order-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-order-by.sql.out @@ -2,79 +2,79 @@ -- Number of queries: 18 --- !query 0 +-- !query create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date 
'2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', 
date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 
01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 
01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date 
'2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM t1 WHERE t1a IN (SELECT t2a FROM t2) ORDER BY t1a --- !query 3 schema -struct --- !query 3 output +-- !query schema +struct +-- !query output val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 val1e 10 NULL 25 17.0 25.0 2600 2014-08-04 01:01:00 2014-08-04 @@ -82,20 +82,20 @@ val1e 10 NULL 19 17.0 25.0 2600 2014-09-04 01:02:00.001 2014-09-04 val1e 10 NULL 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 --- !query 4 +-- !query SELECT t1a FROM t1 WHERE t1b IN (SELECT t2b FROM t2 WHERE t1a = t2a) ORDER BY t1b DESC --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1b --- !query 5 +-- !query SELECT t1a, t1b FROM t1 @@ -103,40 +103,40 @@ WHERE t1c IN (SELECT t2c FROM t2 WHERE t1a = t2a) ORDER BY 2 DESC nulls last --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1b 8 val1c 8 --- !query 6 +-- !query SELECT Count(DISTINCT( t1a )) FROM t1 WHERE t1b IN (SELECT t2b FROM t2 WHERE t1a = t2a) ORDER BY Count(DISTINCT( t1a )) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 --- !query 7 +-- !query SELECT * FROM t1 WHERE t1b IN (SELECT t2c FROM t2 ORDER BY t2d) --- !query 7 schema -struct --- !query 7 output +-- !query schema +struct +-- !query output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 --- !query 8 +-- !query SELECT * FROM t1 WHERE t1b IN (SELECT Min(t2b) @@ -144,9 +144,9 @@ WHERE t1b IN (SELECT Min(t2b) WHERE t1b = t2b ORDER BY Min(t2b)) ORDER BY t1c DESC nulls first --- !query 8 schema -struct --- !query 8 output +-- 
!query schema +struct +-- !query output val1e 10 NULL 25 17.0 25.0 2600 2014-08-04 01:01:00 2014-08-04 val1e 10 NULL 19 17.0 25.0 2600 2014-09-04 01:02:00.001 2014-09-04 val1d 10 NULL 12 17.0 25.0 2600 2015-05-04 01:01:00 2015-05-04 @@ -157,7 +157,7 @@ val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:02:00.001 2014-04-04 --- !query 9 +-- !query SELECT t1a, t1b, t1h @@ -170,22 +170,22 @@ WHERE t1c IN (SELECT t2c FROM t2 WHERE t1h > t2h) ORDER BY t1h DESC nulls last --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output val1c 8 2014-05-04 01:02:00.001 val1b 8 2014-05-04 01:01:00 --- !query 10 +-- !query SELECT * FROM t1 WHERE t1a NOT IN (SELECT t2a FROM t2) ORDER BY t1a --- !query 10 schema -struct --- !query 10 output +-- !query schema +struct +-- !query output val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 @@ -195,7 +195,7 @@ val1d NULL 16 19 17.0 25.0 2600 2014-07-04 01:02:00.001 NULL val1d 10 NULL 12 17.0 25.0 2600 2015-05-04 01:01:00 2015-05-04 --- !query 11 +-- !query SELECT t1a, t1b FROM t1 @@ -203,9 +203,9 @@ WHERE t1a NOT IN (SELECT t2a FROM t2 WHERE t1a = t2a) ORDER BY t1b DESC nulls last --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output val1a 16 val1a 16 val1d 10 @@ -215,7 +215,7 @@ val1d NULL val1d NULL --- !query 12 +-- !query SELECT * FROM t1 WHERE t1a NOT IN (SELECT t2a @@ -225,32 +225,32 @@ WHERE t1a NOT IN (SELECT t2a FROM t2 ORDER BY t2b DESC nulls last) ORDER BY t1c DESC nulls last --- !query 12 schema -struct --- !query 12 output +-- !query schema +struct +-- !query output val1d NULL 16 22 17.0 25.0 2600 2014-06-04 01:01:00 NULL val1d NULL 16 19 17.0 25.0 2600 2014-07-04 01:02:00.001 NULL val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1a 16 12 10 15.0 20.0 2000 2014-07-04 
01:01:00 2014-07-04 --- !query 13 +-- !query SELECT * FROM t1 WHERE t1b IN (SELECT Min(t2b) FROM t2 GROUP BY t2a ORDER BY t2a DESC) --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:00:00 2014-04-04 val1a 6 8 10 15.0 20.0 2000 2014-04-04 01:02:00.001 2014-04-04 val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 val1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 --- !query 14 +-- !query SELECT t1a, Count(DISTINCT( t1b )) FROM t1 @@ -262,22 +262,22 @@ WHERE t1b IN (SELECT Min(t2b) GROUP BY t1a, t1h ORDER BY t1a --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output val1b 1 --- !query 15 +-- !query SELECT * FROM t1 WHERE t1b NOT IN (SELECT Min(t2b) FROM t2 GROUP BY t2a ORDER BY t2a) --- !query 15 schema -struct --- !query 15 output +-- !query schema +struct +-- !query output val1a 16 12 10 15.0 20.0 2000 2014-07-04 01:01:00 2014-07-04 val1a 16 12 21 15.0 20.0 2000 2014-06-04 01:02:00.001 2014-06-04 val1d 10 NULL 12 17.0 25.0 2600 2015-05-04 01:01:00 2015-05-04 @@ -286,7 +286,7 @@ val1e 10 NULL 19 17.0 25.0 2600 2014-09-04 01:02:00.001 2014-09-04 val1e 10 NULL 25 17.0 25.0 2600 2014-08-04 01:01:00 2014-08-04 --- !query 16 +-- !query SELECT t1a, Sum(DISTINCT( t1b )) FROM t1 @@ -296,16 +296,16 @@ WHERE t1b NOT IN (SELECT Min(t2b) GROUP BY t2c ORDER BY t2c DESC nulls last) GROUP BY t1a --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output val1a 22 val1c 8 val1d 10 val1e 10 --- !query 17 +-- !query SELECT Count(DISTINCT( t1a )), t1b FROM t1 @@ -317,9 +317,9 @@ WHERE t1h NOT IN (SELECT t2h GROUP BY t1a, t1b ORDER BY t1b DESC nulls last --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 16 1 10 1 10 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out 
b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out index e06f9206d3401..783f4031a452b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-set-operations.sql.out @@ -2,71 +2,71 @@ -- Number of queries: 16 --- !query 0 +-- !query create temporary view t1 as select * from values - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("val1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("val1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp 
'2014-06-04 01:02:00.001', date '2014-06-04'), + ("val1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("val1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("val1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("val1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("val1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values - ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("val1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("val1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("val2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, 
timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("val1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("val1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("val2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("val1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("val1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("val2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("val1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("val1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("val1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- 
!query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values - ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("val1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("val3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ("val3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("val1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("val3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("val3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("val1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 
01:02:00.000', date '2014-07-04'), + ("val3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("val3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("val1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("val1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("val3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("val3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t2a, t2b, t2c, @@ -84,16 +84,16 @@ FROM (SELECT * WHERE t2i IS NOT NULL AND 2 * t2b = t2c ORDER BY t2c DESC nulls first --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1b 8 16 2015-05-04 01:01:00 2015-05-04 val1b 8 16 2014-07-04 01:01:00 2014-07-04 val1b 8 16 2014-06-04 01:02:00 2014-06-04 val1b 8 16 2014-07-04 01:02:00 2014-07-04 --- !query 4 +-- !query SELECT t2a, t2b, t2d, @@ -115,15 +115,15 @@ GROUP BY t2a, t2d, t2i ORDER BY t2d DESC --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1b 8 119 1 2015-05-04 val1b 8 19 1 2014-07-04 val1b 8 19 1 2014-05-04 --- !query 5 +-- !query SELECT t2a, t2b, t2c, @@ -163,9 +163,9 @@ WHERE t1a IN (SELECT t3a FROM t3 WHERE t3d = t1d) GROUP BY t1a, t1b, t1c --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1b 10 12 19 val1b 8 16 119 val1b 8 16 19 @@ -174,7 +174,7 @@ val1b NULL 16 319 val1c 12 16 219 --- !query 6 +-- !query SELECT DISTINCT( t2a ), t2b, Count(t2c), @@ -209,16 +209,16 @@ GROUP BY t2a, t2h, t2i HAVING t2b IS NOT NULL --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 
val1b 8 1 119 2015-05-04 01:01:00 2015-05-04 val1b 8 1 19 2014-07-04 01:01:00 2014-07-04 val1c 12 1 19 2014-08-04 01:01:00 2014-08-05 val1c 12 1 219 2016-05-04 01:01:00 2016-05-04 --- !query 7 +-- !query SELECT t2a, t2b, Count(t2c), @@ -265,9 +265,9 @@ FROM t2 WHERE t2d IN (SELECT min(t1d) FROM t1 WHERE t2c = t1c) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output val1b 8 1 119 2015-05-04 01:01:00 2015-05-04 val1b 8 1 19 2014-07-04 01:01:00 2014-07-04 val1b 8 16 19 2014-07-04 01:01:00 2014-07-04 @@ -275,7 +275,7 @@ val1b NULL 16 19 2014-05-04 01:01:00 NULL val1c 12 16 19 2014-08-04 01:01:00 2014-08-05 --- !query 8 +-- !query SELECT t2a, t2b, t2c, @@ -312,16 +312,16 @@ FROM t2 WHERE t2c IN (SELECT Max(t1c) FROM t1 WHERE t1d = t2d) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output val1b 8 16 119 val1b 8 16 19 val1b NULL 16 19 val1c 12 16 19 --- !query 9 +-- !query SELECT DISTINCT(t1a), t1b, t1c, @@ -354,9 +354,9 @@ WHERE t1a IN (SELECT t3a GROUP BY t1a, t1b, t1c, t1d HAVING t1c IS NOT NULL AND t1b IS NOT NULL ORDER BY t1c DESC, t1a DESC --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output val1c 8 16 19 val1b 8 16 19 val1a 16 12 21 @@ -364,7 +364,7 @@ val1a 16 12 10 val1a 6 8 10 --- !query 10 +-- !query SELECT t1a, t1b, t1c @@ -378,9 +378,9 @@ WHERE t1b IN (SELECT t2b FROM t1 WHERE t1b > 6) AS t3 WHERE t2b = t1b) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output val1b 8 16 val1c 8 16 val1d 10 NULL @@ -389,7 +389,7 @@ val1e 10 NULL val1e 10 NULL --- !query 11 +-- !query SELECT t1a, t1b, t1c @@ -401,9 +401,9 @@ WHERE t1h IN (SELECT t2h SELECT t3h FROM t3) AS t3) ORDER BY t1b DESC NULLs first, t1c DESC NULLs last --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output val1d NULL 16 val1a 16 12 val1e 10 NULL @@ -412,7 +412,7 @@ val1e 10 NULL val1b 8 16 --- !query 12 +-- !query SELECT t1a, t1b, t1c @@ -446,16 +446,16 
@@ WHERE t1b IN WHERE t1b > 6) AS t4 WHERE t2b = t1b) ORDER BY t1c DESC NULLS last, t1a DESC --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output val1c 8 16 val1b 8 16 val1e 10 NULL val1d 10 NULL --- !query 13 +-- !query SELECT * FROM (SELECT * FROM (SELECT * @@ -497,13 +497,13 @@ FROM (SELECT * WHERE t4.t2b IN (SELECT Min(t3b) FROM t3 WHERE t4.t2a = t3a)) --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output val1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 --- !query 14 +-- !query SELECT t2a, t2b, t2c, @@ -530,14 +530,14 @@ WHERE t3.t2a NOT IN (SELECT t1a FROM t2) AND t2c IS NOT NULL ORDER BY t2a --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output val2a 6 12 2014-04-04 val2a 6 12 2014-04-04 --- !query 15 +-- !query SELECT Count(DISTINCT(t1a)), t1b, t1c, @@ -581,9 +581,9 @@ HAVING t1b NOT IN SELECT t3b FROM t3) ORDER BY t1c DESC NULLS LAST, t1i --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 8 16 2014-05-04 1 8 16 2014-05-05 1 16 12 2014-06-04 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-with-cte.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-with-cte.sql.out index 7d3943e3764c5..b9cc68a339746 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-with-cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/in-with-cte.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 13 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 
01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,13 +60,13 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query WITH cte1 AS (SELECT t1a, t1b @@ -81,16 +81,16 @@ FROM t1 WHERE t1b IN (SELECT cte1.t1b FROM cte1 WHERE cte1.t1b > 0) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1a 16 12 10 2014-07-04 01:01:00 val1a 16 12 21 2014-06-04 01:02:00.001 val1a 6 8 10 2014-04-04 01:00:00 val1a 6 8 10 
2014-04-04 01:02:00.001 --- !query 4 +-- !query WITH cte1 AS ( SELECT t1a, @@ -118,16 +118,16 @@ WHERE t1b IN FROM cte1 ) GROUP BY t1a, t1b, t1c HAVING t1c IS NOT NULL --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 16 12 1 6 8 1 8 16 1 8 16 --- !query 5 +-- !query WITH cte1 AS ( SELECT t1a, @@ -155,16 +155,16 @@ WHERE t1c IN ON cte1.t1b < cte5.t1b LEFT OUTER JOIN cte1 cte6 ON cte1.t1d > cte6.t1d) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1b 8 16 2014-05-04 01:01:00 val1c 8 16 2014-05-04 01:02:00.001 val1d NULL 16 2014-06-04 01:01:00 val1d NULL 16 2014-07-04 01:02:00.001 --- !query 6 +-- !query WITH cte1 AS (SELECT t1a, t1b @@ -186,13 +186,13 @@ FROM (SELECT * ON cte1.t1a = cte3.t1a INNER JOIN cte1 cte4 ON cte1.t1b = cte4.t1b) s --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output val1b 8 val1b 8 val1b 8 val1b 8 --- !query 7 +-- !query WITH cte1 AS ( SELECT t1a, @@ -217,13 +217,13 @@ WHERE t1b IN SELECT t1b FROM t1) GROUP BY t1b --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 2 8 --- !query 8 +-- !query WITH cte1 AS ( SELECT t1a, @@ -244,13 +244,13 @@ FROM ( RIGHT OUTER JOIN cte1 cte3 ON cte1.t1b = cte3.t1b LEFT OUTER JOIN cte1 cte4 ON cte1.t1c = cte4.t1c ) s --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output val1b 8 16 val1b 8 16 val1b 8 16 val1b 8 16 --- !query 9 +-- !query WITH cte1 AS (SELECT t1a, t1b @@ -266,13 +266,13 @@ FROM (SELECT cte1.t1a, RIGHT OUTER JOIN cte1 cte2 ON cte1.t1a = cte2.t1a) s GROUP BY s.t1b --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 2 8 --- !query 10 +-- !query WITH cte1 AS ( SELECT t1a, @@ -295,13 +295,13 @@ WHERE s.t1b IN FROM t1 INNER JOIN cte1 ON t1.t1a = cte1.t1a) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 8 --- !query 11 +-- !query WITH cte1 AS (SELECT t1a, t1b @@ -316,9 +316,9 
@@ WHERE t1b NOT IN (SELECT cte1.t1b FROM cte1 WHERE cte1.t1b < 0) AND t1c > 10 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output val1a 16 12 2014-06-04 01:02:00.001 val1a 16 12 2014-07-04 01:01:00 val1b 8 16 2014-05-04 01:01:00 @@ -327,7 +327,7 @@ val1d NULL 16 2014-06-04 01:01:00 val1d NULL 16 2014-07-04 01:02:00.001 --- !query 12 +-- !query WITH cte1 AS ( SELECT t1a, @@ -357,8 +357,8 @@ WHERE t1b NOT IN JOIN cte1 cte4 ON cte1.t1c = cte4.t1c) AND t1c IS NOT NULL ORDER BY t1c DESC --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output val1b 8 16 19 2014-05-04 01:01:00 val1c 8 16 19 2014-05-04 01:02:00.001 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-group-by.sql.out index 6b86a9f6a0d00..720db9e8bdb15 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-group-by.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 
01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,27 +60,27 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t1a, Avg(t1b) FROM t1 WHERE t1a NOT IN (SELECT t2a FROM t2) GROUP BY t1a --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1a 11.0 val1d 10.0 --- !query 4 +-- !query SELECT t1a, Sum(DISTINCT( t1b )) FROM t1 @@ -88,15 +88,15 @@ WHERE t1d NOT IN (SELECT t2d FROM t2 WHERE t1h < t2h) GROUP BY t1a --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1a 22 val1d 10 val1e 10 --- !query 5 +-- !query SELECT Count(*) FROM (SELECT * FROM t2 @@ -107,13 +107,13 @@ WHERE t2b NOT IN (SELECT Min(t2b) FROM t2 WHERE t2b = t2b GROUP BY t2c) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 4 --- !query 6 +-- !query SELECT t1a, max(t1b) FROM t1 @@ 
-122,16 +122,16 @@ WHERE t1c NOT IN (SELECT Max(t2b) WHERE t1a = t2a GROUP BY t2a) GROUP BY t1a --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output val1a 16 val1b 8 val1c 8 val1d 10 --- !query 7 +-- !query SELECT t1a, t1b FROM t1 @@ -141,9 +141,9 @@ WHERE t1c IN (SELECT t2b FROM t3 WHERE t3a = t2a GROUP BY t3b) order by t2a) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output val1a 16 val1a 16 val1a 6 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-joins.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-joins.sql.out index bae5d00cc8632..4872e3c953ff6 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-joins.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-joins.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 9 --- !query 0 +-- !query create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), ("val1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ("val1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ("val1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("val2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ("val1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ("val1f", 19S, null, 19L, 
float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ("val1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ("val3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ("val3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,13 +60,13 @@ create temporary view t3 as select * from values ("val3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ("val3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t1a, t1b, t1c, @@ -78,9 +78,9 @@ FROM t1 WHERE t1a NOT IN (SELECT t2a FROM t2) AND t1b = t3b --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output val1a 6 8 val3a 6 12 val1a 6 8 val3a 6 12 val1a 6 8 val3a 6 12 @@ -89,7 +89,7 @@ val1d 10 NULL val1b 10 12 val1d 10 NULL val1b 10 12 --- !query 4 +-- !query SELECT t1a, t1b, t1c, @@ -113,15 +113,15 @@ AND t1d = t2d GROUP BY t1a, t1b, t1c, t3a, t3b, t3c HAVING count(distinct(t3a)) >= 1 ORDER BY t1a, t3b --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1c 8 16 1 6 12 val1c 8 16 1 10 12 val1c 8 16 1 17 16 --- !query 5 +-- !query SELECT t1a, t1b, t1c, @@ -141,9 +141,9 @@ AND t1d NOT IN FROM t2 RIGHT JOIN t1 on t2e = t1e WHERE t1a = t2a) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output val1a 16 12 10 2014-07-04 01:01:00 val1a 16 12 21 2014-06-04 01:02:00.001 val1a 6 8 10 2014-04-04 01:00:00 @@ -153,7 +153,7 @@ val1d NULL 
16 22 2014-06-04 01:01:00 val1e 10 NULL 25 2014-08-04 01:01:00 --- !query 6 +-- !query SELECT Count(DISTINCT( t1a )), t1b, t1c, @@ -169,10 +169,10 @@ GROUP BY t1b, HAVING t1d NOT IN (SELECT t2d FROM t2 WHERE t1d = t2d) -ORDER BY t1b DESC --- !query 6 schema +ORDER BY t1b DESC, t1d ASC +-- !query schema struct --- !query 6 output +-- !query output 1 16 12 10 1 16 12 21 1 10 NULL 12 @@ -180,7 +180,7 @@ struct 1 NULL 16 22 --- !query 7 +-- !query SELECT COUNT(DISTINCT(t1a)), t1b, t1c, @@ -195,13 +195,13 @@ GROUP BY t1b, t1c, t1d HAVING t1b < sum(t1c) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 6 8 10 --- !query 8 +-- !query SELECT COUNT(DISTINCT(t1a)), t1b, t1c, @@ -223,7 +223,7 @@ GROUP BY t1b, t1c, t1d HAVING t1b < sum(t1c) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 6 8 10 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out index f02f760727976..bc9e6f842557e 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column-literal.sql.out @@ -2,47 +2,47 @@ -- Number of queries: 4 --- !query 0 +-- !query CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES (null, null), (null, 1.0), (2, 3.0), (4, 5.0) AS m(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT * FROM m WHERE b = 1.0 -- Matches (null, 1.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 1 schema +-- !query schema struct --- !query 1 output -NULL 1 +-- !query output +NULL 1.0 --- !query 2 +-- !query SELECT * FROM m WHERE b = 3.0 -- Matches (2, 3.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 2 schema +-- !query schema 
struct --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM m WHERE b = 5.0 -- Matches (4, 5.0) AND (a, b) NOT IN ((2, 3.0)) --- !query 3 schema +-- !query schema struct --- !query 3 output -4 5 +-- !query output +4 5.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out index a27a66e3f27f5..54d6da8d0da83 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-multi-column.sql.out @@ -2,119 +2,119 @@ -- Number of queries: 9 --- !query 0 +-- !query CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES (null, null), (null, 1.0), (2, 3.0), (4, 5.0) AS m(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES (null, null), (0, 1.0), (2, 3.0), (4, null) AS s(c, d) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM m WHERE (a, b) NOT IN (SELECT * FROM s WHERE d > 5.0) -- Matches no rows --- !query 2 schema +-- !query schema struct --- !query 2 output -2 3 -4 5 -NULL 1 +-- !query output +2 3.0 +4 5.0 +NULL 1.0 NULL NULL --- !query 3 +-- !query SELECT * FROM m WHERE (a, b) NOT IN (SELECT * FROM s WHERE c IS NULL AND d IS NULL) -- Matches only (null, null) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT * FROM m WHERE a IS NULL AND b IS NULL -- Matches only (null, null) AND (a, b) NOT IN (SELECT * FROM s WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT * FROM m WHERE b 
= 1.0 -- Matches (null, 1.0) AND (a, b) NOT IN (SELECT * FROM s WHERE c IS NOT NULL) -- Matches (0, 1.0), (2, 3.0), (4, null) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT * FROM m WHERE b = 1.0 -- Matches (null, 1.0) AND (a, b) NOT IN (SELECT * FROM s WHERE c = 2) -- Matches (2, 3.0) --- !query 6 schema +-- !query schema struct --- !query 6 output -NULL 1 +-- !query output +NULL 1.0 --- !query 7 +-- !query SELECT * FROM m WHERE b = 3.0 -- Matches (2, 3.0) AND (a, b) NOT IN (SELECT * FROM s WHERE c = 2) -- Matches (2, 3.0) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT * FROM m WHERE b = 5.0 -- Matches (4, 5.0) AND (a, b) NOT IN (SELECT * FROM s WHERE c = 2) -- Matches (2, 3.0) --- !query 8 schema +-- !query schema struct --- !query 8 output -4 5 +-- !query output +4 5.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out index cf8f03eaa9311..0fc9cf289155d 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column-literal.sql.out @@ -2,56 +2,56 @@ -- Number of queries: 5 --- !query 0 +-- !query CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) AS m(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT * FROM m WHERE a NOT IN (null) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM m WHERE b = 1.0 -- Only matches (null, 1.0) AND a NOT IN (2) --- !query 2 schema +-- !query schema struct --- !query 2 output 
+-- !query output --- !query 3 +-- !query SELECT * FROM m WHERE b = 3.0 -- Only matches (2, 3.0) AND a NOT IN (2) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT * FROM m WHERE b = 3.0 -- Only matches (2, 3.0) AND a NOT IN (6) --- !query 4 schema +-- !query schema struct --- !query 4 output -2 3 +-- !query output +2 3.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out index d07981cfd11e5..ef40fd462f883 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/not-in-unit-tests-single-column.sql.out @@ -2,130 +2,130 @@ -- Number of queries: 10 --- !query 0 +-- !query CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES (null, 1.0), (2, 3.0), (4, 5.0) AS m(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW s AS SELECT * FROM VALUES (null, 1.0), (2, 3.0), (6, 7.0) AS s(c, d) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM m WHERE a NOT IN (SELECT c FROM s WHERE d > 10.0) -- (empty subquery) --- !query 2 schema +-- !query schema struct --- !query 2 output -2 3 -4 5 -NULL 1 +-- !query output +2 3.0 +4 5.0 +NULL 1.0 --- !query 3 +-- !query SELECT * FROM m WHERE a NOT IN (SELECT c FROM s WHERE d = 1.0) -- Only matches (null, 1.0) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT * FROM m WHERE b = 1.0 -- Only matches (null, 1.0) AND a NOT IN (SELECT c FROM s WHERE d = 3.0) -- Matches (2, 3.0) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- 
!query 5 +-- !query SELECT * FROM m WHERE b = 3.0 -- Only matches (2, 3.0) AND a NOT IN (SELECT c FROM s WHERE d = 3.0) -- Matches (2, 3.0) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT * FROM m WHERE b = 3.0 -- Only matches (2, 3.0) AND a NOT IN (SELECT c FROM s WHERE d = 7.0) -- Matches (6, 7.0) --- !query 6 schema +-- !query schema struct --- !query 6 output -2 3 +-- !query output +2 3.0 --- !query 7 +-- !query SELECT * FROM m WHERE a NOT IN (SELECT c FROM s WHERE d = b + 10) -- Matches no row --- !query 7 schema +-- !query schema struct --- !query 7 output -2 3 -4 5 -NULL 1 +-- !query output +2 3.0 +4 5.0 +NULL 1.0 --- !query 8 +-- !query SELECT * FROM m WHERE b = 1.0 -- Only matches (null, 1.0) AND a NOT IN (SELECT c FROM s WHERE d = b + 10) -- Matches no row --- !query 8 schema +-- !query schema struct --- !query 8 output -NULL 1 +-- !query output +NULL 1.0 --- !query 9 +-- !query SELECT * FROM m WHERE b = 3.0 -- Only matches (2, 3.0) AND a NOT IN (SELECT c FROM s WHERE d = b + 10) -- Matches no row --- !query 9 schema +-- !query schema struct --- !query 9 output -2 3 +-- !query output +2 3.0 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/simple-in.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/simple-in.sql.out index d69b4bcf185c3..0661e1c9e4d96 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/simple-in.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/in-subquery/simple-in.sql.out @@ -2,78 +2,78 @@ -- Number of queries: 14 --- !query 0 +-- !query create temporary view t1 as select * from values - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), - ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 
01:02:00.001', date '2014-06-04'), - ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ("t1d", null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ("t1d", null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:00:00.000', date '2014-04-04'), + ("t1b", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1a", 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ("t1a", 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ("t1d", null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ("t1d", null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.001', null), + ("t1e", 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ("t1d", 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date 
'2015-05-04'), + ("t1a", 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ("t1e", 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values - ("t2a", 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ("t1c", 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ("t1b", null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ("t2e", 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1f", 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ("t1c", 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ("t1e", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ("t1f", 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ("t2a", 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 119L, 
float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ("t1c", 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ("t1b", null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ("t2e", 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1f", 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ("t1c", 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ("t1e", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ("t1f", 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values - ("t3a", 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ("t3a", 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 219L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t1b", 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ("t1b", 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ("t3c", 17S, 16, 519L, float(17), 25D, 26E2, timestamp 
'2014-08-04 01:02:00.000', date '2014-08-04'), - ("t3c", 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ("t1b", null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ("t1b", null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ("t3b", 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ("t3b", 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ("t3a", 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ("t3a", 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t1b", 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ("t1b", 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ("t3c", 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ("t3c", 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ("t1b", null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ("t1b", null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 01:02:00.000', null), + ("t3b", 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ("t3b", 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM t1 WHERE t1a IN (SELECT t2a FROM t2) --- 
!query 3 schema -struct --- !query 3 output +-- !query schema +struct +-- !query output t1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 t1c 8 16 19 17.0 25.0 2600 2014-05-04 01:02:00.001 2014-05-05 t1e 10 NULL 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 @@ -81,35 +81,35 @@ t1e 10 NULL 19 17.0 25.0 2600 2014-09-04 01:02:00.001 2014-09-04 t1e 10 NULL 25 17.0 25.0 2600 2014-08-04 01:01:00 2014-08-04 --- !query 4 +-- !query SELECT * FROM t1 WHERE t1b IN (SELECT t2b FROM t2 WHERE t1a = t2a) --- !query 4 schema -struct --- !query 4 output +-- !query schema +struct +-- !query output t1b 8 16 19 17.0 25.0 2600 2014-05-04 01:01:00 2014-05-04 --- !query 5 +-- !query SELECT t1a, t1b FROM t1 WHERE t1c IN (SELECT t2b FROM t2 WHERE t1a != t2a) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output t1a 16 t1a 16 t1a 6 t1a 6 --- !query 6 +-- !query SELECT t1a, t1b FROM t1 @@ -117,14 +117,14 @@ WHERE t1c IN (SELECT t2b FROM t2 WHERE t1a = t2a OR t1b > t2b) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output t1a 16 t1a 16 --- !query 7 +-- !query SELECT t1a, t1b FROM t1 @@ -133,14 +133,14 @@ WHERE t1c IN (SELECT t2b WHERE t2i IN (SELECT t3i FROM t3 WHERE t2c = t3c)) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output t1a 6 t1a 6 --- !query 8 +-- !query SELECT t1a, t1b FROM t1 @@ -150,23 +150,23 @@ WHERE t1c IN (SELECT t2b FROM t3 WHERE t2c = t3c AND t2b IS NOT NULL)) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output t1a 6 t1a 6 --- !query 9 +-- !query SELECT DISTINCT( t1a ), t1b, t1h FROM t1 WHERE t1a NOT IN (SELECT t2a FROM t2) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output t1a 16 2014-06-04 01:02:00.001 t1a 16 2014-07-04 01:01:00 t1a 6 2014-04-04 01:00:00 @@ -176,49 +176,49 @@ t1d NULL 2014-06-04 01:01:00 t1d NULL 2014-07-04 01:02:00.001 --- !query 10 +-- !query create temporary view a as select * from values 
(1, 1), (2, 1), (null, 1), (1, 3), (null, 3), (1, null), (null, 2) as a(a1, a2) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query create temporary view b as select * from values (1, 1, 2), (null, 3, 2), (1, null, 2), (1, 2, null) as b(b1, b2, b3) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query SELECT a1, a2 FROM a WHERE a1 NOT IN (SELECT b.b1 FROM b WHERE a.a2 = b.b2) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 NULL 2 1 --- !query 13 +-- !query SELECT a1, a2 FROM a WHERE a1 NOT IN (SELECT b.b1 FROM b WHERE a.a2 = b.b2 AND b.b3 > 1) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 NULL 2 1 NULL 2 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index 7b47a6139f60a..ec7ecf28754ef 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -2,37 +2,37 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1, 2, 3) AS t1(t1a, t1b, t1c) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1, 0, 1) AS t2(t2a, t2b, t2c) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (3, 1, 2) AS t3(t3a, t3b, t3c) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT t1a, t2b FROM t1, t2 WHERE t1b = t2c @@ -42,14 +42,14 @@ AND t2b = (SELECT max(avg) 
WHERE t2a = t1.t1b ) ) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and 't2.`t2b`' is not an aggregate function. Wrap '(avg(CAST(t2.`t2b` AS BIGINT)) AS `avg`)' in windowing function(s) or wrap 't2.`t2b`' in first() (or first_value) if you don't care which value you get.; --- !query 4 +-- !query SELECT * FROM t1 WHERE t1a IN (SELECT min(t2a) @@ -59,14 +59,14 @@ WHERE t1a IN (SELECT min(t2a) FROM t3 GROUP BY t3b HAVING t3b > t2b )) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException Resolved attribute(s) t2b#x missing from min(t2a)#x,t2c#x in operator !Filter t2c#x IN (list#x [t2b#x]).; --- !query 5 +-- !query SELECT t1a FROM t1 GROUP BY 1 @@ -74,14 +74,14 @@ HAVING EXISTS (SELECT t2a FROM t2 GROUP BY 1 HAVING t2a < min(t1a + t2a)) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Found an aggregate expression in a correlated predicate that has both outer and local references, which is not supported yet. Aggregate expression: min((t1.`t1a` + t2.`t2a`)), Outer references: t1.`t1a`, Local references: t2.`t2a`.; --- !query 6 +-- !query SELECT t1a FROM t1 WHERE t1a IN (SELECT t2a @@ -90,28 +90,28 @@ WHERE t1a IN (SELECT t2a FROM t3 GROUP BY 1 HAVING min(t2a + t3a) > 1)) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException Found an aggregate expression in a correlated predicate that has both outer and local references, which is not supported yet. 
Aggregate expression: min((t2.`t2a` + t3.`t3a`)), Outer references: t2.`t2a`, Local references: t3.`t3a`.; --- !query 7 +-- !query SELECT t1a FROM t1 WHERE t1a IN (SELECT t2a FROM t2 WHERE EXISTS (SELECT min(t2a) FROM t3)) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses: Aggregate [min(outer(t2a#x)) AS min(outer())#x] -+- SubqueryAlias `t3` ++- SubqueryAlias t3 +- Project [t3a#x, t3b#x, t3c#x] - +- SubqueryAlias `t3` + +- SubqueryAlias t3 +- LocalRelation [t3a#x, t3b#x, t3c#x] ; diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out index dcd30055bca19..776598127075b 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/subq-input-typecheck.sql.out @@ -2,57 +2,57 @@ -- Number of queries: 10 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1, 2, 3) AS t1(t1a, t1b, t1c) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1, 0, 1) AS t2(t2a, t2b, t2c) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (3, 1, 2) AS t3(t3a, t3b, t3c) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES (CAST(1 AS DOUBLE), CAST(2 AS STRING), CAST(3 AS STRING)) AS t1(t4a, t4b, t4c) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE 
TEMPORARY VIEW t5 AS SELECT * FROM VALUES - (CAST(1 AS DECIMAL(18, 0)), CAST(2 AS STRING), CAST(3 AS BIGINT)) + (CAST('2011-01-01 01:01:01' AS TIMESTAMP), CAST(2 AS STRING), CAST(3 AS BIGINT)) AS t1(t5a, t5b, t5c) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT ( SELECT max(t2b), min(t2b) FROM t2 @@ -60,14 +60,14 @@ SELECT GROUP BY t2.t2b ) FROM t1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Scalar subquery must return only one column, but got 2; --- !query 6 +-- !query SELECT ( SELECT max(t2b), min(t2b) FROM t2 @@ -75,22 +75,22 @@ SELECT GROUP BY t2.t2b ) FROM t1 --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException Scalar subquery must return only one column, but got 2; --- !query 7 +-- !query SELECT * FROM t1 WHERE t1a IN (SELECT t2a, t2b FROM t2 WHERE t1a = t2a) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(t1.`t1a` IN (listquery(t1.`t1a`)))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the @@ -103,15 +103,15 @@ Right side columns: [t2.`t2a`, t2.`t2b`].; --- !query 8 +-- !query SELECT * FROM T1 WHERE (t1a, t1b) IN (SELECT t2a FROM t2 WHERE t1a = t2a) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(named_struct('t1a', t1.`t1a`, 't1b', t1.`t1b`) IN (listquery(t1.`t1a`)))' due to data type mismatch: The number of columns in the left hand side of an IN subquery does not match the @@ -124,23 +124,23 @@ Right side columns: [t2.`t2a`].; --- !query 9 +-- !query SELECT * FROM t4 WHERE (t4a, t4b, t4c) IN (SELECT t5a, t5b, t5c FROM t5) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query 
output org.apache.spark.sql.AnalysisException cannot resolve '(named_struct('t4a', t4.`t4a`, 't4b', t4.`t4b`, 't4c', t4.`t4c`) IN (listquery()))' due to data type mismatch: The data type of one or more elements in the left hand side of an IN subquery is not compatible with the data type of the output of the subquery Mismatched columns: -[(t4.`t4a`:double, t5.`t5a`:decimal(18,0)), (t4.`t4c`:string, t5.`t5c`:bigint)] +[(t4.`t4a`:double, t5.`t5a`:timestamp), (t4.`t4c`:string, t5.`t5c`:bigint)] Left side: [double, string, string]. Right side: -[decimal(18,0), string, bigint].; +[timestamp, string, bigint].; diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out index dd82efba0dde1..b7eef929864fc 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-predicate.sql.out @@ -2,36 +2,36 @@ -- Number of queries: 27 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW p AS VALUES (1, 1) AS T(pk, pv) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW c AS VALUES (1, 1) AS T(ck, cv) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT pk, cv FROM p, c WHERE p.pk = c.ck AND c.cv = (SELECT avg(c1.cv) FROM c c1 WHERE c1.ck = p.pk) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 1 --- !query 3 +-- !query SELECT pk, cv FROM p, c WHERE p.pk = c.ck @@ -40,105 +40,105 @@ AND c.cv = (SELECT max(avg) FROM c c1 WHERE c1.ck = p.pk GROUP BY c1.cv)) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 1 --- !query 4 +-- !query create temporary view 
t1 as select * from values - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), - ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), - ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), - ('val1d', null, 16, 22L, float(17.0), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', null), - ('val1d', null, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-07-04 01:02:00.001', null), - ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), - ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), - ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') + ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), + ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1a', 16S, 12, 21L, float(15.0), 20D, 20E2BD, timestamp '2014-06-04 01:02:00.001', date '2014-06-04'), + ('val1a', 16S, 12, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ('val1c', 8S, 16, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.001', date '2014-05-05'), + ('val1d', null, 16, 22L, float(17.0), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', null), + ('val1d', null, 16, 19L, float(17.0), 25D, 26E2BD, timestamp 
'2014-07-04 01:02:00.001', null), + ('val1e', 10S, null, 25L, float(17.0), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-04'), + ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.001', date '2014-09-04'), + ('val1d', 10S, null, 12L, float(17.0), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), + ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query create temporary view t2 as select * from values - ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 119L, float(17), 25D, 26E2, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), - ('val1c', 12S, 16, 219L, float(17), 25D, 26E2, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), - ('val1b', null, 16, 319L, float(17), 25D, 26E2, timestamp '2017-05-04 01:01:00.000', null), - ('val2e', 8S, null, 419L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1f', 19S, null, 519L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), - ('val1c', 12S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), - ('val1e', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), - ('val1f', 19S, null, 19L, float(17), 25D, 26E2, 
timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) + ('val2a', 6S, 12, 14L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1b', 8S, 16, 119L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:01:00.000', date '2015-05-04'), + ('val1c', 12S, 16, 219L, float(17), 25D, 26E2BD, timestamp '2016-05-04 01:01:00.000', date '2016-05-04'), + ('val1b', null, 16, 319L, float(17), 25D, 26E2BD, timestamp '2017-05-04 01:01:00.000', null), + ('val2e', 8S, null, 419L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ('val1f', 19S, null, 519L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:01:00.000', date '2014-06-04'), + ('val1b', 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:01:00.000', date '2014-07-04'), + ('val1c', 12S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:01:00.000', date '2014-08-05'), + ('val1e', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:01:00.000', date '2014-09-04'), + ('val1f', 19S, null, 19L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), + ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query create temporary view t3 as select * from values - ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), - ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 219L, float(17), 25D, 26E2, timestamp 
'2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val1b', 8S, 16, 319L, float(17), 25D, 26E2, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), - ('val1b', 8S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), - ('val3c', 17S, 16, 519L, float(17), 25D, 26E2, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), - ('val3c', 17S, 16, 19L, float(17), 25D, 26E2, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), - ('val1b', null, 16, 419L, float(17), 25D, 26E2, timestamp '2014-10-04 01:02:00.000', null), - ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-11-04 01:02:00.000', null), - ('val3b', 8S, null, 719L, float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), - ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') + ('val3a', 6S, 12, 110L, float(15), 20D, 20E2BD, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), + ('val3a', 6S, 12, 10L, float(15), 20D, 20E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 219L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 10S, 12, 19L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val1b', 8S, 16, 319L, float(17), 25D, 26E2BD, timestamp '2014-06-04 01:02:00.000', date '2014-06-04'), + ('val1b', 8S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-07-04 01:02:00.000', date '2014-07-04'), + ('val3c', 17S, 16, 519L, float(17), 25D, 26E2BD, timestamp '2014-08-04 01:02:00.000', date '2014-08-04'), + ('val3c', 17S, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-09-04 01:02:00.000', date '2014-09-05'), + ('val1b', null, 16, 419L, float(17), 25D, 26E2BD, timestamp '2014-10-04 01:02:00.000', null), + ('val1b', null, 16, 19L, float(17), 25D, 26E2BD, timestamp '2014-11-04 
01:02:00.000', null), + ('val3b', 8S, null, 719L, float(17), 25D, 26E2BD, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), + ('val3b', 8S, null, 19L, float(17), 25D, 26E2BD, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query SELECT t1a, t1b FROM t1 WHERE t1c = (SELECT max(t2c) FROM t2) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output val1b 8 val1c 8 val1d NULL val1d NULL --- !query 8 +-- !query SELECT t1a, t1d, t1f FROM t1 WHERE t1c = (SELECT max(t2c) FROM t2) AND t1b > (SELECT min(t3b) FROM t3) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output val1b 19 25.0 val1c 19 25.0 --- !query 9 +-- !query SELECT t1a, t1h FROM t1 WHERE t1c = (SELECT max(t2c) @@ -146,69 +146,69 @@ WHERE t1c = (SELECT max(t2c) OR t1b = (SELECT min(t3b) FROM t3 WHERE t3b > 10) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output val1b 2014-05-04 01:01:00 val1c 2014-05-04 01:02:00.001 val1d 2014-06-04 01:01:00 val1d 2014-07-04 01:02:00.001 --- !query 10 +-- !query SELECT t1a, t1b, t2d FROM t1 LEFT JOIN t2 ON t1a = t2a WHERE t1b = (SELECT min(t3b) FROM t3) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output val1a 6 NULL val1a 6 NULL --- !query 11 +-- !query SELECT t1a, t1b, t1g FROM t1 WHERE t1c + 5 = (SELECT max(t2e) FROM t2) --- !query 11 schema -struct --- !query 11 output +-- !query schema +struct +-- !query output val1a 16 2000 val1a 16 2000 --- !query 12 +-- !query SELECT t1a, t1h FROM t1 WHERE date(t1h) = (SELECT min(t2i) FROM t2) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output val1a 2014-04-04 00:00:00 val1a 2014-04-04 01:02:00.001 --- !query 13 +-- !query SELECT t2d, t1a FROM t1, t2 WHERE t1b = t2b AND t2c + 1 = (SELECT max(t2c) + 1 FROM t2, t1 WHERE t2b 
= t1b) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 119 val1b 119 val1c 19 val1b 19 val1c --- !query 14 +-- !query SELECT DISTINCT t2a, max_t1g FROM t2, (SELECT max(t1g) max_t1g, t1a FROM t1 @@ -216,15 +216,15 @@ FROM t2, (SELECT max(t1g) max_t1g, t1a WHERE t2a = t1a AND max_t1g = (SELECT max(t1g) FROM t1) --- !query 14 schema -struct --- !query 14 output +-- !query schema +struct +-- !query output val1b 2600 val1c 2600 val1e 2600 --- !query 15 +-- !query SELECT t3b, t3c FROM t3 WHERE (SELECT max(t3c) @@ -234,40 +234,40 @@ WHERE (SELECT max(t3c) FROM t3 WHERE t3c > 0) AND (t3b is null or t3c is null) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 8 NULL 8 NULL NULL 16 NULL 16 --- !query 16 +-- !query SELECT t1a FROM t1 WHERE t1a < (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output val1a val1a val1b --- !query 17 +-- !query SELECT t1a, t1c FROM t1 WHERE (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) IS NULL --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output val1a 8 val1a 8 val1d NULL @@ -276,7 +276,7 @@ val1e NULL val1e NULL --- !query 18 +-- !query SELECT t1a FROM t1 WHERE t1a = (SELECT max(t2a) @@ -285,14 +285,14 @@ WHERE t1a = (SELECT max(t2a) GROUP BY t2c HAVING count(*) >= 0) OR t1i > '2014-12-31' --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output val1c val1d --- !query 19 +-- !query SELECT t1a FROM t1 WHERE t1a = (SELECT max(t2a) @@ -301,14 +301,14 @@ WHERE t1a = (SELECT max(t2a) GROUP BY t2c HAVING count(*) >= 1) OR t1i > '2014-12-31' --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output val1c val1d --- !query 20 +-- !query SELECT count(t1a) FROM t1 RIGHT JOIN t2 ON t1d = t2d @@ -316,13 +316,13 @@ WHERE t1a < (SELECT max(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 20 schema +-- 
!query schema struct --- !query 20 output +-- !query output 7 --- !query 21 +-- !query SELECT t1a FROM t1 WHERE t1b <= (SELECT max(t2b) @@ -333,14 +333,14 @@ AND t1b >= (SELECT min(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output val1b val1c --- !query 22 +-- !query SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -354,14 +354,14 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output val1b val1c --- !query 23 +-- !query SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -375,9 +375,9 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output val1a val1a val1b @@ -388,7 +388,7 @@ val1d val1d --- !query 24 +-- !query SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -402,16 +402,16 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output val1a val1b val1c val1d --- !query 25 +-- !query SELECT t1a FROM t1 WHERE t1a <= (SELECT max(t2a) @@ -425,13 +425,13 @@ WHERE t1a >= (SELECT min(t2a) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output val1a --- !query 26 +-- !query SELECT t1a FROM t1 GROUP BY t1a, t1c @@ -439,8 +439,8 @@ HAVING max(t1b) <= (SELECT max(t2b) FROM t2 WHERE t2c = t1c GROUP BY t2c) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output val1b val1c diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out index 807bb47221885..184b8daf9d28e 100644 --- 
a/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/scalar-subquery/scalar-subquery-select.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 11 --- !query 0 +-- !query create temporary view t1 as select * from values ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 00:00:00.000', date '2014-04-04'), ('val1b', 8S, 16, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -17,13 +17,13 @@ create temporary view t1 as select * from values ('val1a', 6S, 8, 10L, float(15.0), 20D, 20E2, timestamp '2014-04-04 01:02:00.001', date '2014-04-04'), ('val1e', 10S, null, 19L, float(17.0), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04') as t1(t1a, t1b, t1c, t1d, t1e, t1f, t1g, t1h, t1i) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ('val2a', 6S, 12, 14L, float(15), 20D, 20E2, timestamp '2014-04-04 01:01:00.000', date '2014-04-04'), ('val1b', 10S, 12, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', date '2014-05-04'), @@ -39,13 +39,13 @@ create temporary view t2 as select * from values ('val1f', 19S, null, 19L, float(17), 25D, 26E2, timestamp '2014-10-04 01:01:00.000', date '2014-10-04'), ('val1b', null, 16, 19L, float(17), 25D, 26E2, timestamp '2014-05-04 01:01:00.000', null) as t2(t2a, t2b, t2c, t2d, t2e, t2f, t2g, t2h, t2i) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view t3 as select * from values ('val3a', 6S, 12, 110L, float(15), 20D, 20E2, timestamp '2014-04-04 01:02:00.000', date '2014-04-04'), ('val3a', 6S, 12, 10L, float(15), 20D, 20E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), @@ -60,24 +60,24 @@ create temporary view t3 as select * from values ('val3b', 8S, null, 719L, 
float(17), 25D, 26E2, timestamp '2014-05-04 01:02:00.000', date '2014-05-04'), ('val3b', 8S, null, 19L, float(17), 25D, 26E2, timestamp '2015-05-04 01:02:00.000', date '2015-05-04') as t3(t3a, t3b, t3c, t3d, t3e, t3f, t3g, t3h, t3i) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT (SELECT min(t3d) FROM t3) min_t3d, (SELECT max(t2h) FROM t2) max_t2h FROM t1 WHERE t1a = 'val1c' --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 10 2017-05-04 01:01:00 --- !query 4 +-- !query SELECT t1a, count(*) FROM t1 WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) @@ -85,13 +85,13 @@ WHERE t1c IN (SELECT (SELECT min(t3c) FROM t3) GROUP BY t2g HAVING count(*) > 1) GROUP BY t1a --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output val1a 2 --- !query 5 +-- !query SELECT (SELECT min(t3d) FROM t3) min_t3d, null FROM t1 @@ -101,14 +101,14 @@ SELECT null, (SELECT max(t2h) FROM t2) max_t2h FROM t1 WHERE t1a = 'val1c' --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 10 NULL NULL 2017-05-04 01:01:00 --- !query 6 +-- !query SELECT (SELECT min(t3c) FROM t3) min_t3d FROM t1 WHERE t1a = 'val1a' @@ -116,13 +116,13 @@ INTERSECT SELECT (SELECT min(t2c) FROM t2) min_t2d FROM t1 WHERE t1a = 'val1d' --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 12 --- !query 7 +-- !query SELECT q1.t1a, q2.t2a, q1.min_t3d, q2.avg_t3d FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d FROM t1 @@ -133,9 +133,9 @@ FROM (SELECT t1a, (SELECT min(t3d) FROM t3) min_t3d WHERE t2a IN ('val1c', 'val2a')) q2 ON q1.t1a = q2.t2a AND q1.min_t3d < q2.avg_t3d --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL val2a NULL 200.83333333333334 val1c val1c 10 200.83333333333334 val1c val1c 10 200.83333333333334 @@ -144,18 +144,18 @@ val1e NULL 10 NULL val1e NULL 10 NULL --- !query 8 +-- !query SELECT (SELECT 
min(t3d) FROM t3 WHERE t3.t3a = t1.t1a) min_t3d, (SELECT max(t2h) FROM t2 WHERE t2.t2a = t1.t1a) max_t2h FROM t1 WHERE t1a = 'val1b' --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 19 2017-05-04 01:01:00 --- !query 9 +-- !query SELECT (SELECT min(t3d) FROM t3 WHERE t3a = t1a) min_t3d FROM t1 WHERE t1a = 'val1b' @@ -163,13 +163,13 @@ MINUS SELECT (SELECT min(t3d) FROM t3) abs_min_t3d FROM t1 WHERE t1a = 'val1b' --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 19 --- !query 10 +-- !query SELECT t1a, t1b FROM t1 WHERE NOT EXISTS (SELECT (SELECT max(t2b) @@ -182,9 +182,9 @@ WHERE NOT EXISTS (SELECT (SELECT max(t2b) ON t2a = t1a WHERE t2c = t3c) AND t3a = t1a) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output val1a 16 val1a 16 val1a 6 diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out index 50370df349168..11a51dca25341 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/subquery-in-from.sql.out @@ -2,49 +2,49 @@ -- Number of queries: 6 --- !query 0 +-- !query SELECT * FROM (SELECT * FROM testData) AS t WHERE key = 1 --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 1 --- !query 1 +-- !query FROM (SELECT * FROM testData WHERE key = 1) AS t SELECT * --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 1 --- !query 2 +-- !query SELECT * FROM (SELECT * FROM testData) t WHERE key = 1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 1 --- !query 3 +-- !query FROM (SELECT * FROM testData WHERE key = 1) t SELECT * --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 1 --- !query 4 +-- !query SELECT * FROM (SELECT * FROM 
testData) WHERE key = 1 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 1 --- !query 5 +-- !query FROM (SELECT * FROM testData WHERE key = 1) SELECT * --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 1 diff --git a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out index 1a2bd5ea91cde..25967a3968f23 100644 --- a/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/table-aliases.sql.out @@ -2,96 +2,96 @@ -- Number of queries: 11 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT * FROM testData AS t(col1, col2) WHERE col1 = 1 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 1 1 2 --- !query 2 +-- !query SELECT * FROM testData AS t(col1, col2) WHERE col1 = 2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 2 1 --- !query 3 +-- !query SELECT col1 AS k, SUM(col2) FROM testData AS t(col1, col2) GROUP BY k --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 3 2 1 --- !query 4 +-- !query SELECT * FROM testData AS t(col1, col2, col3) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException Number of column aliases does not match number of columns. Number of column aliases: 3; number of columns: 2.; line 1 pos 14 --- !query 5 +-- !query SELECT * FROM testData AS t(col1) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Number of column aliases does not match number of columns. 
Number of column aliases: 1; number of columns: 2.; line 1 pos 14 --- !query 6 +-- !query SELECT a AS col1, b AS col2 FROM testData AS t(c, d) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a`' given input columns: [c, d]; line 1 pos 7 --- !query 7 +-- !query SELECT * FROM (SELECT 1 AS a, 1 AS b) t(col1, col2) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 1 --- !query 8 +-- !query CREATE OR REPLACE TEMPORARY VIEW src1 AS SELECT * FROM VALUES (1, "a"), (2, "b"), (3, "c") AS src1(id, v1) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query CREATE OR REPLACE TEMPORARY VIEW src2 AS SELECT * FROM VALUES (2, 1.0), (3, 3.2), (1, 8.5) AS src2(id, v2) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query SELECT * FROM (src1 s1 INNER JOIN src2 s2 ON s1.id = s2.id) dst(a, b, c, d) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 a 1 8.5 -2 b 2 1 +2 b 2 1.0 3 c 3 3.2 diff --git a/sql/core/src/test/resources/sql-tests/results/table-valued-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/table-valued-functions.sql.out index fdbea0ee90720..16d483df62fd5 100644 --- a/sql/core/src/test/resources/sql-tests/results/table-valued-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/table-valued-functions.sql.out @@ -2,20 +2,20 @@ -- Number of queries: 8 --- !query 0 +-- !query select * from dummy(3) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output org.apache.spark.sql.AnalysisException could not resolve `dummy` to a table-valued function; line 1 pos 14 --- !query 1 +-- !query select * from range(6 + cos(3)) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 0 1 2 @@ -23,11 +23,11 @@ struct 4 --- !query 2 +-- 
!query select * from range(5, 10) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 5 6 7 @@ -35,11 +35,11 @@ struct 9 --- !query 3 +-- !query select * from range(0, 10, 2) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 0 2 4 @@ -47,11 +47,11 @@ struct 8 --- !query 4 +-- !query select * from range(0, 10, 1, 200) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 0 1 2 @@ -64,11 +64,11 @@ struct 9 --- !query 5 +-- !query select * from range(1, 1, 1, 1, 1) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException error: table-valued function range with alternatives: (end: long) @@ -78,11 +78,11 @@ error: table-valued function range with alternatives: cannot be applied to: (integer, integer, integer, integer, integer); line 1 pos 14 --- !query 6 +-- !query select * from range(1, null) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException error: table-valued function range with alternatives: (end: long) @@ -92,10 +92,10 @@ error: table-valued function range with alternatives: cannot be applied to: (integer, null); line 1 pos 14 --- !query 7 +-- !query select * from RaNgE(2) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 0 1 diff --git a/sql/core/src/test/resources/sql-tests/results/tablesample-negative.sql.out b/sql/core/src/test/resources/sql-tests/results/tablesample-negative.sql.out index 35f3931736b83..0188cdd0f8e71 100644 --- a/sql/core/src/test/resources/sql-tests/results/tablesample-negative.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/tablesample-negative.sql.out @@ -2,35 +2,35 @@ -- Number of queries: 6 --- !query 0 +-- !query CREATE DATABASE mydb1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query USE mydb1 --- !query 1 
schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TABLE t1 USING parquet AS SELECT 1 AS i1 --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT mydb1.t1 FROM t1 TABLESAMPLE (-1 PERCENT) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Sampling fraction (-0.01) must be on interval [0, 1](line 1, pos 24) @@ -40,11 +40,11 @@ SELECT mydb1.t1 FROM t1 TABLESAMPLE (-1 PERCENT) ------------------------^^^ --- !query 4 +-- !query SELECT mydb1.t1 FROM t1 TABLESAMPLE (101 PERCENT) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Sampling fraction (1.01) must be on interval [0, 1](line 1, pos 24) @@ -54,9 +54,9 @@ SELECT mydb1.t1 FROM t1 TABLESAMPLE (101 PERCENT) ------------------------^^^ --- !query 5 +-- !query DROP DATABASE mydb1 CASCADE --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out index c3d5fad0870bc..b49e6b5f21b16 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/arrayJoin.sql.out @@ -2,89 +2,89 @@ -- Number of queries: 11 --- !query 0 +-- !query SELECT array_join(array(true, false), ', ') --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output true, false --- !query 1 +-- !query SELECT array_join(array(2Y, 1Y), ', ') --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 2, 1 --- !query 2 +-- !query SELECT array_join(array(2S, 1S), ', ') --- !query 2 schema +-- !query schema struct --- !query 2 
output +-- !query output 2, 1 --- !query 3 +-- !query SELECT array_join(array(2, 1), ', ') --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 2, 1 --- !query 4 +-- !query SELECT array_join(array(2L, 1L), ', ') --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 2, 1 --- !query 5 +-- !query SELECT array_join(array(9223372036854775809, 9223372036854775808), ', ') --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 9223372036854775809, 9223372036854775808 --- !query 6 +-- !query SELECT array_join(array(2.0D, 1.0D), ', ') --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 2.0, 1.0 --- !query 7 +-- !query SELECT array_join(array(float(2.0), float(1.0)), ', ') --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 2.0, 1.0 --- !query 8 +-- !query SELECT array_join(array(date '2016-03-14', date '2016-03-13'), ', ') --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 2016-03-14, 2016-03-13 --- !query 9 +-- !query SELECT array_join(array(timestamp '2016-11-15 20:54:00.000', timestamp '2016-11-12 20:54:00.000'), ', ') --- !query 9 schema -struct --- !query 9 output +-- !query schema +struct +-- !query output 2016-11-15 20:54:00, 2016-11-12 20:54:00 --- !query 10 +-- !query SELECT array_join(array('a', 'b'), ', ') --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output a, b diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out index 55caab8528fa9..0e1a3d0bc4d9e 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out @@ -2,2121 +2,2121 @@ -- Number of queries: 265 --- !query 0 +-- 
!query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT cast(1 as binary) = '1' FROM t --- !query 1 schema +-- !query schema struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> --- !query 1 output +-- !query output false --- !query 2 +-- !query SELECT cast(1 as binary) > '2' FROM t --- !query 2 schema +-- !query schema struct<(CAST(1 AS BINARY) > CAST(2 AS BINARY)):boolean> --- !query 2 output +-- !query output false --- !query 3 +-- !query SELECT cast(1 as binary) >= '2' FROM t --- !query 3 schema +-- !query schema struct<(CAST(1 AS BINARY) >= CAST(2 AS BINARY)):boolean> --- !query 3 output +-- !query output false --- !query 4 +-- !query SELECT cast(1 as binary) < '2' FROM t --- !query 4 schema +-- !query schema struct<(CAST(1 AS BINARY) < CAST(2 AS BINARY)):boolean> --- !query 4 output +-- !query output true --- !query 5 +-- !query SELECT cast(1 as binary) <= '2' FROM t --- !query 5 schema +-- !query schema struct<(CAST(1 AS BINARY) <= CAST(2 AS BINARY)):boolean> --- !query 5 output +-- !query output true --- !query 6 +-- !query SELECT cast(1 as binary) <> '2' FROM t --- !query 6 schema +-- !query schema struct<(NOT (CAST(1 AS BINARY) = CAST(2 AS BINARY))):boolean> --- !query 6 output +-- !query output true --- !query 7 +-- !query SELECT cast(1 as binary) = cast(null as string) FROM t --- !query 7 schema +-- !query schema struct<(CAST(1 AS BINARY) = CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 7 output +-- !query output NULL --- !query 8 +-- !query SELECT cast(1 as binary) > cast(null as string) FROM t --- !query 8 schema +-- !query schema struct<(CAST(1 AS BINARY) > CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 8 output +-- !query output NULL --- !query 9 +-- !query SELECT cast(1 as binary) >= cast(null as string) FROM t --- !query 9 schema +-- !query schema struct<(CAST(1 AS BINARY) >= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> 
--- !query 9 output +-- !query output NULL --- !query 10 +-- !query SELECT cast(1 as binary) < cast(null as string) FROM t --- !query 10 schema +-- !query schema struct<(CAST(1 AS BINARY) < CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 10 output +-- !query output NULL --- !query 11 +-- !query SELECT cast(1 as binary) <= cast(null as string) FROM t --- !query 11 schema +-- !query schema struct<(CAST(1 AS BINARY) <= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 11 output +-- !query output NULL --- !query 12 +-- !query SELECT cast(1 as binary) <> cast(null as string) FROM t --- !query 12 schema +-- !query schema struct<(NOT (CAST(1 AS BINARY) = CAST(CAST(NULL AS STRING) AS BINARY))):boolean> --- !query 12 output +-- !query output NULL --- !query 13 +-- !query SELECT '1' = cast(1 as binary) FROM t --- !query 13 schema +-- !query schema struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> --- !query 13 output +-- !query output false --- !query 14 +-- !query SELECT '2' > cast(1 as binary) FROM t --- !query 14 schema +-- !query schema struct<(CAST(2 AS BINARY) > CAST(1 AS BINARY)):boolean> --- !query 14 output +-- !query output true --- !query 15 +-- !query SELECT '2' >= cast(1 as binary) FROM t --- !query 15 schema +-- !query schema struct<(CAST(2 AS BINARY) >= CAST(1 AS BINARY)):boolean> --- !query 15 output +-- !query output true --- !query 16 +-- !query SELECT '2' < cast(1 as binary) FROM t --- !query 16 schema +-- !query schema struct<(CAST(2 AS BINARY) < CAST(1 AS BINARY)):boolean> --- !query 16 output +-- !query output false --- !query 17 +-- !query SELECT '2' <= cast(1 as binary) FROM t --- !query 17 schema +-- !query schema struct<(CAST(2 AS BINARY) <= CAST(1 AS BINARY)):boolean> --- !query 17 output +-- !query output false --- !query 18 +-- !query SELECT '2' <> cast(1 as binary) FROM t --- !query 18 schema +-- !query schema struct<(NOT (CAST(2 AS BINARY) = CAST(1 AS BINARY))):boolean> --- !query 18 output +-- !query output true --- 
!query 19 +-- !query SELECT cast(null as string) = cast(1 as binary) FROM t --- !query 19 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) = CAST(1 AS BINARY)):boolean> --- !query 19 output +-- !query output NULL --- !query 20 +-- !query SELECT cast(null as string) > cast(1 as binary) FROM t --- !query 20 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) > CAST(1 AS BINARY)):boolean> --- !query 20 output +-- !query output NULL --- !query 21 +-- !query SELECT cast(null as string) >= cast(1 as binary) FROM t --- !query 21 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) >= CAST(1 AS BINARY)):boolean> --- !query 21 output +-- !query output NULL --- !query 22 +-- !query SELECT cast(null as string) < cast(1 as binary) FROM t --- !query 22 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) < CAST(1 AS BINARY)):boolean> --- !query 22 output +-- !query output NULL --- !query 23 +-- !query SELECT cast(null as string) <= cast(1 as binary) FROM t --- !query 23 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) <= CAST(1 AS BINARY)):boolean> --- !query 23 output +-- !query output NULL --- !query 24 +-- !query SELECT cast(null as string) <> cast(1 as binary) FROM t --- !query 24 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS BINARY) = CAST(1 AS BINARY))):boolean> --- !query 24 output +-- !query output NULL --- !query 25 +-- !query SELECT cast(1 as tinyint) = '1' FROM t --- !query 25 schema +-- !query schema struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 25 output +-- !query output true --- !query 26 +-- !query SELECT cast(1 as tinyint) > '2' FROM t --- !query 26 schema +-- !query schema struct<(CAST(1 AS TINYINT) > CAST(2 AS TINYINT)):boolean> --- !query 26 output +-- !query output false --- !query 27 +-- !query SELECT cast(1 as tinyint) >= '2' FROM t --- !query 27 schema +-- !query schema struct<(CAST(1 AS TINYINT) >= CAST(2 AS 
TINYINT)):boolean> --- !query 27 output +-- !query output false --- !query 28 +-- !query SELECT cast(1 as tinyint) < '2' FROM t --- !query 28 schema +-- !query schema struct<(CAST(1 AS TINYINT) < CAST(2 AS TINYINT)):boolean> --- !query 28 output +-- !query output true --- !query 29 +-- !query SELECT cast(1 as tinyint) <= '2' FROM t --- !query 29 schema +-- !query schema struct<(CAST(1 AS TINYINT) <= CAST(2 AS TINYINT)):boolean> --- !query 29 output +-- !query output true --- !query 30 +-- !query SELECT cast(1 as tinyint) <> '2' FROM t --- !query 30 schema +-- !query schema struct<(NOT (CAST(1 AS TINYINT) = CAST(2 AS TINYINT))):boolean> --- !query 30 output +-- !query output true --- !query 31 +-- !query SELECT cast(1 as tinyint) = cast(null as string) FROM t --- !query 31 schema +-- !query schema struct<(CAST(1 AS TINYINT) = CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> --- !query 31 output +-- !query output NULL --- !query 32 +-- !query SELECT cast(1 as tinyint) > cast(null as string) FROM t --- !query 32 schema +-- !query schema struct<(CAST(1 AS TINYINT) > CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> --- !query 32 output +-- !query output NULL --- !query 33 +-- !query SELECT cast(1 as tinyint) >= cast(null as string) FROM t --- !query 33 schema +-- !query schema struct<(CAST(1 AS TINYINT) >= CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> --- !query 33 output +-- !query output NULL --- !query 34 +-- !query SELECT cast(1 as tinyint) < cast(null as string) FROM t --- !query 34 schema +-- !query schema struct<(CAST(1 AS TINYINT) < CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> --- !query 34 output +-- !query output NULL --- !query 35 +-- !query SELECT cast(1 as tinyint) <= cast(null as string) FROM t --- !query 35 schema +-- !query schema struct<(CAST(1 AS TINYINT) <= CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> --- !query 35 output +-- !query output NULL --- !query 36 +-- !query SELECT cast(1 as tinyint) <> cast(null as string) FROM t --- !query 36 schema 
+-- !query schema struct<(NOT (CAST(1 AS TINYINT) = CAST(CAST(NULL AS STRING) AS TINYINT))):boolean> --- !query 36 output +-- !query output NULL --- !query 37 +-- !query SELECT '1' = cast(1 as tinyint) FROM t --- !query 37 schema +-- !query schema struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 37 output +-- !query output true --- !query 38 +-- !query SELECT '2' > cast(1 as tinyint) FROM t --- !query 38 schema +-- !query schema struct<(CAST(2 AS TINYINT) > CAST(1 AS TINYINT)):boolean> --- !query 38 output +-- !query output true --- !query 39 +-- !query SELECT '2' >= cast(1 as tinyint) FROM t --- !query 39 schema +-- !query schema struct<(CAST(2 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> --- !query 39 output +-- !query output true --- !query 40 +-- !query SELECT '2' < cast(1 as tinyint) FROM t --- !query 40 schema +-- !query schema struct<(CAST(2 AS TINYINT) < CAST(1 AS TINYINT)):boolean> --- !query 40 output +-- !query output false --- !query 41 +-- !query SELECT '2' <= cast(1 as tinyint) FROM t --- !query 41 schema +-- !query schema struct<(CAST(2 AS TINYINT) <= CAST(1 AS TINYINT)):boolean> --- !query 41 output +-- !query output false --- !query 42 +-- !query SELECT '2' <> cast(1 as tinyint) FROM t --- !query 42 schema +-- !query schema struct<(NOT (CAST(2 AS TINYINT) = CAST(1 AS TINYINT))):boolean> --- !query 42 output +-- !query output true --- !query 43 +-- !query SELECT cast(null as string) = cast(1 as tinyint) FROM t --- !query 43 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 43 output +-- !query output NULL --- !query 44 +-- !query SELECT cast(null as string) > cast(1 as tinyint) FROM t --- !query 44 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TINYINT) > CAST(1 AS TINYINT)):boolean> --- !query 44 output +-- !query output NULL --- !query 45 +-- !query SELECT cast(null as string) >= cast(1 as tinyint) FROM t --- !query 45 schema +-- !query schema 
struct<(CAST(CAST(NULL AS STRING) AS TINYINT) >= CAST(1 AS TINYINT)):boolean> --- !query 45 output +-- !query output NULL --- !query 46 +-- !query SELECT cast(null as string) < cast(1 as tinyint) FROM t --- !query 46 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TINYINT) < CAST(1 AS TINYINT)):boolean> --- !query 46 output +-- !query output NULL --- !query 47 +-- !query SELECT cast(null as string) <= cast(1 as tinyint) FROM t --- !query 47 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TINYINT) <= CAST(1 AS TINYINT)):boolean> --- !query 47 output +-- !query output NULL --- !query 48 +-- !query SELECT cast(null as string) <> cast(1 as tinyint) FROM t --- !query 48 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS TINYINT) = CAST(1 AS TINYINT))):boolean> --- !query 48 output +-- !query output NULL --- !query 49 +-- !query SELECT cast(1 as smallint) = '1' FROM t --- !query 49 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 49 output +-- !query output true --- !query 50 +-- !query SELECT cast(1 as smallint) > '2' FROM t --- !query 50 schema +-- !query schema struct<(CAST(1 AS SMALLINT) > CAST(2 AS SMALLINT)):boolean> --- !query 50 output +-- !query output false --- !query 51 +-- !query SELECT cast(1 as smallint) >= '2' FROM t --- !query 51 schema +-- !query schema struct<(CAST(1 AS SMALLINT) >= CAST(2 AS SMALLINT)):boolean> --- !query 51 output +-- !query output false --- !query 52 +-- !query SELECT cast(1 as smallint) < '2' FROM t --- !query 52 schema +-- !query schema struct<(CAST(1 AS SMALLINT) < CAST(2 AS SMALLINT)):boolean> --- !query 52 output +-- !query output true --- !query 53 +-- !query SELECT cast(1 as smallint) <= '2' FROM t --- !query 53 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <= CAST(2 AS SMALLINT)):boolean> --- !query 53 output +-- !query output true --- !query 54 +-- !query SELECT cast(1 as smallint) <> '2' FROM t --- !query 54 schema +-- !query 
schema struct<(NOT (CAST(1 AS SMALLINT) = CAST(2 AS SMALLINT))):boolean> --- !query 54 output +-- !query output true --- !query 55 +-- !query SELECT cast(1 as smallint) = cast(null as string) FROM t --- !query 55 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> --- !query 55 output +-- !query output NULL --- !query 56 +-- !query SELECT cast(1 as smallint) > cast(null as string) FROM t --- !query 56 schema +-- !query schema struct<(CAST(1 AS SMALLINT) > CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> --- !query 56 output +-- !query output NULL --- !query 57 +-- !query SELECT cast(1 as smallint) >= cast(null as string) FROM t --- !query 57 schema +-- !query schema struct<(CAST(1 AS SMALLINT) >= CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> --- !query 57 output +-- !query output NULL --- !query 58 +-- !query SELECT cast(1 as smallint) < cast(null as string) FROM t --- !query 58 schema +-- !query schema struct<(CAST(1 AS SMALLINT) < CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> --- !query 58 output +-- !query output NULL --- !query 59 +-- !query SELECT cast(1 as smallint) <= cast(null as string) FROM t --- !query 59 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <= CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> --- !query 59 output +-- !query output NULL --- !query 60 +-- !query SELECT cast(1 as smallint) <> cast(null as string) FROM t --- !query 60 schema +-- !query schema struct<(NOT (CAST(1 AS SMALLINT) = CAST(CAST(NULL AS STRING) AS SMALLINT))):boolean> --- !query 60 output +-- !query output NULL --- !query 61 +-- !query SELECT '1' = cast(1 as smallint) FROM t --- !query 61 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 61 output +-- !query output true --- !query 62 +-- !query SELECT '2' > cast(1 as smallint) FROM t --- !query 62 schema +-- !query schema struct<(CAST(2 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> --- !query 62 output +-- !query 
output true --- !query 63 +-- !query SELECT '2' >= cast(1 as smallint) FROM t --- !query 63 schema +-- !query schema struct<(CAST(2 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> --- !query 63 output +-- !query output true --- !query 64 +-- !query SELECT '2' < cast(1 as smallint) FROM t --- !query 64 schema +-- !query schema struct<(CAST(2 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> --- !query 64 output +-- !query output false --- !query 65 +-- !query SELECT '2' <= cast(1 as smallint) FROM t --- !query 65 schema +-- !query schema struct<(CAST(2 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> --- !query 65 output +-- !query output false --- !query 66 +-- !query SELECT '2' <> cast(1 as smallint) FROM t --- !query 66 schema +-- !query schema struct<(NOT (CAST(2 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> --- !query 66 output +-- !query output true --- !query 67 +-- !query SELECT cast(null as string) = cast(1 as smallint) FROM t --- !query 67 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 67 output +-- !query output NULL --- !query 68 +-- !query SELECT cast(null as string) > cast(1 as smallint) FROM t --- !query 68 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> --- !query 68 output +-- !query output NULL --- !query 69 +-- !query SELECT cast(null as string) >= cast(1 as smallint) FROM t --- !query 69 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> --- !query 69 output +-- !query output NULL --- !query 70 +-- !query SELECT cast(null as string) < cast(1 as smallint) FROM t --- !query 70 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> --- !query 70 output +-- !query output NULL --- !query 71 +-- !query SELECT cast(null as string) <= cast(1 as smallint) FROM t --- !query 71 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS 
SMALLINT) <= CAST(1 AS SMALLINT)):boolean> --- !query 71 output +-- !query output NULL --- !query 72 +-- !query SELECT cast(null as string) <> cast(1 as smallint) FROM t --- !query 72 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> --- !query 72 output +-- !query output NULL --- !query 73 +-- !query SELECT cast(1 as int) = '1' FROM t --- !query 73 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> --- !query 73 output +-- !query output true --- !query 74 +-- !query SELECT cast(1 as int) > '2' FROM t --- !query 74 schema +-- !query schema struct<(CAST(1 AS INT) > CAST(2 AS INT)):boolean> --- !query 74 output +-- !query output false --- !query 75 +-- !query SELECT cast(1 as int) >= '2' FROM t --- !query 75 schema +-- !query schema struct<(CAST(1 AS INT) >= CAST(2 AS INT)):boolean> --- !query 75 output +-- !query output false --- !query 76 +-- !query SELECT cast(1 as int) < '2' FROM t --- !query 76 schema +-- !query schema struct<(CAST(1 AS INT) < CAST(2 AS INT)):boolean> --- !query 76 output +-- !query output true --- !query 77 +-- !query SELECT cast(1 as int) <= '2' FROM t --- !query 77 schema +-- !query schema struct<(CAST(1 AS INT) <= CAST(2 AS INT)):boolean> --- !query 77 output +-- !query output true --- !query 78 +-- !query SELECT cast(1 as int) <> '2' FROM t --- !query 78 schema +-- !query schema struct<(NOT (CAST(1 AS INT) = CAST(2 AS INT))):boolean> --- !query 78 output +-- !query output true --- !query 79 +-- !query SELECT cast(1 as int) = cast(null as string) FROM t --- !query 79 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(CAST(NULL AS STRING) AS INT)):boolean> --- !query 79 output +-- !query output NULL --- !query 80 +-- !query SELECT cast(1 as int) > cast(null as string) FROM t --- !query 80 schema +-- !query schema struct<(CAST(1 AS INT) > CAST(CAST(NULL AS STRING) AS INT)):boolean> --- !query 80 output +-- !query output NULL --- !query 81 +-- !query SELECT 
cast(1 as int) >= cast(null as string) FROM t --- !query 81 schema +-- !query schema struct<(CAST(1 AS INT) >= CAST(CAST(NULL AS STRING) AS INT)):boolean> --- !query 81 output +-- !query output NULL --- !query 82 +-- !query SELECT cast(1 as int) < cast(null as string) FROM t --- !query 82 schema +-- !query schema struct<(CAST(1 AS INT) < CAST(CAST(NULL AS STRING) AS INT)):boolean> --- !query 82 output +-- !query output NULL --- !query 83 +-- !query SELECT cast(1 as int) <= cast(null as string) FROM t --- !query 83 schema +-- !query schema struct<(CAST(1 AS INT) <= CAST(CAST(NULL AS STRING) AS INT)):boolean> --- !query 83 output +-- !query output NULL --- !query 84 +-- !query SELECT cast(1 as int) <> cast(null as string) FROM t --- !query 84 schema +-- !query schema struct<(NOT (CAST(1 AS INT) = CAST(CAST(NULL AS STRING) AS INT))):boolean> --- !query 84 output +-- !query output NULL --- !query 85 +-- !query SELECT '1' = cast(1 as int) FROM t --- !query 85 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> --- !query 85 output +-- !query output true --- !query 86 +-- !query SELECT '2' > cast(1 as int) FROM t --- !query 86 schema +-- !query schema struct<(CAST(2 AS INT) > CAST(1 AS INT)):boolean> --- !query 86 output +-- !query output true --- !query 87 +-- !query SELECT '2' >= cast(1 as int) FROM t --- !query 87 schema +-- !query schema struct<(CAST(2 AS INT) >= CAST(1 AS INT)):boolean> --- !query 87 output +-- !query output true --- !query 88 +-- !query SELECT '2' < cast(1 as int) FROM t --- !query 88 schema +-- !query schema struct<(CAST(2 AS INT) < CAST(1 AS INT)):boolean> --- !query 88 output +-- !query output false --- !query 89 +-- !query SELECT '2' <> cast(1 as int) FROM t --- !query 89 schema +-- !query schema struct<(NOT (CAST(2 AS INT) = CAST(1 AS INT))):boolean> --- !query 89 output +-- !query output true --- !query 90 +-- !query SELECT '2' <= cast(1 as int) FROM t --- !query 90 schema +-- !query schema struct<(CAST(2 AS INT) <= 
CAST(1 AS INT)):boolean> --- !query 90 output +-- !query output false --- !query 91 +-- !query SELECT cast(null as string) = cast(1 as int) FROM t --- !query 91 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS INT) = CAST(1 AS INT)):boolean> --- !query 91 output +-- !query output NULL --- !query 92 +-- !query SELECT cast(null as string) > cast(1 as int) FROM t --- !query 92 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS INT) > CAST(1 AS INT)):boolean> --- !query 92 output +-- !query output NULL --- !query 93 +-- !query SELECT cast(null as string) >= cast(1 as int) FROM t --- !query 93 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS INT) >= CAST(1 AS INT)):boolean> --- !query 93 output +-- !query output NULL --- !query 94 +-- !query SELECT cast(null as string) < cast(1 as int) FROM t --- !query 94 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS INT) < CAST(1 AS INT)):boolean> --- !query 94 output +-- !query output NULL --- !query 95 +-- !query SELECT cast(null as string) <> cast(1 as int) FROM t --- !query 95 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS INT) = CAST(1 AS INT))):boolean> --- !query 95 output +-- !query output NULL --- !query 96 +-- !query SELECT cast(null as string) <= cast(1 as int) FROM t --- !query 96 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS INT) <= CAST(1 AS INT)):boolean> --- !query 96 output +-- !query output NULL --- !query 97 +-- !query SELECT cast(1 as bigint) = '1' FROM t --- !query 97 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 97 output +-- !query output true --- !query 98 +-- !query SELECT cast(1 as bigint) > '2' FROM t --- !query 98 schema +-- !query schema struct<(CAST(1 AS BIGINT) > CAST(2 AS BIGINT)):boolean> --- !query 98 output +-- !query output false --- !query 99 +-- !query SELECT cast(1 as bigint) >= '2' FROM t --- !query 99 schema +-- !query schema struct<(CAST(1 AS BIGINT) >= CAST(2 
AS BIGINT)):boolean> --- !query 99 output +-- !query output false --- !query 100 +-- !query SELECT cast(1 as bigint) < '2' FROM t --- !query 100 schema +-- !query schema struct<(CAST(1 AS BIGINT) < CAST(2 AS BIGINT)):boolean> --- !query 100 output +-- !query output true --- !query 101 +-- !query SELECT cast(1 as bigint) <= '2' FROM t --- !query 101 schema +-- !query schema struct<(CAST(1 AS BIGINT) <= CAST(2 AS BIGINT)):boolean> --- !query 101 output +-- !query output true --- !query 102 +-- !query SELECT cast(1 as bigint) <> '2' FROM t --- !query 102 schema +-- !query schema struct<(NOT (CAST(1 AS BIGINT) = CAST(2 AS BIGINT))):boolean> --- !query 102 output +-- !query output true --- !query 103 +-- !query SELECT cast(1 as bigint) = cast(null as string) FROM t --- !query 103 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> --- !query 103 output +-- !query output NULL --- !query 104 +-- !query SELECT cast(1 as bigint) > cast(null as string) FROM t --- !query 104 schema +-- !query schema struct<(CAST(1 AS BIGINT) > CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> --- !query 104 output +-- !query output NULL --- !query 105 +-- !query SELECT cast(1 as bigint) >= cast(null as string) FROM t --- !query 105 schema +-- !query schema struct<(CAST(1 AS BIGINT) >= CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> --- !query 105 output +-- !query output NULL --- !query 106 +-- !query SELECT cast(1 as bigint) < cast(null as string) FROM t --- !query 106 schema +-- !query schema struct<(CAST(1 AS BIGINT) < CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> --- !query 106 output +-- !query output NULL --- !query 107 +-- !query SELECT cast(1 as bigint) <= cast(null as string) FROM t --- !query 107 schema +-- !query schema struct<(CAST(1 AS BIGINT) <= CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> --- !query 107 output +-- !query output NULL --- !query 108 +-- !query SELECT cast(1 as bigint) <> cast(null as string) FROM t --- !query 108 
schema +-- !query schema struct<(NOT (CAST(1 AS BIGINT) = CAST(CAST(NULL AS STRING) AS BIGINT))):boolean> --- !query 108 output +-- !query output NULL --- !query 109 +-- !query SELECT '1' = cast(1 as bigint) FROM t --- !query 109 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 109 output +-- !query output true --- !query 110 +-- !query SELECT '2' > cast(1 as bigint) FROM t --- !query 110 schema +-- !query schema struct<(CAST(2 AS BIGINT) > CAST(1 AS BIGINT)):boolean> --- !query 110 output +-- !query output true --- !query 111 +-- !query SELECT '2' >= cast(1 as bigint) FROM t --- !query 111 schema +-- !query schema struct<(CAST(2 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> --- !query 111 output +-- !query output true --- !query 112 +-- !query SELECT '2' < cast(1 as bigint) FROM t --- !query 112 schema +-- !query schema struct<(CAST(2 AS BIGINT) < CAST(1 AS BIGINT)):boolean> --- !query 112 output +-- !query output false --- !query 113 +-- !query SELECT '2' <= cast(1 as bigint) FROM t --- !query 113 schema +-- !query schema struct<(CAST(2 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> --- !query 113 output +-- !query output false --- !query 114 +-- !query SELECT '2' <> cast(1 as bigint) FROM t --- !query 114 schema +-- !query schema struct<(NOT (CAST(2 AS BIGINT) = CAST(1 AS BIGINT))):boolean> --- !query 114 output +-- !query output true --- !query 115 +-- !query SELECT cast(null as string) = cast(1 as bigint) FROM t --- !query 115 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 115 output +-- !query output NULL --- !query 116 +-- !query SELECT cast(null as string) > cast(1 as bigint) FROM t --- !query 116 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BIGINT) > CAST(1 AS BIGINT)):boolean> --- !query 116 output +-- !query output NULL --- !query 117 +-- !query SELECT cast(null as string) >= cast(1 as bigint) FROM t --- !query 117 schema +-- !query schema 
struct<(CAST(CAST(NULL AS STRING) AS BIGINT) >= CAST(1 AS BIGINT)):boolean> --- !query 117 output +-- !query output NULL --- !query 118 +-- !query SELECT cast(null as string) < cast(1 as bigint) FROM t --- !query 118 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BIGINT) < CAST(1 AS BIGINT)):boolean> --- !query 118 output +-- !query output NULL --- !query 119 +-- !query SELECT cast(null as string) <= cast(1 as bigint) FROM t --- !query 119 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BIGINT) <= CAST(1 AS BIGINT)):boolean> --- !query 119 output +-- !query output NULL --- !query 120 +-- !query SELECT cast(null as string) <> cast(1 as bigint) FROM t --- !query 120 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS BIGINT) = CAST(1 AS BIGINT))):boolean> --- !query 120 output +-- !query output NULL --- !query 121 +-- !query SELECT cast(1 as decimal(10, 0)) = '1' FROM t --- !query 121 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 121 output +-- !query output true --- !query 122 +-- !query SELECT cast(1 as decimal(10, 0)) > '2' FROM t --- !query 122 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(2 AS DOUBLE)):boolean> --- !query 122 output +-- !query output false --- !query 123 +-- !query SELECT cast(1 as decimal(10, 0)) >= '2' FROM t --- !query 123 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(2 AS DOUBLE)):boolean> --- !query 123 output +-- !query output false --- !query 124 +-- !query SELECT cast(1 as decimal(10, 0)) < '2' FROM t --- !query 124 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(2 AS DOUBLE)):boolean> --- !query 124 output +-- !query output true --- !query 125 +-- !query SELECT cast(1 as decimal(10, 0)) <> '2' FROM t --- !query 125 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(2 AS DOUBLE))):boolean> 
--- !query 125 output +-- !query output true --- !query 126 +-- !query SELECT cast(1 as decimal(10, 0)) <= '2' FROM t --- !query 126 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(2 AS DOUBLE)):boolean> --- !query 126 output +-- !query output true --- !query 127 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(null as string) FROM t --- !query 127 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 127 output +-- !query output NULL --- !query 128 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(null as string) FROM t --- !query 128 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 128 output +-- !query output NULL --- !query 129 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(null as string) FROM t --- !query 129 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 129 output +-- !query output NULL --- !query 130 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(null as string) FROM t --- !query 130 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 130 output +-- !query output NULL --- !query 131 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(null as string) FROM t --- !query 131 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE))):boolean> --- !query 131 output +-- !query output NULL --- !query 132 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(null as string) FROM t --- !query 132 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 132 output +-- !query output NULL --- !query 133 +-- !query SELECT '1' = cast(1 as decimal(10, 0)) FROM t 
--- !query 133 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 133 output +-- !query output true --- !query 134 +-- !query SELECT '2' > cast(1 as decimal(10, 0)) FROM t --- !query 134 schema +-- !query schema struct<(CAST(2 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 134 output +-- !query output true --- !query 135 +-- !query SELECT '2' >= cast(1 as decimal(10, 0)) FROM t --- !query 135 schema +-- !query schema struct<(CAST(2 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 135 output +-- !query output true --- !query 136 +-- !query SELECT '2' < cast(1 as decimal(10, 0)) FROM t --- !query 136 schema +-- !query schema struct<(CAST(2 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 136 output +-- !query output false --- !query 137 +-- !query SELECT '2' <= cast(1 as decimal(10, 0)) FROM t --- !query 137 schema +-- !query schema struct<(CAST(2 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 137 output +-- !query output false --- !query 138 +-- !query SELECT '2' <> cast(1 as decimal(10, 0)) FROM t --- !query 138 schema +-- !query schema struct<(NOT (CAST(2 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 138 output +-- !query output true --- !query 139 +-- !query SELECT cast(null as string) = cast(1 as decimal(10, 0)) FROM t --- !query 139 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 139 output +-- !query output NULL --- !query 140 +-- !query SELECT cast(null as string) > cast(1 as decimal(10, 0)) FROM t --- !query 140 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 140 output +-- !query output NULL --- !query 141 +-- !query SELECT cast(null as string) >= cast(1 as decimal(10, 0)) FROM t --- 
!query 141 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 141 output +-- !query output NULL --- !query 142 +-- !query SELECT cast(null as string) < cast(1 as decimal(10, 0)) FROM t --- !query 142 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 142 output +-- !query output NULL --- !query 143 +-- !query SELECT cast(null as string) <= cast(1 as decimal(10, 0)) FROM t --- !query 143 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 143 output +-- !query output NULL --- !query 144 +-- !query SELECT cast(null as string) <> cast(1 as decimal(10, 0)) FROM t --- !query 144 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 144 output +-- !query output NULL --- !query 145 +-- !query SELECT cast(1 as double) = '1' FROM t --- !query 145 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 145 output +-- !query output true --- !query 146 +-- !query SELECT cast(1 as double) > '2' FROM t --- !query 146 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(2 AS DOUBLE)):boolean> --- !query 146 output +-- !query output false --- !query 147 +-- !query SELECT cast(1 as double) >= '2' FROM t --- !query 147 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(2 AS DOUBLE)):boolean> --- !query 147 output +-- !query output false --- !query 148 +-- !query SELECT cast(1 as double) < '2' FROM t --- !query 148 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(2 AS DOUBLE)):boolean> --- !query 148 output +-- !query output true --- !query 149 +-- !query SELECT cast(1 as double) <= '2' FROM t --- !query 149 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(2 AS DOUBLE)):boolean> --- 
!query 149 output +-- !query output true --- !query 150 +-- !query SELECT cast(1 as double) <> '2' FROM t --- !query 150 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(2 AS DOUBLE))):boolean> --- !query 150 output +-- !query output true --- !query 151 +-- !query SELECT cast(1 as double) = cast(null as string) FROM t --- !query 151 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 151 output +-- !query output NULL --- !query 152 +-- !query SELECT cast(1 as double) > cast(null as string) FROM t --- !query 152 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 152 output +-- !query output NULL --- !query 153 +-- !query SELECT cast(1 as double) >= cast(null as string) FROM t --- !query 153 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 153 output +-- !query output NULL --- !query 154 +-- !query SELECT cast(1 as double) < cast(null as string) FROM t --- !query 154 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 154 output +-- !query output NULL --- !query 155 +-- !query SELECT cast(1 as double) <= cast(null as string) FROM t --- !query 155 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> --- !query 155 output +-- !query output NULL --- !query 156 +-- !query SELECT cast(1 as double) <> cast(null as string) FROM t --- !query 156 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE))):boolean> --- !query 156 output +-- !query output NULL --- !query 157 +-- !query SELECT '1' = cast(1 as double) FROM t --- !query 157 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 157 output +-- !query output true --- !query 158 +-- !query SELECT '2' > cast(1 as double) FROM t --- !query 158 
schema +-- !query schema struct<(CAST(2 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 158 output +-- !query output true --- !query 159 +-- !query SELECT '2' >= cast(1 as double) FROM t --- !query 159 schema +-- !query schema struct<(CAST(2 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 159 output +-- !query output true --- !query 160 +-- !query SELECT '2' < cast(1 as double) FROM t --- !query 160 schema +-- !query schema struct<(CAST(2 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 160 output +-- !query output false --- !query 161 +-- !query SELECT '2' <= cast(1 as double) FROM t --- !query 161 schema +-- !query schema struct<(CAST(2 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 161 output +-- !query output false --- !query 162 +-- !query SELECT '2' <> cast(1 as double) FROM t --- !query 162 schema +-- !query schema struct<(NOT (CAST(2 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 162 output +-- !query output true --- !query 163 +-- !query SELECT cast(null as string) = cast(1 as double) FROM t --- !query 163 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 163 output +-- !query output NULL --- !query 164 +-- !query SELECT cast(null as string) > cast(1 as double) FROM t --- !query 164 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 164 output +-- !query output NULL --- !query 165 +-- !query SELECT cast(null as string) >= cast(1 as double) FROM t --- !query 165 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 165 output +-- !query output NULL --- !query 166 +-- !query SELECT cast(null as string) < cast(1 as double) FROM t --- !query 166 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 166 output +-- !query output NULL --- !query 167 +-- !query SELECT cast(null as string) <= cast(1 as double) 
FROM t --- !query 167 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 167 output +-- !query output NULL --- !query 168 +-- !query SELECT cast(null as string) <> cast(1 as double) FROM t --- !query 168 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 168 output +-- !query output NULL --- !query 169 +-- !query SELECT cast(1 as float) = '1' FROM t --- !query 169 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 169 output +-- !query output true --- !query 170 +-- !query SELECT cast(1 as float) > '2' FROM t --- !query 170 schema +-- !query schema struct<(CAST(1 AS FLOAT) > CAST(2 AS FLOAT)):boolean> --- !query 170 output +-- !query output false --- !query 171 +-- !query SELECT cast(1 as float) >= '2' FROM t --- !query 171 schema +-- !query schema struct<(CAST(1 AS FLOAT) >= CAST(2 AS FLOAT)):boolean> --- !query 171 output +-- !query output false --- !query 172 +-- !query SELECT cast(1 as float) < '2' FROM t --- !query 172 schema +-- !query schema struct<(CAST(1 AS FLOAT) < CAST(2 AS FLOAT)):boolean> --- !query 172 output +-- !query output true --- !query 173 +-- !query SELECT cast(1 as float) <= '2' FROM t --- !query 173 schema +-- !query schema struct<(CAST(1 AS FLOAT) <= CAST(2 AS FLOAT)):boolean> --- !query 173 output +-- !query output true --- !query 174 +-- !query SELECT cast(1 as float) <> '2' FROM t --- !query 174 schema +-- !query schema struct<(NOT (CAST(1 AS FLOAT) = CAST(2 AS FLOAT))):boolean> --- !query 174 output +-- !query output true --- !query 175 +-- !query SELECT cast(1 as float) = cast(null as string) FROM t --- !query 175 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> --- !query 175 output +-- !query output NULL --- !query 176 +-- !query SELECT cast(1 as float) > cast(null as string) FROM t --- !query 176 schema +-- !query schema 
struct<(CAST(1 AS FLOAT) > CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> --- !query 176 output +-- !query output NULL --- !query 177 +-- !query SELECT cast(1 as float) >= cast(null as string) FROM t --- !query 177 schema +-- !query schema struct<(CAST(1 AS FLOAT) >= CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> --- !query 177 output +-- !query output NULL --- !query 178 +-- !query SELECT cast(1 as float) < cast(null as string) FROM t --- !query 178 schema +-- !query schema struct<(CAST(1 AS FLOAT) < CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> --- !query 178 output +-- !query output NULL --- !query 179 +-- !query SELECT cast(1 as float) <= cast(null as string) FROM t --- !query 179 schema +-- !query schema struct<(CAST(1 AS FLOAT) <= CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> --- !query 179 output +-- !query output NULL --- !query 180 +-- !query SELECT cast(1 as float) <> cast(null as string) FROM t --- !query 180 schema +-- !query schema struct<(NOT (CAST(1 AS FLOAT) = CAST(CAST(NULL AS STRING) AS FLOAT))):boolean> --- !query 180 output +-- !query output NULL --- !query 181 +-- !query SELECT '1' = cast(1 as float) FROM t --- !query 181 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 181 output +-- !query output true --- !query 182 +-- !query SELECT '2' > cast(1 as float) FROM t --- !query 182 schema +-- !query schema struct<(CAST(2 AS FLOAT) > CAST(1 AS FLOAT)):boolean> --- !query 182 output +-- !query output true --- !query 183 +-- !query SELECT '2' >= cast(1 as float) FROM t --- !query 183 schema +-- !query schema struct<(CAST(2 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> --- !query 183 output +-- !query output true --- !query 184 +-- !query SELECT '2' < cast(1 as float) FROM t --- !query 184 schema +-- !query schema struct<(CAST(2 AS FLOAT) < CAST(1 AS FLOAT)):boolean> --- !query 184 output +-- !query output false --- !query 185 +-- !query SELECT '2' <= cast(1 as float) FROM t --- !query 185 schema +-- !query schema 
struct<(CAST(2 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> --- !query 185 output +-- !query output false --- !query 186 +-- !query SELECT '2' <> cast(1 as float) FROM t --- !query 186 schema +-- !query schema struct<(NOT (CAST(2 AS FLOAT) = CAST(1 AS FLOAT))):boolean> --- !query 186 output +-- !query output true --- !query 187 +-- !query SELECT cast(null as string) = cast(1 as float) FROM t --- !query 187 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 187 output +-- !query output NULL --- !query 188 +-- !query SELECT cast(null as string) > cast(1 as float) FROM t --- !query 188 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS FLOAT) > CAST(1 AS FLOAT)):boolean> --- !query 188 output +-- !query output NULL --- !query 189 +-- !query SELECT cast(null as string) >= cast(1 as float) FROM t --- !query 189 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS FLOAT) >= CAST(1 AS FLOAT)):boolean> --- !query 189 output +-- !query output NULL --- !query 190 +-- !query SELECT cast(null as string) < cast(1 as float) FROM t --- !query 190 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS FLOAT) < CAST(1 AS FLOAT)):boolean> --- !query 190 output +-- !query output NULL --- !query 191 +-- !query SELECT cast(null as string) <= cast(1 as float) FROM t --- !query 191 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS FLOAT) <= CAST(1 AS FLOAT)):boolean> --- !query 191 output +-- !query output NULL --- !query 192 +-- !query SELECT cast(null as string) <> cast(1 as float) FROM t --- !query 192 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS FLOAT) = CAST(1 AS FLOAT))):boolean> --- !query 192 output +-- !query output NULL --- !query 193 +-- !query SELECT '1996-09-09' = date('1996-09-09') FROM t --- !query 193 schema +-- !query schema struct<(CAST(1996-09-09 AS DATE) = CAST(1996-09-09 AS DATE)):boolean> --- !query 193 output +-- !query output true --- !query 194 +-- 
!query SELECT '1996-9-10' > date('1996-09-09') FROM t --- !query 194 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) > CAST(1996-09-09 AS DATE)):boolean> --- !query 194 output +-- !query output true --- !query 195 +-- !query SELECT '1996-9-10' >= date('1996-09-09') FROM t --- !query 195 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) >= CAST(1996-09-09 AS DATE)):boolean> --- !query 195 output +-- !query output true --- !query 196 +-- !query SELECT '1996-9-10' < date('1996-09-09') FROM t --- !query 196 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) < CAST(1996-09-09 AS DATE)):boolean> --- !query 196 output +-- !query output false --- !query 197 +-- !query SELECT '1996-9-10' <= date('1996-09-09') FROM t --- !query 197 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) <= CAST(1996-09-09 AS DATE)):boolean> --- !query 197 output +-- !query output false --- !query 198 +-- !query SELECT '1996-9-10' <> date('1996-09-09') FROM t --- !query 198 schema +-- !query schema struct<(NOT (CAST(1996-9-10 AS DATE) = CAST(1996-09-09 AS DATE))):boolean> --- !query 198 output +-- !query output true --- !query 199 +-- !query SELECT cast(null as string) = date('1996-09-09') FROM t --- !query 199 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DATE) = CAST(1996-09-09 AS DATE)):boolean> --- !query 199 output +-- !query output NULL --- !query 200 +-- !query SELECT cast(null as string)> date('1996-09-09') FROM t --- !query 200 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DATE) > CAST(1996-09-09 AS DATE)):boolean> --- !query 200 output +-- !query output NULL --- !query 201 +-- !query SELECT cast(null as string)>= date('1996-09-09') FROM t --- !query 201 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DATE) >= CAST(1996-09-09 AS DATE)):boolean> --- !query 201 output +-- !query output NULL --- !query 202 +-- !query SELECT cast(null as string)< date('1996-09-09') FROM t --- !query 202 schema +-- !query schema 
struct<(CAST(CAST(NULL AS STRING) AS DATE) < CAST(1996-09-09 AS DATE)):boolean> --- !query 202 output +-- !query output NULL --- !query 203 +-- !query SELECT cast(null as string)<= date('1996-09-09') FROM t --- !query 203 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS DATE) <= CAST(1996-09-09 AS DATE)):boolean> --- !query 203 output +-- !query output NULL --- !query 204 +-- !query SELECT cast(null as string)<> date('1996-09-09') FROM t --- !query 204 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS DATE) = CAST(1996-09-09 AS DATE))):boolean> --- !query 204 output +-- !query output NULL --- !query 205 +-- !query SELECT date('1996-09-09') = '1996-09-09' FROM t --- !query 205 schema +-- !query schema struct<(CAST(1996-09-09 AS DATE) = CAST(1996-09-09 AS DATE)):boolean> --- !query 205 output +-- !query output true --- !query 206 +-- !query SELECT date('1996-9-10') > '1996-09-09' FROM t --- !query 206 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) > CAST(1996-09-09 AS DATE)):boolean> --- !query 206 output +-- !query output true --- !query 207 +-- !query SELECT date('1996-9-10') >= '1996-09-09' FROM t --- !query 207 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) >= CAST(1996-09-09 AS DATE)):boolean> --- !query 207 output +-- !query output true --- !query 208 +-- !query SELECT date('1996-9-10') < '1996-09-09' FROM t --- !query 208 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) < CAST(1996-09-09 AS DATE)):boolean> --- !query 208 output +-- !query output false --- !query 209 +-- !query SELECT date('1996-9-10') <= '1996-09-09' FROM t --- !query 209 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) <= CAST(1996-09-09 AS DATE)):boolean> --- !query 209 output +-- !query output false --- !query 210 +-- !query SELECT date('1996-9-10') <> '1996-09-09' FROM t --- !query 210 schema +-- !query schema struct<(NOT (CAST(1996-9-10 AS DATE) = CAST(1996-09-09 AS DATE))):boolean> --- !query 210 output +-- !query output 
true --- !query 211 +-- !query SELECT date('1996-09-09') = cast(null as string) FROM t --- !query 211 schema +-- !query schema struct<(CAST(1996-09-09 AS DATE) = CAST(CAST(NULL AS STRING) AS DATE)):boolean> --- !query 211 output +-- !query output NULL --- !query 212 +-- !query SELECT date('1996-9-10') > cast(null as string) FROM t --- !query 212 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) > CAST(CAST(NULL AS STRING) AS DATE)):boolean> --- !query 212 output +-- !query output NULL --- !query 213 +-- !query SELECT date('1996-9-10') >= cast(null as string) FROM t --- !query 213 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) >= CAST(CAST(NULL AS STRING) AS DATE)):boolean> --- !query 213 output +-- !query output NULL --- !query 214 +-- !query SELECT date('1996-9-10') < cast(null as string) FROM t --- !query 214 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) < CAST(CAST(NULL AS STRING) AS DATE)):boolean> --- !query 214 output +-- !query output NULL --- !query 215 +-- !query SELECT date('1996-9-10') <= cast(null as string) FROM t --- !query 215 schema +-- !query schema struct<(CAST(1996-9-10 AS DATE) <= CAST(CAST(NULL AS STRING) AS DATE)):boolean> --- !query 215 output +-- !query output NULL --- !query 216 +-- !query SELECT date('1996-9-10') <> cast(null as string) FROM t --- !query 216 schema +-- !query schema struct<(NOT (CAST(1996-9-10 AS DATE) = CAST(CAST(NULL AS STRING) AS DATE))):boolean> --- !query 216 output +-- !query output NULL --- !query 217 +-- !query SELECT '1996-09-09 12:12:12.4' = timestamp('1996-09-09 12:12:12.4') FROM t --- !query 217 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 217 output +-- !query output true --- !query 218 +-- !query SELECT '1996-09-09 12:12:12.5' > timestamp('1996-09-09 12:12:12.4') FROM t --- !query 218 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) > CAST(1996-09-09 12:12:12.4 AS 
TIMESTAMP)):boolean> --- !query 218 output +-- !query output true --- !query 219 +-- !query SELECT '1996-09-09 12:12:12.5' >= timestamp('1996-09-09 12:12:12.4') FROM t --- !query 219 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) >= CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 219 output +-- !query output true --- !query 220 +-- !query SELECT '1996-09-09 12:12:12.5' < timestamp('1996-09-09 12:12:12.4') FROM t --- !query 220 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) < CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 220 output +-- !query output false --- !query 221 +-- !query SELECT '1996-09-09 12:12:12.5' <= timestamp('1996-09-09 12:12:12.4') FROM t --- !query 221 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) <= CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 221 output +-- !query output false --- !query 222 +-- !query SELECT '1996-09-09 12:12:12.5' <> timestamp('1996-09-09 12:12:12.4') FROM t --- !query 222 schema +-- !query schema struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> --- !query 222 output +-- !query output true --- !query 223 +-- !query SELECT cast(null as string) = timestamp('1996-09-09 12:12:12.4') FROM t --- !query 223 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 223 output +-- !query output NULL --- !query 224 +-- !query SELECT cast(null as string) > timestamp('1996-09-09 12:12:12.4') FROM t --- !query 224 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) > CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 224 output +-- !query output NULL --- !query 225 +-- !query SELECT cast(null as string) >= timestamp('1996-09-09 12:12:12.4') FROM t --- !query 225 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) >= 
CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 225 output +-- !query output NULL --- !query 226 +-- !query SELECT cast(null as string) < timestamp('1996-09-09 12:12:12.4') FROM t --- !query 226 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) < CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 226 output +-- !query output NULL --- !query 227 +-- !query SELECT cast(null as string) <= timestamp('1996-09-09 12:12:12.4') FROM t --- !query 227 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) <= CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 227 output +-- !query output NULL --- !query 228 +-- !query SELECT cast(null as string) <> timestamp('1996-09-09 12:12:12.4') FROM t --- !query 228 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> --- !query 228 output +-- !query output NULL --- !query 229 +-- !query SELECT timestamp('1996-09-09 12:12:12.4' )= '1996-09-09 12:12:12.4' FROM t --- !query 229 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 229 output +-- !query output true --- !query 230 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )> '1996-09-09 12:12:12.4' FROM t --- !query 230 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) > CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 230 output +-- !query output true --- !query 231 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )>= '1996-09-09 12:12:12.4' FROM t --- !query 231 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) >= CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 231 output +-- !query output true --- !query 232 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )< '1996-09-09 12:12:12.4' FROM t --- !query 232 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 
AS TIMESTAMP) < CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 232 output +-- !query output false --- !query 233 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )<= '1996-09-09 12:12:12.4' FROM t --- !query 233 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) <= CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> --- !query 233 output +-- !query output false --- !query 234 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )<> '1996-09-09 12:12:12.4' FROM t --- !query 234 schema +-- !query schema struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> --- !query 234 output +-- !query output true --- !query 235 +-- !query SELECT timestamp('1996-09-09 12:12:12.4' )= cast(null as string) FROM t --- !query 235 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> --- !query 235 output +-- !query output NULL --- !query 236 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )> cast(null as string) FROM t --- !query 236 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) > CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> --- !query 236 output +-- !query output NULL --- !query 237 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )>= cast(null as string) FROM t --- !query 237 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) >= CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> --- !query 237 output +-- !query output NULL --- !query 238 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )< cast(null as string) FROM t --- !query 238 schema +-- !query schema struct<(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) < CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> --- !query 238 output +-- !query output NULL --- !query 239 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )<= cast(null as string) FROM t --- !query 239 schema +-- !query schema struct<(CAST(1996-09-09 
12:12:12.5 AS TIMESTAMP) <= CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> --- !query 239 output +-- !query output NULL --- !query 240 +-- !query SELECT timestamp('1996-09-09 12:12:12.5' )<> cast(null as string) FROM t --- !query 240 schema +-- !query schema struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(CAST(NULL AS STRING) AS TIMESTAMP))):boolean> --- !query 240 output +-- !query output NULL --- !query 241 +-- !query SELECT ' ' = X'0020' FROM t --- !query 241 schema +-- !query schema struct<(CAST( AS BINARY) = X'0020'):boolean> --- !query 241 output +-- !query output false --- !query 242 +-- !query SELECT ' ' > X'001F' FROM t --- !query 242 schema +-- !query schema struct<(CAST( AS BINARY) > X'001F'):boolean> --- !query 242 output +-- !query output true --- !query 243 +-- !query SELECT ' ' >= X'001F' FROM t --- !query 243 schema +-- !query schema struct<(CAST( AS BINARY) >= X'001F'):boolean> --- !query 243 output +-- !query output true --- !query 244 +-- !query SELECT ' ' < X'001F' FROM t --- !query 244 schema +-- !query schema struct<(CAST( AS BINARY) < X'001F'):boolean> --- !query 244 output +-- !query output false --- !query 245 +-- !query SELECT ' ' <= X'001F' FROM t --- !query 245 schema +-- !query schema struct<(CAST( AS BINARY) <= X'001F'):boolean> --- !query 245 output +-- !query output false --- !query 246 +-- !query SELECT ' ' <> X'001F' FROM t --- !query 246 schema +-- !query schema struct<(NOT (CAST( AS BINARY) = X'001F')):boolean> --- !query 246 output +-- !query output true --- !query 247 +-- !query SELECT cast(null as string) = X'0020' FROM t --- !query 247 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) = X'0020'):boolean> --- !query 247 output +-- !query output NULL --- !query 248 +-- !query SELECT cast(null as string) > X'001F' FROM t --- !query 248 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) > X'001F'):boolean> --- !query 248 output +-- !query output NULL --- !query 249 +-- !query 
SELECT cast(null as string) >= X'001F' FROM t --- !query 249 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) >= X'001F'):boolean> --- !query 249 output +-- !query output NULL --- !query 250 +-- !query SELECT cast(null as string) < X'001F' FROM t --- !query 250 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) < X'001F'):boolean> --- !query 250 output +-- !query output NULL --- !query 251 +-- !query SELECT cast(null as string) <= X'001F' FROM t --- !query 251 schema +-- !query schema struct<(CAST(CAST(NULL AS STRING) AS BINARY) <= X'001F'):boolean> --- !query 251 output +-- !query output NULL --- !query 252 +-- !query SELECT cast(null as string) <> X'001F' FROM t --- !query 252 schema +-- !query schema struct<(NOT (CAST(CAST(NULL AS STRING) AS BINARY) = X'001F')):boolean> --- !query 252 output +-- !query output NULL --- !query 253 +-- !query SELECT X'0020' = ' ' FROM t --- !query 253 schema +-- !query schema struct<(X'0020' = CAST( AS BINARY)):boolean> --- !query 253 output +-- !query output false --- !query 254 +-- !query SELECT X'001F' > ' ' FROM t --- !query 254 schema +-- !query schema struct<(X'001F' > CAST( AS BINARY)):boolean> --- !query 254 output +-- !query output false --- !query 255 +-- !query SELECT X'001F' >= ' ' FROM t --- !query 255 schema +-- !query schema struct<(X'001F' >= CAST( AS BINARY)):boolean> --- !query 255 output +-- !query output false --- !query 256 +-- !query SELECT X'001F' < ' ' FROM t --- !query 256 schema +-- !query schema struct<(X'001F' < CAST( AS BINARY)):boolean> --- !query 256 output +-- !query output true --- !query 257 +-- !query SELECT X'001F' <= ' ' FROM t --- !query 257 schema +-- !query schema struct<(X'001F' <= CAST( AS BINARY)):boolean> --- !query 257 output +-- !query output true --- !query 258 +-- !query SELECT X'001F' <> ' ' FROM t --- !query 258 schema +-- !query schema struct<(NOT (X'001F' = CAST( AS BINARY))):boolean> --- !query 258 output +-- !query output true --- !query 
259 +-- !query SELECT X'0020' = cast(null as string) FROM t --- !query 259 schema +-- !query schema struct<(X'0020' = CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 259 output +-- !query output NULL --- !query 260 +-- !query SELECT X'001F' > cast(null as string) FROM t --- !query 260 schema +-- !query schema struct<(X'001F' > CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 260 output +-- !query output NULL --- !query 261 +-- !query SELECT X'001F' >= cast(null as string) FROM t --- !query 261 schema +-- !query schema struct<(X'001F' >= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 261 output +-- !query output NULL --- !query 262 +-- !query SELECT X'001F' < cast(null as string) FROM t --- !query 262 schema +-- !query schema struct<(X'001F' < CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 262 output +-- !query output NULL --- !query 263 +-- !query SELECT X'001F' <= cast(null as string) FROM t --- !query 263 schema +-- !query schema struct<(X'001F' <= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> --- !query 263 output +-- !query output NULL --- !query 264 +-- !query SELECT X'001F' <> cast(null as string) FROM t --- !query 264 schema +-- !query schema struct<(NOT (X'001F' = CAST(CAST(NULL AS STRING) AS BINARY))):boolean> --- !query 264 output +-- !query output NULL diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out index 46775d79ff4a2..dc068e70d66db 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out @@ -2,801 +2,801 @@ -- Number of queries: 97 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT true = cast(1 as 
tinyint) FROM t --- !query 1 schema +-- !query schema struct<(CAST(true AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 1 output +-- !query output true --- !query 2 +-- !query SELECT true = cast(1 as smallint) FROM t --- !query 2 schema +-- !query schema struct<(CAST(true AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 2 output +-- !query output true --- !query 3 +-- !query SELECT true = cast(1 as int) FROM t --- !query 3 schema +-- !query schema struct<(CAST(true AS INT) = CAST(1 AS INT)):boolean> --- !query 3 output +-- !query output true --- !query 4 +-- !query SELECT true = cast(1 as bigint) FROM t --- !query 4 schema +-- !query schema struct<(CAST(true AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 4 output +-- !query output true --- !query 5 +-- !query SELECT true = cast(1 as float) FROM t --- !query 5 schema +-- !query schema struct<(CAST(true AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 5 output +-- !query output true --- !query 6 +-- !query SELECT true = cast(1 as double) FROM t --- !query 6 schema +-- !query schema struct<(CAST(true AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 6 output +-- !query output true --- !query 7 +-- !query SELECT true = cast(1 as decimal(10, 0)) FROM t --- !query 7 schema +-- !query schema struct<(CAST(true AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> --- !query 7 output +-- !query output true --- !query 8 +-- !query SELECT true = cast(1 as string) FROM t --- !query 8 schema +-- !query schema struct<(true = CAST(CAST(1 AS STRING) AS BOOLEAN)):boolean> --- !query 8 output +-- !query output true --- !query 9 +-- !query SELECT true = cast('1' as binary) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(true = CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 --- !query 10 +-- !query SELECT true = cast(1 as 
boolean) FROM t --- !query 10 schema +-- !query schema struct<(true = CAST(1 AS BOOLEAN)):boolean> --- !query 10 output +-- !query output true --- !query 11 +-- !query SELECT true = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(true = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 --- !query 12 +-- !query SELECT true = cast('2017-12-11 09:30:00' as date) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(true = CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 --- !query 13 +-- !query SELECT true <=> cast(1 as tinyint) FROM t --- !query 13 schema +-- !query schema struct<(CAST(true AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> --- !query 13 output +-- !query output true --- !query 14 +-- !query SELECT true <=> cast(1 as smallint) FROM t --- !query 14 schema +-- !query schema struct<(CAST(true AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> --- !query 14 output +-- !query output true --- !query 15 +-- !query SELECT true <=> cast(1 as int) FROM t --- !query 15 schema +-- !query schema struct<(CAST(true AS INT) <=> CAST(1 AS INT)):boolean> --- !query 15 output +-- !query output true --- !query 16 +-- !query SELECT true <=> cast(1 as bigint) FROM t --- !query 16 schema +-- !query schema struct<(CAST(true AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> --- !query 16 output +-- !query output true --- !query 17 +-- !query SELECT true <=> cast(1 as float) FROM t --- !query 17 schema +-- !query schema struct<(CAST(true AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> --- !query 17 output +-- !query 
output true --- !query 18 +-- !query SELECT true <=> cast(1 as double) FROM t --- !query 18 schema +-- !query schema struct<(CAST(true AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 18 output +-- !query output true --- !query 19 +-- !query SELECT true <=> cast(1 as decimal(10, 0)) FROM t --- !query 19 schema +-- !query schema struct<(CAST(true AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> --- !query 19 output +-- !query output true --- !query 20 +-- !query SELECT true <=> cast(1 as string) FROM t --- !query 20 schema +-- !query schema struct<(true <=> CAST(CAST(1 AS STRING) AS BOOLEAN)):boolean> --- !query 20 output +-- !query output true --- !query 21 +-- !query SELECT true <=> cast('1' as binary) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(true <=> CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 --- !query 22 +-- !query SELECT true <=> cast(1 as boolean) FROM t --- !query 22 schema +-- !query schema struct<(true <=> CAST(1 AS BOOLEAN)):boolean> --- !query 22 output +-- !query output true --- !query 23 +-- !query SELECT true <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(true <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 --- !query 24 +-- !query SELECT true <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(true <=> CAST('2017-12-11 
09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 --- !query 25 +-- !query SELECT cast(1 as tinyint) = true FROM t --- !query 25 schema +-- !query schema struct<(CAST(1 AS TINYINT) = CAST(true AS TINYINT)):boolean> --- !query 25 output +-- !query output true --- !query 26 +-- !query SELECT cast(1 as smallint) = true FROM t --- !query 26 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(true AS SMALLINT)):boolean> --- !query 26 output +-- !query output true --- !query 27 +-- !query SELECT cast(1 as int) = true FROM t --- !query 27 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(true AS INT)):boolean> --- !query 27 output +-- !query output true --- !query 28 +-- !query SELECT cast(1 as bigint) = true FROM t --- !query 28 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(true AS BIGINT)):boolean> --- !query 28 output +-- !query output true --- !query 29 +-- !query SELECT cast(1 as float) = true FROM t --- !query 29 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(true AS FLOAT)):boolean> --- !query 29 output +-- !query output true --- !query 30 +-- !query SELECT cast(1 as double) = true FROM t --- !query 30 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(true AS DOUBLE)):boolean> --- !query 30 output +-- !query output true --- !query 31 +-- !query SELECT cast(1 as decimal(10, 0)) = true FROM t --- !query 31 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) = CAST(true AS DECIMAL(10,0))):boolean> --- !query 31 output +-- !query output true --- !query 32 +-- !query SELECT cast(1 as string) = true FROM t --- !query 32 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS BOOLEAN) = true):boolean> --- !query 32 output +-- !query output true --- !query 33 +-- !query SELECT cast('1' as binary) = true FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = true)' due to data type mismatch: differing 
types in '(CAST('1' AS BINARY) = true)' (binary and boolean).; line 1 pos 7 --- !query 34 +-- !query SELECT cast(1 as boolean) = true FROM t --- !query 34 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) = true):boolean> --- !query 34 output +-- !query output true --- !query 35 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = true FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = true)' (timestamp and boolean).; line 1 pos 7 --- !query 36 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = true FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = true)' (date and boolean).; line 1 pos 7 --- !query 37 +-- !query SELECT cast(1 as tinyint) <=> true FROM t --- !query 37 schema +-- !query schema struct<(CAST(1 AS TINYINT) <=> CAST(true AS TINYINT)):boolean> --- !query 37 output +-- !query output true --- !query 38 +-- !query SELECT cast(1 as smallint) <=> true FROM t --- !query 38 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <=> CAST(true AS SMALLINT)):boolean> --- !query 38 output +-- !query output true --- !query 39 +-- !query SELECT cast(1 as int) <=> true FROM t --- !query 39 schema +-- !query schema struct<(CAST(1 AS INT) <=> CAST(true AS INT)):boolean> --- !query 39 output +-- !query output true --- !query 40 +-- !query SELECT cast(1 as bigint) <=> true FROM t --- !query 40 schema +-- !query schema struct<(CAST(1 AS BIGINT) <=> CAST(true AS BIGINT)):boolean> --- !query 40 output +-- !query output true --- !query 41 +-- !query SELECT cast(1 as float) <=> true FROM t --- 
!query 41 schema +-- !query schema struct<(CAST(1 AS FLOAT) <=> CAST(true AS FLOAT)):boolean> --- !query 41 output +-- !query output true --- !query 42 +-- !query SELECT cast(1 as double) <=> true FROM t --- !query 42 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(true AS DOUBLE)):boolean> --- !query 42 output +-- !query output true --- !query 43 +-- !query SELECT cast(1 as decimal(10, 0)) <=> true FROM t --- !query 43 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(true AS DECIMAL(10,0))):boolean> --- !query 43 output +-- !query output true --- !query 44 +-- !query SELECT cast(1 as string) <=> true FROM t --- !query 44 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS BOOLEAN) <=> true):boolean> --- !query 44 output +-- !query output true --- !query 45 +-- !query SELECT cast('1' as binary) <=> true FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <=> true)' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> true)' (binary and boolean).; line 1 pos 7 --- !query 46 +-- !query SELECT cast(1 as boolean) <=> true FROM t --- !query 46 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) <=> true):boolean> --- !query 46 output +-- !query output true --- !query 47 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> true FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> true)' (timestamp and boolean).; line 1 pos 7 --- !query 48 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> true FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'(CAST('2017-12-11 09:30:00' AS DATE) <=> true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> true)' (date and boolean).; line 1 pos 7 --- !query 49 +-- !query SELECT false = cast(0 as tinyint) FROM t --- !query 49 schema +-- !query schema struct<(CAST(false AS TINYINT) = CAST(0 AS TINYINT)):boolean> --- !query 49 output +-- !query output true --- !query 50 +-- !query SELECT false = cast(0 as smallint) FROM t --- !query 50 schema +-- !query schema struct<(CAST(false AS SMALLINT) = CAST(0 AS SMALLINT)):boolean> --- !query 50 output +-- !query output true --- !query 51 +-- !query SELECT false = cast(0 as int) FROM t --- !query 51 schema +-- !query schema struct<(CAST(false AS INT) = CAST(0 AS INT)):boolean> --- !query 51 output +-- !query output true --- !query 52 +-- !query SELECT false = cast(0 as bigint) FROM t --- !query 52 schema +-- !query schema struct<(CAST(false AS BIGINT) = CAST(0 AS BIGINT)):boolean> --- !query 52 output +-- !query output true --- !query 53 +-- !query SELECT false = cast(0 as float) FROM t --- !query 53 schema +-- !query schema struct<(CAST(false AS FLOAT) = CAST(0 AS FLOAT)):boolean> --- !query 53 output +-- !query output true --- !query 54 +-- !query SELECT false = cast(0 as double) FROM t --- !query 54 schema +-- !query schema struct<(CAST(false AS DOUBLE) = CAST(0 AS DOUBLE)):boolean> --- !query 54 output +-- !query output true --- !query 55 +-- !query SELECT false = cast(0 as decimal(10, 0)) FROM t --- !query 55 schema +-- !query schema struct<(CAST(false AS DECIMAL(10,0)) = CAST(0 AS DECIMAL(10,0))):boolean> --- !query 55 output +-- !query output true --- !query 56 +-- !query SELECT false = cast(0 as string) FROM t --- !query 56 schema +-- !query schema struct<(false = CAST(CAST(0 AS STRING) AS BOOLEAN)):boolean> --- !query 56 output +-- !query output true --- !query 57 +-- !query SELECT false = cast('0' as binary) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output 
+-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false = CAST('0' AS BINARY))' due to data type mismatch: differing types in '(false = CAST('0' AS BINARY))' (boolean and binary).; line 1 pos 7 --- !query 58 +-- !query SELECT false = cast(0 as boolean) FROM t --- !query 58 schema +-- !query schema struct<(false = CAST(0 AS BOOLEAN)):boolean> --- !query 58 output +-- !query output true --- !query 59 +-- !query SELECT false = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(false = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 --- !query 60 +-- !query SELECT false = cast('2017-12-11 09:30:00' as date) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(false = CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 --- !query 61 +-- !query SELECT false <=> cast(0 as tinyint) FROM t --- !query 61 schema +-- !query schema struct<(CAST(false AS TINYINT) <=> CAST(0 AS TINYINT)):boolean> --- !query 61 output +-- !query output true --- !query 62 +-- !query SELECT false <=> cast(0 as smallint) FROM t --- !query 62 schema +-- !query schema struct<(CAST(false AS SMALLINT) <=> CAST(0 AS SMALLINT)):boolean> --- !query 62 output +-- !query output true --- !query 63 +-- !query SELECT false <=> cast(0 as int) FROM t --- !query 63 schema +-- !query schema struct<(CAST(false AS INT) <=> CAST(0 AS INT)):boolean> --- !query 63 output +-- !query output true --- !query 64 +-- !query SELECT false <=> cast(0 as bigint) FROM t --- !query 64 schema +-- !query schema struct<(CAST(false 
AS BIGINT) <=> CAST(0 AS BIGINT)):boolean> --- !query 64 output +-- !query output true --- !query 65 +-- !query SELECT false <=> cast(0 as float) FROM t --- !query 65 schema +-- !query schema struct<(CAST(false AS FLOAT) <=> CAST(0 AS FLOAT)):boolean> --- !query 65 output +-- !query output true --- !query 66 +-- !query SELECT false <=> cast(0 as double) FROM t --- !query 66 schema +-- !query schema struct<(CAST(false AS DOUBLE) <=> CAST(0 AS DOUBLE)):boolean> --- !query 66 output +-- !query output true --- !query 67 +-- !query SELECT false <=> cast(0 as decimal(10, 0)) FROM t --- !query 67 schema +-- !query schema struct<(CAST(false AS DECIMAL(10,0)) <=> CAST(0 AS DECIMAL(10,0))):boolean> --- !query 67 output +-- !query output true --- !query 68 +-- !query SELECT false <=> cast(0 as string) FROM t --- !query 68 schema +-- !query schema struct<(false <=> CAST(CAST(0 AS STRING) AS BOOLEAN)):boolean> --- !query 68 output +-- !query output true --- !query 69 +-- !query SELECT false <=> cast('0' as binary) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false <=> CAST('0' AS BINARY))' due to data type mismatch: differing types in '(false <=> CAST('0' AS BINARY))' (boolean and binary).; line 1 pos 7 --- !query 70 +-- !query SELECT false <=> cast(0 as boolean) FROM t --- !query 70 schema +-- !query schema struct<(false <=> CAST(0 AS BOOLEAN)):boolean> --- !query 70 output +-- !query output true --- !query 71 +-- !query SELECT false <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(false <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 --- !query 72 +-- !query SELECT false <=> 
cast('2017-12-11 09:30:00' as date) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(false <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(false <=> CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 --- !query 73 +-- !query SELECT cast(0 as tinyint) = false FROM t --- !query 73 schema +-- !query schema struct<(CAST(0 AS TINYINT) = CAST(false AS TINYINT)):boolean> --- !query 73 output +-- !query output true --- !query 74 +-- !query SELECT cast(0 as smallint) = false FROM t --- !query 74 schema +-- !query schema struct<(CAST(0 AS SMALLINT) = CAST(false AS SMALLINT)):boolean> --- !query 74 output +-- !query output true --- !query 75 +-- !query SELECT cast(0 as int) = false FROM t --- !query 75 schema +-- !query schema struct<(CAST(0 AS INT) = CAST(false AS INT)):boolean> --- !query 75 output +-- !query output true --- !query 76 +-- !query SELECT cast(0 as bigint) = false FROM t --- !query 76 schema +-- !query schema struct<(CAST(0 AS BIGINT) = CAST(false AS BIGINT)):boolean> --- !query 76 output +-- !query output true --- !query 77 +-- !query SELECT cast(0 as float) = false FROM t --- !query 77 schema +-- !query schema struct<(CAST(0 AS FLOAT) = CAST(false AS FLOAT)):boolean> --- !query 77 output +-- !query output true --- !query 78 +-- !query SELECT cast(0 as double) = false FROM t --- !query 78 schema +-- !query schema struct<(CAST(0 AS DOUBLE) = CAST(false AS DOUBLE)):boolean> --- !query 78 output +-- !query output true --- !query 79 +-- !query SELECT cast(0 as decimal(10, 0)) = false FROM t --- !query 79 schema +-- !query schema struct<(CAST(0 AS DECIMAL(10,0)) = CAST(false AS DECIMAL(10,0))):boolean> --- !query 79 output +-- !query output true --- !query 80 +-- !query SELECT cast(0 as string) = false FROM t --- !query 80 schema +-- !query schema struct<(CAST(CAST(0 AS STRING) AS BOOLEAN) = 
false):boolean> --- !query 80 output +-- !query output true --- !query 81 +-- !query SELECT cast('0' as binary) = false FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('0' AS BINARY) = false)' due to data type mismatch: differing types in '(CAST('0' AS BINARY) = false)' (binary and boolean).; line 1 pos 7 --- !query 82 +-- !query SELECT cast(0 as boolean) = false FROM t --- !query 82 schema +-- !query schema struct<(CAST(0 AS BOOLEAN) = false):boolean> --- !query 82 output +-- !query output true --- !query 83 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = false FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = false)' (timestamp and boolean).; line 1 pos 7 --- !query 84 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = false FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = false)' (date and boolean).; line 1 pos 7 --- !query 85 +-- !query SELECT cast(0 as tinyint) <=> false FROM t --- !query 85 schema +-- !query schema struct<(CAST(0 AS TINYINT) <=> CAST(false AS TINYINT)):boolean> --- !query 85 output +-- !query output true --- !query 86 +-- !query SELECT cast(0 as smallint) <=> false FROM t --- !query 86 schema +-- !query schema struct<(CAST(0 AS SMALLINT) <=> CAST(false AS SMALLINT)):boolean> --- !query 86 output +-- !query output true --- !query 87 +-- !query SELECT cast(0 as int) <=> false FROM t --- !query 87 schema +-- !query schema struct<(CAST(0 AS INT) <=> 
CAST(false AS INT)):boolean> --- !query 87 output +-- !query output true --- !query 88 +-- !query SELECT cast(0 as bigint) <=> false FROM t --- !query 88 schema +-- !query schema struct<(CAST(0 AS BIGINT) <=> CAST(false AS BIGINT)):boolean> --- !query 88 output +-- !query output true --- !query 89 +-- !query SELECT cast(0 as float) <=> false FROM t --- !query 89 schema +-- !query schema struct<(CAST(0 AS FLOAT) <=> CAST(false AS FLOAT)):boolean> --- !query 89 output +-- !query output true --- !query 90 +-- !query SELECT cast(0 as double) <=> false FROM t --- !query 90 schema +-- !query schema struct<(CAST(0 AS DOUBLE) <=> CAST(false AS DOUBLE)):boolean> --- !query 90 output +-- !query output true --- !query 91 +-- !query SELECT cast(0 as decimal(10, 0)) <=> false FROM t --- !query 91 schema +-- !query schema struct<(CAST(0 AS DECIMAL(10,0)) <=> CAST(false AS DECIMAL(10,0))):boolean> --- !query 91 output +-- !query output true --- !query 92 +-- !query SELECT cast(0 as string) <=> false FROM t --- !query 92 schema +-- !query schema struct<(CAST(CAST(0 AS STRING) AS BOOLEAN) <=> false):boolean> --- !query 92 output +-- !query output true --- !query 93 +-- !query SELECT cast('0' as binary) <=> false FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('0' AS BINARY) <=> false)' due to data type mismatch: differing types in '(CAST('0' AS BINARY) <=> false)' (binary and boolean).; line 1 pos 7 --- !query 94 +-- !query SELECT cast(0 as boolean) <=> false FROM t --- !query 94 schema +-- !query schema struct<(CAST(0 AS BOOLEAN) <=> false):boolean> --- !query 94 output +-- !query output true --- !query 95 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> false FROM t --- !query 95 schema +-- !query schema struct<> --- !query 95 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> 
false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> false)' (timestamp and boolean).; line 1 pos 7 --- !query 96 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> false FROM t --- !query 96 schema +-- !query schema struct<> --- !query 96 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> false)' (date and boolean).; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out index 1e1cbc3304141..18d97c2f1b42a 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out @@ -2,1231 +2,1231 @@ -- Number of queries: 145 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as tinyint) END FROM t --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 --- !query 2 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as smallint) END FROM t --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 --- !query 3 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as int) END FROM t --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as bigint) END FROM t --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 --- !query 5 +-- !query SELECT CASE WHEN true THEN cast(1 as 
tinyint) ELSE cast(2 as float) END FROM t --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1.0 --- !query 6 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as double) END FROM t --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1.0 --- !query 7 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 --- !query 8 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as string) END FROM t --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 --- !query 9 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2' as binary) END FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN tinyint ELSE binary END; line 1 pos 7 --- !query 10 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as boolean) END FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN tinyint ELSE boolean END; line 1 pos 7 --- !query 11 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN tinyint ELSE timestamp END; line 1 pos 7 --- !query 12 +-- !query SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN tinyint ELSE date END; line 1 pos 7 --- !query 13 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as tinyint) END FROM t --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 --- !query 14 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as smallint) END FROM t --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 --- !query 15 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as int) END FROM t --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 --- !query 16 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as bigint) END FROM t --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1 --- !query 17 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as float) END FROM t --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1.0 --- !query 18 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as double) END FROM t --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1.0 --- !query 19 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 1 --- !query 20 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as string) END FROM t --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1 --- !query 21 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2' as binary) END FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or 
coercible to a common type, got CASE WHEN ... THEN smallint ELSE binary END; line 1 pos 7 --- !query 22 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as boolean) END FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN smallint ELSE boolean END; line 1 pos 7 --- !query 23 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN smallint ELSE timestamp END; line 1 pos 7 --- !query 24 +-- !query SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN smallint ELSE date END; line 1 pos 7 --- !query 25 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as tinyint) END FROM t --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 --- !query 26 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as smallint) END FROM t --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1 --- !query 27 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as int) END FROM t --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 1 --- !query 28 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as bigint) END FROM t --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 1 --- !query 29 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as float) END FROM t --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1.0 --- !query 30 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as double) END FROM t --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1.0 --- !query 31 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 1 --- !query 32 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as string) END FROM t --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 1 --- !query 33 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2' as binary) END FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN int ELSE binary END; line 1 pos 7 --- !query 34 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as boolean) END FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN int ELSE boolean END; line 1 pos 7 --- !query 35 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN int ELSE timestamp END; line 1 pos 7 --- !query 36 +-- !query SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN int ELSE date END; line 1 pos 7 --- !query 37 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as tinyint) END FROM t --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 1 --- !query 38 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as smallint) END FROM t --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 1 --- !query 39 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as int) END FROM t --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 1 --- !query 40 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as bigint) END FROM t --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 1 --- !query 41 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as float) END FROM t --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 1.0 --- !query 42 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as double) END FROM t --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output 1.0 --- !query 43 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 1 --- !query 44 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as string) END FROM t --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 1 --- !query 45 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2' as binary) END FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, 
got CASE WHEN ... THEN bigint ELSE binary END; line 1 pos 7 --- !query 46 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as boolean) END FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN bigint ELSE boolean END; line 1 pos 7 --- !query 47 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN bigint ELSE timestamp END; line 1 pos 7 --- !query 48 +-- !query SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN bigint ELSE date END; line 1 pos 7 --- !query 49 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as tinyint) END FROM t --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output 1.0 --- !query 50 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as smallint) END FROM t --- !query 50 schema +-- !query schema struct --- !query 50 output +-- !query output 1.0 --- !query 51 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as int) END FROM t --- !query 51 schema +-- !query schema struct --- !query 51 output +-- !query output 1.0 --- !query 52 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as bigint) END FROM t --- !query 52 schema +-- !query schema struct --- !query 52 output +-- !query output 1.0 --- !query 53 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as float) END FROM t --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 1.0 --- !query 54 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as double) END FROM t --- !query 54 schema +-- !query schema struct --- !query 54 output +-- !query output 1.0 --- !query 55 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 55 schema +-- !query schema struct --- !query 55 output +-- !query output 1.0 --- !query 56 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as string) END FROM t --- !query 56 schema +-- !query schema struct --- !query 56 output +-- !query output 1.0 --- !query 57 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2' as binary) END FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common 
type, got CASE WHEN ... THEN float ELSE binary END; line 1 pos 7 --- !query 58 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as boolean) END FROM t --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN float ELSE boolean END; line 1 pos 7 --- !query 59 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN float ELSE timestamp END; line 1 pos 7 --- !query 60 +-- !query SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN float ELSE date END; line 1 pos 7 --- !query 61 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as tinyint) END FROM t --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output 1.0 --- !query 62 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as smallint) END FROM t --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output 1.0 --- !query 63 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as int) END FROM t --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output 1.0 --- !query 64 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as bigint) END FROM t --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output 1.0 --- !query 65 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as float) END FROM t --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output 1.0 --- !query 66 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as double) END FROM t --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output 1.0 --- !query 67 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output 1.0 --- !query 68 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as string) END FROM t --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output 1.0 --- !query 69 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2' as binary) END FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to 
a common type, got CASE WHEN ... THEN double ELSE binary END; line 1 pos 7 --- !query 70 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as boolean) END FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN double ELSE boolean END; line 1 pos 7 --- !query 71 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN double ELSE timestamp END; line 1 pos 7 --- !query 72 +-- !query SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN double ELSE date END; line 1 pos 7 --- !query 73 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as tinyint) END FROM t --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output 1 --- !query 74 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as smallint) END FROM t --- !query 74 schema +-- !query schema struct --- !query 74 output +-- !query output 1 --- !query 75 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as int) END FROM t --- !query 75 schema +-- !query schema struct --- !query 75 output +-- !query output 1 --- !query 76 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as bigint) END FROM t --- !query 76 schema +-- !query schema struct --- !query 76 output +-- !query output 1 --- !query 77 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as float) END FROM t --- !query 77 schema +-- !query schema struct --- !query 77 output +-- !query output 1.0 --- !query 78 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as double) END FROM t --- !query 78 schema +-- !query schema struct --- !query 78 output +-- !query output 1.0 --- !query 79 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 79 schema +-- !query schema struct --- !query 79 output +-- !query output 1 --- !query 80 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as string) END FROM t --- !query 80 schema +-- !query schema struct --- !query 80 output +-- !query output 1 --- !query 81 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2' as binary) END FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2' AS BINARY) END' due to data type 
mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN decimal(10,0) ELSE binary END; line 1 pos 7 --- !query 82 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as boolean) END FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN decimal(10,0) ELSE boolean END; line 1 pos 7 --- !query 83 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN decimal(10,0) ELSE timestamp END; line 1 pos 7 --- !query 84 +-- !query SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN decimal(10,0) ELSE date END; line 1 pos 7 --- !query 85 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as tinyint) END FROM t --- !query 85 schema +-- !query schema struct --- !query 85 output +-- !query output 1 --- !query 86 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as smallint) END FROM t --- !query 86 schema +-- !query schema struct --- !query 86 output +-- !query output 1 --- !query 87 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as int) END FROM t --- !query 87 schema +-- !query schema struct --- !query 87 output +-- !query output 1 --- !query 88 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as bigint) END FROM t --- !query 88 schema +-- !query schema struct --- !query 88 output +-- !query output 1 --- !query 89 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as float) END FROM t --- !query 89 schema +-- !query schema struct --- !query 89 output +-- !query output 1 --- !query 90 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as double) END FROM t --- !query 90 schema +-- !query schema struct --- !query 90 output +-- !query output 1 --- !query 91 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 91 schema +-- !query schema struct --- !query 91 output +-- !query output 1 --- !query 92 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as string) END FROM t --- !query 92 schema +-- !query schema struct --- !query 92 output +-- !query output 1 --- !query 93 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2' as binary) END FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS STRING) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common 
type, got CASE WHEN ... THEN string ELSE binary END; line 1 pos 7 --- !query 94 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as boolean) END FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS STRING) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN string ELSE boolean END; line 1 pos 7 --- !query 95 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 95 schema +-- !query schema struct --- !query 95 output +-- !query output 1 --- !query 96 +-- !query SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 96 schema +-- !query schema struct --- !query 96 output +-- !query output 1 --- !query 97 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as tinyint) END FROM t --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE tinyint END; line 1 pos 7 --- !query 98 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as smallint) END FROM t --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN binary ELSE smallint END; line 1 pos 7 --- !query 99 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as int) END FROM t --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE int END; line 1 pos 7 --- !query 100 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as bigint) END FROM t --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE bigint END; line 1 pos 7 --- !query 101 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as float) END FROM t --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE float END; line 1 pos 7 --- !query 102 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as double) END FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN binary ELSE double END; line 1 pos 7 --- !query 103 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE decimal(10,0) END; line 1 pos 7 --- !query 104 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as string) END FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS STRING) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE string END; line 1 pos 7 --- !query 105 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2' as binary) END FROM t --- !query 105 schema +-- !query schema struct --- !query 105 output +-- !query output 1 --- !query 106 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as boolean) END FROM t --- !query 106 schema +-- !query schema struct<> --- !query 106 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN binary ELSE boolean END; line 1 pos 7 --- !query 107 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 107 schema +-- !query schema struct<> --- !query 107 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE timestamp END; line 1 pos 7 --- !query 108 +-- !query SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN binary ELSE date END; line 1 pos 7 --- !query 109 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as tinyint) END FROM t --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN boolean ELSE tinyint END; line 1 pos 7 --- !query 110 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as smallint) END FROM t --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE smallint END; line 1 pos 7 --- !query 111 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as int) END FROM t --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE int END; line 1 pos 7 --- !query 112 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as bigint) END FROM t --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE bigint END; line 1 pos 7 --- !query 113 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as float) END FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN boolean ELSE float END; line 1 pos 7 --- !query 114 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as double) END FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE double END; line 1 pos 7 --- !query 115 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE decimal(10,0) END; line 1 pos 7 --- !query 116 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as string) END FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS STRING) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE string END; line 1 pos 7 --- !query 117 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2' as binary) END FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN boolean ELSE binary END; line 1 pos 7 --- !query 118 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as boolean) END FROM t --- !query 118 schema +-- !query schema struct --- !query 118 output +-- !query output true --- !query 119 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE timestamp END; line 1 pos 7 --- !query 120 +-- !query SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN boolean ELSE date END; line 1 pos 7 --- !query 121 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as tinyint) END FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN timestamp ELSE tinyint END; line 1 pos 7 --- !query 122 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as smallint) END FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN timestamp ELSE smallint END; line 1 pos 7 --- !query 123 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as int) END FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN timestamp ELSE int END; line 1 pos 7 --- !query 124 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as bigint) END FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN timestamp ELSE bigint END; line 1 pos 7 --- !query 125 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as float) END FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN timestamp ELSE float END; line 1 pos 7 --- !query 126 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as double) END FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN timestamp ELSE double END; line 1 pos 7 --- !query 127 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN timestamp ELSE decimal(10,0) END; line 1 pos 7 --- !query 128 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as string) END FROM t --- !query 128 schema +-- !query schema struct --- !query 128 output +-- !query output 2017-12-12 09:30:00 --- !query 129 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2' as binary) END FROM t --- !query 129 schema +-- !query schema struct<> --- !query 129 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN timestamp ELSE binary END; line 1 pos 7 --- !query 130 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as boolean) END FROM t --- !query 130 schema +-- !query schema struct<> --- !query 130 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN timestamp ELSE boolean END; line 1 pos 7 --- !query 131 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 131 schema +-- !query schema struct --- !query 131 output +-- !query output 2017-12-12 09:30:00 --- !query 132 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 132 schema +-- !query schema struct --- !query 132 output +-- !query output 2017-12-12 09:30:00 --- !query 133 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as tinyint) END FROM t --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE tinyint END; line 1 pos 7 --- !query 134 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as smallint) END FROM t --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN date ELSE smallint END; line 1 pos 7 --- !query 135 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as int) END FROM t --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE int END; line 1 pos 7 --- !query 136 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as bigint) END FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE bigint END; line 1 pos 7 --- !query 137 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as float) END FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN date ELSE float END; line 1 pos 7 --- !query 138 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as double) END FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE double END; line 1 pos 7 --- !query 139 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as decimal(10, 0)) END FROM t --- !query 139 schema +-- !query schema struct<> --- !query 139 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE decimal(10,0) END; line 1 pos 7 --- !query 140 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as string) END FROM t --- !query 140 schema +-- !query schema struct --- !query 140 output +-- !query output 2017-12-12 --- !query 141 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2' as binary) END FROM t --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... 
THEN date ELSE binary END; line 1 pos 7 --- !query 142 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as boolean) END FROM t --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type, got CASE WHEN ... THEN date ELSE boolean END; line 1 pos 7 --- !query 143 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t --- !query 143 schema +-- !query schema struct --- !query 143 output +-- !query output 2017-12-12 00:00:00 --- !query 144 +-- !query SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00' as date) END FROM t --- !query 144 schema +-- !query schema struct --- !query 144 output +-- !query output 2017-12-12 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out index 6c6d3110d7d0d..bd157c474d249 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/concat.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 14 --- !query 0 +-- !query SELECT (col1 || col2 || col3) col FROM ( SELECT @@ -11,9 +11,9 @@ FROM ( encode(string(id + 2), 'utf-8') col3 FROM range(10) ) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 012 123 234 @@ -26,7 +26,7 @@ struct 91011 --- !query 1 +-- !query SELECT ((col1 || col2) || (col3 || col4) || col5) col FROM ( SELECT @@ -37,9 +37,9 @@ FROM ( CAST(id AS DOUBLE) col5 FROM range(10) ) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query 
output prefix_0120.0 prefix_1231.0 prefix_2342.0 @@ -52,7 +52,7 @@ prefix_89108.0 prefix_910119.0 --- !query 2 +-- !query SELECT ((col1 || col2) || (col3 || col4)) col FROM ( SELECT @@ -62,9 +62,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 0123 1234 2345 @@ -77,15 +77,15 @@ struct 9101112 --- !query 3 +-- !query set spark.sql.function.concatBinaryAsString=true --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output spark.sql.function.concatBinaryAsString true --- !query 4 +-- !query SELECT (col1 || col2) col FROM ( SELECT @@ -93,9 +93,9 @@ FROM ( encode(string(id + 1), 'utf-8') col2 FROM range(10) ) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 01 12 23 @@ -108,7 +108,7 @@ struct 910 --- !query 5 +-- !query SELECT (col1 || col2 || col3 || col4) col FROM ( SELECT @@ -118,9 +118,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0123 1234 2345 @@ -133,7 +133,7 @@ struct 9101112 --- !query 6 +-- !query SELECT ((col1 || col2) || (col3 || col4)) col FROM ( SELECT @@ -143,9 +143,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 0123 1234 2345 @@ -158,15 +158,15 @@ struct 9101112 --- !query 7 +-- !query set spark.sql.function.concatBinaryAsString=false --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output spark.sql.function.concatBinaryAsString false --- !query 8 +-- !query SELECT (col1 || col2) col FROM ( SELECT @@ -174,9 +174,9 @@ FROM ( encode(string(id + 1), 'utf-8') col2 FROM range(10) ) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 01 12 23 @@ -189,7 +189,7 @@ struct 910 --- !query 9 +-- !query SELECT (col1 || col2 || col3 || col4) col FROM ( 
SELECT @@ -199,9 +199,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 0123 1234 2345 @@ -214,7 +214,7 @@ struct 9101112 --- !query 10 +-- !query SELECT ((col1 || col2) || (col3 || col4)) col FROM ( SELECT @@ -224,9 +224,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 0123 1234 2345 @@ -239,7 +239,7 @@ struct 9101112 --- !query 11 +-- !query CREATE TEMPORARY VIEW various_arrays AS SELECT * FROM VALUES ( array(true, false), array(true), array(2Y, 1Y), array(3Y, 4Y), @@ -272,13 +272,13 @@ CREATE TEMPORARY VIEW various_arrays AS SELECT * FROM VALUES ( struct_array1, struct_array2, map_array1, map_array2 ) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query SELECT (boolean_array1 || boolean_array2) boolean_array, (tinyint_array1 || tinyint_array2) tinyint_array, @@ -295,13 +295,13 @@ SELECT (struct_array1 || struct_array2) struct_array, (map_array1 || map_array2) map_array FROM various_arrays --- !query 12 schema +-- !query schema struct,tinyint_array:array,smallint_array:array,int_array:array,bigint_array:array,decimal_array:array,double_array:array,float_array:array,data_array:array,timestamp_array:array,string_array:array,array_array:array>,struct_array:array>,map_array:array>> --- !query 12 output -[true,false,true] [2,1,3,4] [2,1,3,4] [2,1,3,4] [2,1,3,4] [9223372036854775809,9223372036854775808,9223372036854775808,9223372036854775809] [2.0,1.0,3.0,4.0] [2.0,1.0,3.0,4.0] [2016-03-14,2016-03-13,2016-03-12,2016-03-11] [2016-11-15 20:54:00.0,2016-11-12 20:54:00.0,2016-11-11 20:54:00.0] ["a","b","c","d"] [["a","b"],["c","d"],["e"],["f"]] [{"col1":"a","col2":1},{"col1":"b","col2":2},{"col1":"c","col2":3},{"col1":"d","col2":4}] [{"a":1},{"b":2},{"c":3},{"d":4}] +-- !query output +[true,false,true] [2,1,3,4] 
[2,1,3,4] [2,1,3,4] [2,1,3,4] [9223372036854775809,9223372036854775808,9223372036854775808,9223372036854775809] [2.0,1.0,3.0,4.0] [2.0,1.0,3.0,4.0] [2016-03-14,2016-03-13,2016-03-12,2016-03-11] [2016-11-15 20:54:00,2016-11-12 20:54:00,2016-11-11 20:54:00] ["a","b","c","d"] [["a","b"],["c","d"],["e"],["f"]] [{"col1":"a","col2":1},{"col1":"b","col2":2},{"col1":"c","col2":3},{"col1":"d","col2":4}] [{"a":1},{"b":2},{"c":3},{"d":4}] --- !query 13 +-- !query SELECT (tinyint_array1 || smallint_array2) ts_array, (smallint_array1 || int_array2) si_array, @@ -313,7 +313,7 @@ SELECT (timestamp_array1 || string_array2) tst_array, (string_array1 || int_array2) sti_array FROM various_arrays --- !query 13 schema +-- !query schema struct,si_array:array,ib_array:array,bd_array:array,dd_array:array,df_array:array,std_array:array,tst_array:array,sti_array:array> --- !query 13 output +-- !query output [2,1,3,4] [2,1,3,4] [2,1,3,4] [2,1,9223372036854775808,9223372036854775809] [9.223372036854776E18,9.223372036854776E18,3.0,4.0] [2.0,1.0,3.0,4.0] ["a","b","2016-03-12","2016-03-11"] ["2016-11-15 20:54:00","2016-11-12 20:54:00","c","d"] ["a","b","3","4"] diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out index a4cd408c04bf8..d5c27ade8e152 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out @@ -2,348 +2,348 @@ -- Number of queries: 40 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select cast(1 as tinyint) + interval 2 day --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS 
TINYINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) + interval 2 days)' (tinyint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS TINYINT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS TINYINT)' is of tinyint type.; line 1 pos 7 --- !query 2 +-- !query select cast(1 as smallint) + interval 2 day --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) + interval 2 days)' (smallint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS SMALLINT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS SMALLINT)' is of smallint type.; line 1 pos 7 --- !query 3 +-- !query select cast(1 as int) + interval 2 day --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS INT) + interval 2 days)' (int and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS INT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS INT)' is of int type.; line 1 pos 7 --- !query 4 +-- !query select cast(1 as bigint) + interval 2 day --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) + interval 2 days)' (bigint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS BIGINT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BIGINT)' is of bigint type.; line 1 pos 7 --- !query 5 +-- 
!query select cast(1 as float) + interval 2 day --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) + interval 2 days)' (float and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS FLOAT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS FLOAT)' is of float type.; line 1 pos 7 --- !query 6 +-- !query select cast(1 as double) + interval 2 day --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) + interval 2 days)' (double and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS DOUBLE) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DOUBLE)' is of double type.; line 1 pos 7 --- !query 7 +-- !query select cast(1 as decimal(10, 0)) + interval 2 day --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + interval 2 days)' (decimal(10,0) and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS DECIMAL(10,0)) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 8 +-- !query select cast('2017-12-11' as string) + interval 2 day --- !query 8 schema -struct --- !query 8 output +-- !query schema +struct +-- !query output 2017-12-13 00:00:00 --- !query 9 +-- !query select cast('2017-12-11 09:30:00' as string) + interval 2 day --- !query 9 schema -struct --- !query 9 output 
+-- !query schema +struct +-- !query output 2017-12-13 09:30:00 --- !query 10 +-- !query select cast('1' as binary) + interval 2 day --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) + interval 2 days)' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + interval 2 days)' (binary and interval).; line 1 pos 7 +cannot resolve 'CAST('1' AS BINARY) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST('1' AS BINARY)' is of binary type.; line 1 pos 7 --- !query 11 +-- !query select cast(1 as boolean) + interval 2 day --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BOOLEAN) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) + interval 2 days)' (boolean and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS BOOLEAN) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BOOLEAN)' is of boolean type.; line 1 pos 7 --- !query 12 +-- !query select cast('2017-12-11 09:30:00.0' as timestamp) + interval 2 day --- !query 12 schema -struct --- !query 12 output +-- !query schema +struct +-- !query output 2017-12-13 09:30:00 --- !query 13 +-- !query select cast('2017-12-11 09:30:00' as date) + interval 2 day --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output 2017-12-13 --- !query 14 +-- !query select interval 2 day + cast(1 as tinyint) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS TINYINT))' (interval and tinyint).; line 1 pos 7 +cannot resolve 'CAST(1 AS TINYINT) + 
INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS TINYINT)' is of tinyint type.; line 1 pos 7 --- !query 15 +-- !query select interval 2 day + cast(1 as smallint) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS SMALLINT))' (interval and smallint).; line 1 pos 7 +cannot resolve 'CAST(1 AS SMALLINT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS SMALLINT)' is of smallint type.; line 1 pos 7 --- !query 16 +-- !query select interval 2 day + cast(1 as int) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS INT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS INT))' (interval and int).; line 1 pos 7 +cannot resolve 'CAST(1 AS INT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS INT)' is of int type.; line 1 pos 7 --- !query 17 +-- !query select interval 2 day + cast(1 as bigint) --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS BIGINT))' (interval and bigint).; line 1 pos 7 +cannot resolve 'CAST(1 AS BIGINT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BIGINT)' is of bigint type.; line 1 pos 7 --- !query 18 +-- !query select interval 2 day + cast(1 as float) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.AnalysisException -cannot 
resolve '(interval 2 days + CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS FLOAT))' (interval and float).; line 1 pos 7 +cannot resolve 'CAST(1 AS FLOAT) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS FLOAT)' is of float type.; line 1 pos 7 --- !query 19 +-- !query select interval 2 day + cast(1 as double) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS DOUBLE))' (interval and double).; line 1 pos 7 +cannot resolve 'CAST(1 AS DOUBLE) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DOUBLE)' is of double type.; line 1 pos 7 --- !query 20 +-- !query select interval 2 day + cast(1 as decimal(10, 0)) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS DECIMAL(10,0)))' (interval and decimal(10,0)).; line 1 pos 7 +cannot resolve 'CAST(1 AS DECIMAL(10,0)) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 21 +-- !query select interval 2 day + cast('2017-12-11' as string) --- !query 21 schema -struct --- !query 21 output +-- !query schema +struct +-- !query output 2017-12-13 00:00:00 --- !query 22 +-- !query select interval 2 day + cast('2017-12-11 09:30:00' as string) --- !query 22 schema -struct --- !query 22 output +-- !query schema +struct +-- !query output 2017-12-13 09:30:00 --- !query 23 +-- !query select interval 2 day + cast('1' as binary) --- !query 23 schema +-- !query 
schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(interval 2 days + CAST('1' AS BINARY))' (interval and binary).; line 1 pos 7 +cannot resolve 'CAST('1' AS BINARY) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST('1' AS BINARY)' is of binary type.; line 1 pos 7 --- !query 24 +-- !query select interval 2 day + cast(1 as boolean) --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(interval 2 days + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS BOOLEAN))' (interval and boolean).; line 1 pos 7 +cannot resolve 'CAST(1 AS BOOLEAN) + INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BOOLEAN)' is of boolean type.; line 1 pos 7 --- !query 25 +-- !query select interval 2 day + cast('2017-12-11 09:30:00.0' as timestamp) --- !query 25 schema -struct --- !query 25 output +-- !query schema +struct +-- !query output 2017-12-13 09:30:00 --- !query 26 +-- !query select interval 2 day + cast('2017-12-11 09:30:00' as date) --- !query 26 schema -struct --- !query 26 output +-- !query schema +struct +-- !query output 2017-12-13 --- !query 27 +-- !query select cast(1 as tinyint) - interval 2 day --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) - interval 2 days)' (tinyint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS TINYINT) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS TINYINT)' is of tinyint type.; line 1 pos 7 --- !query 28 +-- 
!query select cast(1 as smallint) - interval 2 day --- !query 28 schema +-- !query schema struct<> --- !query 28 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) - interval 2 days)' (smallint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS SMALLINT) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS SMALLINT)' is of smallint type.; line 1 pos 7 --- !query 29 +-- !query select cast(1 as int) - interval 2 day --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS INT) - interval 2 days)' (int and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS INT) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS INT)' is of int type.; line 1 pos 7 --- !query 30 +-- !query select cast(1 as bigint) - interval 2 day --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) - interval 2 days)' (bigint and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS BIGINT) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BIGINT)' is of bigint type.; line 1 pos 7 --- !query 31 +-- !query select cast(1 as float) - interval 2 day --- !query 31 schema +-- !query schema struct<> --- !query 31 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) - interval 2 days)' (float and interval).; line 1 pos 
7 +cannot resolve 'CAST(1 AS FLOAT) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS FLOAT)' is of float type.; line 1 pos 7 --- !query 32 +-- !query select cast(1 as double) - interval 2 day --- !query 32 schema +-- !query schema struct<> --- !query 32 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) - interval 2 days)' (double and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS DOUBLE) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DOUBLE)' is of double type.; line 1 pos 7 --- !query 33 +-- !query select cast(1 as decimal(10, 0)) - interval 2 day --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - interval 2 days)' (decimal(10,0) and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS DECIMAL(10,0)) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 34 +-- !query select cast('2017-12-11' as string) - interval 2 day --- !query 34 schema -struct --- !query 34 output +-- !query schema +struct +-- !query output 2017-12-09 00:00:00 --- !query 35 +-- !query select cast('2017-12-11 09:30:00' as string) - interval 2 day --- !query 35 schema -struct --- !query 35 output +-- !query schema +struct +-- !query output 2017-12-09 09:30:00 --- !query 36 +-- !query select cast('1' as binary) - interval 2 day --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) - interval 2 days)' due to data type 
mismatch: differing types in '(CAST('1' AS BINARY) - interval 2 days)' (binary and interval).; line 1 pos 7 +cannot resolve 'CAST('1' AS BINARY) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST('1' AS BINARY)' is of binary type.; line 1 pos 7 --- !query 37 +-- !query select cast(1 as boolean) - interval 2 day --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BOOLEAN) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) - interval 2 days)' (boolean and interval).; line 1 pos 7 +cannot resolve 'CAST(1 AS BOOLEAN) - INTERVAL '2 days'' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS BOOLEAN)' is of boolean type.; line 1 pos 7 --- !query 38 +-- !query select cast('2017-12-11 09:30:00.0' as timestamp) - interval 2 day --- !query 38 schema -struct --- !query 38 output +-- !query schema +struct +-- !query output 2017-12-09 09:30:00 --- !query 39 +-- !query select cast('2017-12-11 09:30:00' as date) - interval 2 day --- !query 39 schema -struct --- !query 39 output +-- !query schema +struct +-- !query output 2017-12-09 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out index 6ee7f59d69877..33bd3850732f0 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out @@ -2,9513 +2,9513 @@ -- Number of queries: 1145 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT cast(1 as tinyint) + cast(1 as decimal(3, 0)) FROM t --- !query 1 schema +-- !query schema 
struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) + CAST(1 AS DECIMAL(3,0))):decimal(4,0)> --- !query 1 output +-- !query output 2 --- !query 2 +-- !query SELECT cast(1 as tinyint) + cast(1 as decimal(5, 0)) FROM t --- !query 2 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 2 output +-- !query output 2 --- !query 3 +-- !query SELECT cast(1 as tinyint) + cast(1 as decimal(10, 0)) FROM t --- !query 3 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 3 output +-- !query output 2 --- !query 4 +-- !query SELECT cast(1 as tinyint) + cast(1 as decimal(20, 0)) FROM t --- !query 4 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 4 output +-- !query output 2 --- !query 5 +-- !query SELECT cast(1 as smallint) + cast(1 as decimal(3, 0)) FROM t --- !query 5 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 5 output +-- !query output 2 --- !query 6 +-- !query SELECT cast(1 as smallint) + cast(1 as decimal(5, 0)) FROM t --- !query 6 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) + CAST(1 AS DECIMAL(5,0))):decimal(6,0)> --- !query 6 output +-- !query output 2 --- !query 7 +-- !query SELECT cast(1 as smallint) + cast(1 as decimal(10, 0)) FROM t --- !query 7 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 7 output +-- !query output 2 --- !query 8 +-- !query SELECT cast(1 as smallint) + cast(1 as decimal(20, 0)) FROM t --- !query 8 schema +-- !query 
schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 8 output +-- !query output 2 --- !query 9 +-- !query SELECT cast(1 as int) + cast(1 as decimal(3, 0)) FROM t --- !query 9 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 9 output +-- !query output 2 --- !query 10 +-- !query SELECT cast(1 as int) + cast(1 as decimal(5, 0)) FROM t --- !query 10 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 10 output +-- !query output 2 --- !query 11 +-- !query SELECT cast(1 as int) + cast(1 as decimal(10, 0)) FROM t --- !query 11 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 11 output +-- !query output 2 --- !query 12 +-- !query SELECT cast(1 as int) + cast(1 as decimal(20, 0)) FROM t --- !query 12 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 12 output +-- !query output 2 --- !query 13 +-- !query SELECT cast(1 as bigint) + cast(1 as decimal(3, 0)) FROM t --- !query 13 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 13 output +-- !query output 2 --- !query 14 +-- !query SELECT cast(1 as bigint) + cast(1 as decimal(5, 0)) FROM t --- !query 14 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 14 output +-- !query output 2 --- !query 15 +-- !query SELECT cast(1 as bigint) + cast(1 as decimal(10, 
0)) FROM t --- !query 15 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 15 output +-- !query output 2 --- !query 16 +-- !query SELECT cast(1 as bigint) + cast(1 as decimal(20, 0)) FROM t --- !query 16 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) + CAST(1 AS DECIMAL(20,0))):decimal(21,0)> --- !query 16 output +-- !query output 2 --- !query 17 +-- !query SELECT cast(1 as float) + cast(1 as decimal(3, 0)) FROM t --- !query 17 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 17 output +-- !query output 2.0 --- !query 18 +-- !query SELECT cast(1 as float) + cast(1 as decimal(5, 0)) FROM t --- !query 18 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 18 output +-- !query output 2.0 --- !query 19 +-- !query SELECT cast(1 as float) + cast(1 as decimal(10, 0)) FROM t --- !query 19 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 19 output +-- !query output 2.0 --- !query 20 +-- !query SELECT cast(1 as float) + cast(1 as decimal(20, 0)) FROM t --- !query 20 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 20 output +-- !query output 2.0 --- !query 21 +-- !query SELECT cast(1 as double) + cast(1 as decimal(3, 0)) FROM t --- !query 21 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 21 output +-- !query output 2.0 --- !query 22 +-- !query SELECT cast(1 as double) + cast(1 as decimal(5, 0)) FROM t --- !query 22 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 22 output +-- !query output 
2.0 --- !query 23 +-- !query SELECT cast(1 as double) + cast(1 as decimal(10, 0)) FROM t --- !query 23 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 23 output +-- !query output 2.0 --- !query 24 +-- !query SELECT cast(1 as double) + cast(1 as decimal(20, 0)) FROM t --- !query 24 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 24 output +-- !query output 2.0 --- !query 25 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(3, 0)) FROM t --- !query 25 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 25 output +-- !query output 2 --- !query 26 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(5, 0)) FROM t --- !query 26 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 26 output +-- !query output 2 --- !query 27 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t --- !query 27 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 27 output +-- !query output 2 --- !query 28 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(20, 0)) FROM t --- !query 28 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 28 output +-- !query output 2 --- !query 29 +-- !query SELECT cast('1' as binary) + cast(1 as decimal(3, 0)) FROM t --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(3,0)))' 
(binary and decimal(3,0)).; line 1 pos 7 --- !query 30 +-- !query SELECT cast('1' as binary) + cast(1 as decimal(5, 0)) FROM t --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 31 +-- !query SELECT cast('1' as binary) + cast(1 as decimal(10, 0)) FROM t --- !query 31 schema +-- !query schema struct<> --- !query 31 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 32 +-- !query SELECT cast('1' as binary) + cast(1 as decimal(20, 0)) FROM t --- !query 32 schema +-- !query schema struct<> --- !query 32 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 33 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(3, 0)) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 34 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(5, 0)) FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 35 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(10, 0)) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 36 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(20, 0)) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 37 +-- !query SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(3, 0)) FROM t --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 38 +-- 
!query SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(5, 0)) FROM t --- !query 38 schema +-- !query schema struct<> --- !query 38 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 39 +-- !query SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(10, 0)) FROM t --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 40 +-- !query SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(20, 0)) FROM t --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: argument 2 
requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 41 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as tinyint) FROM t --- !query 41 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) + CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(4,0)> --- !query 41 output +-- !query output 2 --- !query 42 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as tinyint) FROM t --- !query 42 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 42 output +-- !query output 2 --- !query 43 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as tinyint) FROM t --- !query 43 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 43 output +-- !query output 2 --- !query 44 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as tinyint) FROM t --- !query 44 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 44 output +-- !query output 2 --- !query 45 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as smallint) FROM t --- !query 45 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 45 output +-- !query output 2 --- !query 46 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as smallint) FROM t --- !query 46 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) + CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(6,0)> --- !query 46 output +-- !query output 2 --- !query 47 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as smallint) FROM t --- !query 47 schema +-- !query schema struct<(CAST(CAST(1 AS 
DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 47 output +-- !query output 2 --- !query 48 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as smallint) FROM t --- !query 48 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 48 output +-- !query output 2 --- !query 49 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as int) FROM t --- !query 49 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 49 output +-- !query output 2 --- !query 50 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as int) FROM t --- !query 50 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 50 output +-- !query output 2 --- !query 51 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as int) FROM t --- !query 51 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) + CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(11,0)> --- !query 51 output +-- !query output 2 --- !query 52 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as int) FROM t --- !query 52 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 52 output +-- !query output 2 --- !query 53 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as bigint) FROM t --- !query 53 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 53 output +-- !query output 2 --- !query 54 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as bigint) FROM t --- !query 54 
schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 54 output +-- !query output 2 --- !query 55 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as bigint) FROM t --- !query 55 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 55 output +-- !query output 2 --- !query 56 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as bigint) FROM t --- !query 56 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) + CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(21,0)> --- !query 56 output +-- !query output 2 --- !query 57 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as float) FROM t --- !query 57 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 57 output +-- !query output 2.0 --- !query 58 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as float) FROM t --- !query 58 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 58 output +-- !query output 2.0 --- !query 59 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as float) FROM t --- !query 59 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 59 output +-- !query output 2.0 --- !query 60 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as float) FROM t --- !query 60 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 60 output +-- !query output 2.0 --- !query 61 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as double) FROM t --- !query 61 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- 
!query 61 output +-- !query output 2.0 --- !query 62 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as double) FROM t --- !query 62 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 62 output +-- !query output 2.0 --- !query 63 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as double) FROM t --- !query 63 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 63 output +-- !query output 2.0 --- !query 64 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as double) FROM t --- !query 64 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 64 output +-- !query output 2.0 --- !query 65 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as decimal(10, 0)) FROM t --- !query 65 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 65 output +-- !query output 2 --- !query 66 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as decimal(10, 0)) FROM t --- !query 66 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 66 output +-- !query output 2 --- !query 67 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t --- !query 67 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 67 output +-- !query output 2 --- !query 68 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as decimal(10, 0)) FROM t --- !query 68 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 68 output +-- !query output 2 --- !query 69 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as string) FROM t --- !query 69 
schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 69 output +-- !query output 2.0 --- !query 70 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as string) FROM t --- !query 70 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 70 output +-- !query output 2.0 --- !query 71 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as string) FROM t --- !query 71 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 71 output +-- !query output 2.0 --- !query 72 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as string) FROM t --- !query 72 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 72 output +-- !query output 2.0 --- !query 73 +-- !query SELECT cast(1 as decimal(3, 0)) + cast('1' as binary) FROM t --- !query 73 schema +-- !query schema struct<> --- !query 73 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 74 +-- !query SELECT cast(1 as decimal(5, 0)) + cast('1' as binary) FROM t --- !query 74 schema +-- !query schema struct<> --- !query 74 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 75 +-- !query SELECT cast(1 as decimal(10, 0)) + cast('1' as binary) FROM t --- !query 75 schema +-- !query schema struct<> --- !query 75 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'(CAST(1 AS DECIMAL(10,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 76 +-- !query SELECT cast(1 as decimal(20, 0)) + cast('1' as binary) FROM t --- !query 76 schema +-- !query schema struct<> --- !query 76 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 77 +-- !query SELECT cast(1 as decimal(3, 0)) + cast(1 as boolean) FROM t --- !query 77 schema +-- !query schema struct<> --- !query 77 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 78 +-- !query SELECT cast(1 as decimal(5, 0)) + cast(1 as boolean) FROM t --- !query 78 schema +-- !query schema struct<> --- !query 78 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 79 +-- !query SELECT cast(1 as decimal(10, 0)) + cast(1 as boolean) FROM t --- !query 79 schema +-- !query schema struct<> --- !query 79 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 80 +-- !query SELECT cast(1 as decimal(20, 0)) + cast(1 as boolean) FROM t --- !query 80 schema +-- !query schema struct<> --- !query 80 
output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 81 +-- !query SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 82 +-- !query SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 83 +-- !query SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 84 +-- !query SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS 
DECIMAL(20,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 85 +-- !query SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00' as date) FROM t --- !query 85 schema +-- !query schema struct<> --- !query 85 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 86 +-- !query SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00' as date) FROM t --- !query 86 schema +-- !query schema struct<> --- !query 86 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 87 +-- !query SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00' as date) FROM t --- !query 87 schema +-- !query schema struct<> --- !query 87 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 
'(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 88 +-- !query SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00' as date) FROM t --- !query 88 schema +-- !query schema struct<> --- !query 88 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 89 +-- !query SELECT cast(1 as tinyint) - cast(1 as decimal(3, 0)) FROM t --- !query 89 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) - CAST(1 AS DECIMAL(3,0))):decimal(4,0)> --- !query 89 output +-- !query output 0 --- !query 90 +-- !query SELECT cast(1 as tinyint) - cast(1 as decimal(5, 0)) FROM t --- !query 90 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 90 output +-- !query output 0 --- !query 91 +-- !query SELECT cast(1 as tinyint) - cast(1 as decimal(10, 0)) FROM t --- !query 91 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 91 output +-- !query output 0 --- !query 92 +-- !query SELECT cast(1 as tinyint) - cast(1 as 
decimal(20, 0)) FROM t --- !query 92 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 92 output +-- !query output 0 --- !query 93 +-- !query SELECT cast(1 as smallint) - cast(1 as decimal(3, 0)) FROM t --- !query 93 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 93 output +-- !query output 0 --- !query 94 +-- !query SELECT cast(1 as smallint) - cast(1 as decimal(5, 0)) FROM t --- !query 94 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) - CAST(1 AS DECIMAL(5,0))):decimal(6,0)> --- !query 94 output +-- !query output 0 --- !query 95 +-- !query SELECT cast(1 as smallint) - cast(1 as decimal(10, 0)) FROM t --- !query 95 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 95 output +-- !query output 0 --- !query 96 +-- !query SELECT cast(1 as smallint) - cast(1 as decimal(20, 0)) FROM t --- !query 96 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 96 output +-- !query output 0 --- !query 97 +-- !query SELECT cast(1 as int) - cast(1 as decimal(3, 0)) FROM t --- !query 97 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 97 output +-- !query output 0 --- !query 98 +-- !query SELECT cast(1 as int) - cast(1 as decimal(5, 0)) FROM t --- !query 98 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 98 output +-- !query output 
0 --- !query 99 +-- !query SELECT cast(1 as int) - cast(1 as decimal(10, 0)) FROM t --- !query 99 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) - CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 99 output +-- !query output 0 --- !query 100 +-- !query SELECT cast(1 as int) - cast(1 as decimal(20, 0)) FROM t --- !query 100 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 100 output +-- !query output 0 --- !query 101 +-- !query SELECT cast(1 as bigint) - cast(1 as decimal(3, 0)) FROM t --- !query 101 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 101 output +-- !query output 0 --- !query 102 +-- !query SELECT cast(1 as bigint) - cast(1 as decimal(5, 0)) FROM t --- !query 102 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 102 output +-- !query output 0 --- !query 103 +-- !query SELECT cast(1 as bigint) - cast(1 as decimal(10, 0)) FROM t --- !query 103 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 103 output +-- !query output 0 --- !query 104 +-- !query SELECT cast(1 as bigint) - cast(1 as decimal(20, 0)) FROM t --- !query 104 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) - CAST(1 AS DECIMAL(20,0))):decimal(21,0)> --- !query 104 output +-- !query output 0 --- !query 105 +-- !query SELECT cast(1 as float) - cast(1 as decimal(3, 0)) FROM t --- !query 105 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 105 output +-- !query output 0.0 --- !query 
106 +-- !query SELECT cast(1 as float) - cast(1 as decimal(5, 0)) FROM t --- !query 106 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 106 output +-- !query output 0.0 --- !query 107 +-- !query SELECT cast(1 as float) - cast(1 as decimal(10, 0)) FROM t --- !query 107 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 107 output +-- !query output 0.0 --- !query 108 +-- !query SELECT cast(1 as float) - cast(1 as decimal(20, 0)) FROM t --- !query 108 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 108 output +-- !query output 0.0 --- !query 109 +-- !query SELECT cast(1 as double) - cast(1 as decimal(3, 0)) FROM t --- !query 109 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 109 output +-- !query output 0.0 --- !query 110 +-- !query SELECT cast(1 as double) - cast(1 as decimal(5, 0)) FROM t --- !query 110 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 110 output +-- !query output 0.0 --- !query 111 +-- !query SELECT cast(1 as double) - cast(1 as decimal(10, 0)) FROM t --- !query 111 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 111 output +-- !query output 0.0 --- !query 112 +-- !query SELECT cast(1 as double) - cast(1 as decimal(20, 0)) FROM t --- !query 112 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 112 output +-- !query output 0.0 --- !query 113 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(3, 0)) FROM t --- !query 113 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS 
DECIMAL(11,0))):decimal(11,0)> --- !query 113 output +-- !query output 0 --- !query 114 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(5, 0)) FROM t --- !query 114 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 114 output +-- !query output 0 --- !query 115 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t --- !query 115 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 115 output +-- !query output 0 --- !query 116 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(20, 0)) FROM t --- !query 116 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 116 output +-- !query output 0 --- !query 117 +-- !query SELECT cast('1' as binary) - cast(1 as decimal(3, 0)) FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 118 +-- !query SELECT cast('1' as binary) - cast(1 as decimal(5, 0)) FROM t --- !query 118 schema +-- !query schema struct<> --- !query 118 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 119 +-- !query SELECT cast('1' as binary) - cast(1 as decimal(10, 0)) FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS 
BINARY) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 120 +-- !query SELECT cast('1' as binary) - cast(1 as decimal(20, 0)) FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 121 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(3, 0)) FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: argument 2 requires timestamp type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 122 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(5, 0)) FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: argument 2 requires timestamp type, however, 
'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 123 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(10, 0)) FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: argument 2 requires timestamp type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 124 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(20, 0)) FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: argument 2 requires timestamp type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 125 +-- !query SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(3, 0)) FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; 
line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 126 +-- !query SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(5, 0)) FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 127 +-- !query SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(10, 0)) FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 128 +-- !query SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(20, 0)) FROM t --- !query 128 schema +-- !query schema struct<> --- !query 128 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(20,0)))' 
due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 129 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as tinyint) FROM t --- !query 129 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) - CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(4,0)> --- !query 129 output +-- !query output 0 --- !query 130 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as tinyint) FROM t --- !query 130 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 130 output +-- !query output 0 --- !query 131 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as tinyint) FROM t --- !query 131 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 131 output +-- !query output 0 --- !query 132 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as tinyint) FROM t --- !query 132 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 132 output +-- !query output 0 --- !query 133 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as smallint) FROM t --- !query 133 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> --- !query 133 output +-- !query output 0 --- !query 134 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as smallint) FROM t --- !query 134 schema +-- !query 
schema struct<(CAST(1 AS DECIMAL(5,0)) - CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(6,0)> --- !query 134 output +-- !query output 0 --- !query 135 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as smallint) FROM t --- !query 135 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 135 output +-- !query output 0 --- !query 136 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as smallint) FROM t --- !query 136 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 136 output +-- !query output 0 --- !query 137 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as int) FROM t --- !query 137 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 137 output +-- !query output 0 --- !query 138 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as int) FROM t --- !query 138 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 138 output +-- !query output 0 --- !query 139 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as int) FROM t --- !query 139 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) - CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(11,0)> --- !query 139 output +-- !query output 0 --- !query 140 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as int) FROM t --- !query 140 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 140 output +-- !query output 0 --- !query 141 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as bigint) FROM t --- 
!query 141 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 141 output +-- !query output 0 --- !query 142 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as bigint) FROM t --- !query 142 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 142 output +-- !query output 0 --- !query 143 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as bigint) FROM t --- !query 143 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 143 output +-- !query output 0 --- !query 144 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as bigint) FROM t --- !query 144 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) - CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(21,0)> --- !query 144 output +-- !query output 0 --- !query 145 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as float) FROM t --- !query 145 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 145 output +-- !query output 0.0 --- !query 146 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as float) FROM t --- !query 146 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 146 output +-- !query output 0.0 --- !query 147 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as float) FROM t --- !query 147 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 147 output +-- !query output 0.0 --- !query 148 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as float) FROM t --- !query 148 schema +-- !query schema 
struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 148 output +-- !query output 0.0 --- !query 149 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as double) FROM t --- !query 149 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 149 output +-- !query output 0.0 --- !query 150 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as double) FROM t --- !query 150 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 150 output +-- !query output 0.0 --- !query 151 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as double) FROM t --- !query 151 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 151 output +-- !query output 0.0 --- !query 152 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as double) FROM t --- !query 152 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 152 output +-- !query output 0.0 --- !query 153 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as decimal(10, 0)) FROM t --- !query 153 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 153 output +-- !query output 0 --- !query 154 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as decimal(10, 0)) FROM t --- !query 154 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> --- !query 154 output +-- !query output 0 --- !query 155 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t --- !query 155 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS DECIMAL(10,0))):decimal(11,0)> --- !query 155 output +-- !query output 0 --- !query 156 +-- !query 
SELECT cast(1 as decimal(20, 0)) - cast(1 as decimal(10, 0)) FROM t --- !query 156 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> --- !query 156 output +-- !query output 0 --- !query 157 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as string) FROM t --- !query 157 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 157 output +-- !query output 0.0 --- !query 158 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as string) FROM t --- !query 158 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 158 output +-- !query output 0.0 --- !query 159 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as string) FROM t --- !query 159 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 159 output +-- !query output 0.0 --- !query 160 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as string) FROM t --- !query 160 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 160 output +-- !query output 0.0 --- !query 161 +-- !query SELECT cast(1 as decimal(3, 0)) - cast('1' as binary) FROM t --- !query 161 schema +-- !query schema struct<> --- !query 161 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 162 +-- !query SELECT cast(1 as decimal(5, 0)) - cast('1' as binary) FROM t --- !query 162 schema +-- !query schema struct<> --- !query 162 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) - 
CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 163 +-- !query SELECT cast(1 as decimal(10, 0)) - cast('1' as binary) FROM t --- !query 163 schema +-- !query schema struct<> --- !query 163 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 164 +-- !query SELECT cast(1 as decimal(20, 0)) - cast('1' as binary) FROM t --- !query 164 schema +-- !query schema struct<> --- !query 164 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 165 +-- !query SELECT cast(1 as decimal(3, 0)) - cast(1 as boolean) FROM t --- !query 165 schema +-- !query schema struct<> --- !query 165 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 166 +-- !query SELECT cast(1 as decimal(5, 0)) - cast(1 as boolean) FROM t --- !query 166 schema +-- !query schema struct<> --- !query 166 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 167 +-- !query SELECT cast(1 as decimal(10, 0)) - cast(1 as boolean) FROM t --- !query 167 schema +-- !query schema struct<> --- !query 167 output +-- !query 
output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 168 +-- !query SELECT cast(1 as decimal(20, 0)) - cast(1 as boolean) FROM t --- !query 168 schema +-- !query schema struct<> --- !query 168 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 169 +-- !query SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 169 schema +-- !query schema struct<> --- !query 169 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 170 +-- !query SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 170 schema +-- !query schema struct<> --- !query 170 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' 
due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 171 +-- !query SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 171 schema +-- !query schema struct<> --- !query 171 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 172 +-- !query SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 172 schema +-- !query schema struct<> --- !query 172 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: argument 1 requires timestamp type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 173 +-- !query SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00' as date) FROM t --- !query 173 schema +-- !query schema struct<> --- !query 173 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS 
DECIMAL(3,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 +cannot resolve 'subtractdates(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: argument 1 requires date type, however, 'CAST(1 AS DECIMAL(3,0))' is of decimal(3,0) type.; line 1 pos 7 --- !query 174 +-- !query SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00' as date) FROM t --- !query 174 schema +-- !query schema struct<> --- !query 174 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 +cannot resolve 'subtractdates(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: argument 1 requires date type, however, 'CAST(1 AS DECIMAL(5,0))' is of decimal(5,0) type.; line 1 pos 7 --- !query 175 +-- !query SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00' as date) FROM t --- !query 175 schema +-- !query schema struct<> --- !query 175 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 +cannot resolve 'subtractdates(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: argument 1 requires date type, however, 'CAST(1 AS DECIMAL(10,0))' is of decimal(10,0) type.; line 1 pos 7 --- !query 176 +-- !query SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00' as date) FROM t --- !query 176 schema +-- !query schema struct<> --- !query 176 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 
09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 +cannot resolve 'subtractdates(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: argument 1 requires date type, however, 'CAST(1 AS DECIMAL(20,0))' is of decimal(20,0) type.; line 1 pos 7 --- !query 177 +-- !query SELECT cast(1 as tinyint) * cast(1 as decimal(3, 0)) FROM t --- !query 177 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) * CAST(1 AS DECIMAL(3,0))):decimal(7,0)> --- !query 177 output +-- !query output 1 --- !query 178 +-- !query SELECT cast(1 as tinyint) * cast(1 as decimal(5, 0)) FROM t --- !query 178 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,0)> --- !query 178 output +-- !query output 1 --- !query 179 +-- !query SELECT cast(1 as tinyint) * cast(1 as decimal(10, 0)) FROM t --- !query 179 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 179 output +-- !query output 1 --- !query 180 +-- !query SELECT cast(1 as tinyint) * cast(1 as decimal(20, 0)) FROM t --- !query 180 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,0)> --- !query 180 output +-- !query output 1 --- !query 181 +-- !query SELECT cast(1 as smallint) * cast(1 as decimal(3, 0)) FROM t --- !query 181 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(9,0)> --- !query 181 output +-- !query output 1 --- !query 182 +-- !query SELECT cast(1 as smallint) * cast(1 as decimal(5, 0)) FROM t --- !query 182 schema +-- 
!query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) * CAST(1 AS DECIMAL(5,0))):decimal(11,0)> --- !query 182 output +-- !query output 1 --- !query 183 +-- !query SELECT cast(1 as smallint) * cast(1 as decimal(10, 0)) FROM t --- !query 183 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 183 output +-- !query output 1 --- !query 184 +-- !query SELECT cast(1 as smallint) * cast(1 as decimal(20, 0)) FROM t --- !query 184 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,0)> --- !query 184 output +-- !query output 1 --- !query 185 +-- !query SELECT cast(1 as int) * cast(1 as decimal(3, 0)) FROM t --- !query 185 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 185 output +-- !query output 1 --- !query 186 +-- !query SELECT cast(1 as int) * cast(1 as decimal(5, 0)) FROM t --- !query 186 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 186 output +-- !query output 1 --- !query 187 +-- !query SELECT cast(1 as int) * cast(1 as decimal(10, 0)) FROM t --- !query 187 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> --- !query 187 output +-- !query output 1 --- !query 188 +-- !query SELECT cast(1 as int) * cast(1 as decimal(20, 0)) FROM t --- !query 188 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 188 output +-- !query output 1 --- !query 189 +-- !query SELECT cast(1 as bigint) * cast(1 as decimal(3, 0)) FROM 
t --- !query 189 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(24,0)> --- !query 189 output +-- !query output 1 --- !query 190 +-- !query SELECT cast(1 as bigint) * cast(1 as decimal(5, 0)) FROM t --- !query 190 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,0)> --- !query 190 output +-- !query output 1 --- !query 191 +-- !query SELECT cast(1 as bigint) * cast(1 as decimal(10, 0)) FROM t --- !query 191 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 191 output +-- !query output 1 --- !query 192 +-- !query SELECT cast(1 as bigint) * cast(1 as decimal(20, 0)) FROM t --- !query 192 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) * CAST(1 AS DECIMAL(20,0))):decimal(38,0)> --- !query 192 output +-- !query output 1 --- !query 193 +-- !query SELECT cast(1 as float) * cast(1 as decimal(3, 0)) FROM t --- !query 193 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 193 output +-- !query output 1.0 --- !query 194 +-- !query SELECT cast(1 as float) * cast(1 as decimal(5, 0)) FROM t --- !query 194 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 194 output +-- !query output 1.0 --- !query 195 +-- !query SELECT cast(1 as float) * cast(1 as decimal(10, 0)) FROM t --- !query 195 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 195 output +-- !query output 1.0 --- !query 196 +-- !query SELECT cast(1 as float) * cast(1 as decimal(20, 0)) FROM t --- !query 196 schema +-- !query schema 
struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 196 output +-- !query output 1.0 --- !query 197 +-- !query SELECT cast(1 as double) * cast(1 as decimal(3, 0)) FROM t --- !query 197 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 197 output +-- !query output 1.0 --- !query 198 +-- !query SELECT cast(1 as double) * cast(1 as decimal(5, 0)) FROM t --- !query 198 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 198 output +-- !query output 1.0 --- !query 199 +-- !query SELECT cast(1 as double) * cast(1 as decimal(10, 0)) FROM t --- !query 199 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 199 output +-- !query output 1.0 --- !query 200 +-- !query SELECT cast(1 as double) * cast(1 as decimal(20, 0)) FROM t --- !query 200 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 200 output +-- !query output 1.0 --- !query 201 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(3, 0)) FROM t --- !query 201 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 201 output +-- !query output 1 --- !query 202 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(5, 0)) FROM t --- !query 202 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 202 output +-- !query output 1 --- !query 203 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t --- !query 203 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> --- !query 203 output +-- !query output 1 --- !query 204 +-- !query 
SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(20, 0)) FROM t --- !query 204 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 204 output +-- !query output 1 --- !query 205 +-- !query SELECT cast('1' as binary) * cast(1 as decimal(3, 0)) FROM t --- !query 205 schema +-- !query schema struct<> --- !query 205 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 206 +-- !query SELECT cast('1' as binary) * cast(1 as decimal(5, 0)) FROM t --- !query 206 schema +-- !query schema struct<> --- !query 206 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 207 +-- !query SELECT cast('1' as binary) * cast(1 as decimal(10, 0)) FROM t --- !query 207 schema +-- !query schema struct<> --- !query 207 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 208 +-- !query SELECT cast('1' as binary) * cast(1 as decimal(20, 0)) FROM t --- !query 208 schema +-- !query schema struct<> --- !query 208 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 209 +-- !query SELECT cast('2017*12*11 
09:30:00.0' as timestamp) * cast(1 as decimal(3, 0)) FROM t --- !query 209 schema +-- !query schema struct<> --- !query 209 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 210 +-- !query SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(5, 0)) FROM t --- !query 210 schema +-- !query schema struct<> --- !query 210 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 211 +-- !query SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(10, 0)) FROM t --- !query 211 schema +-- !query schema struct<> --- !query 211 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 212 +-- !query SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(20, 0)) FROM t --- !query 212 schema +-- !query schema struct<> --- !query 212 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 213 +-- !query SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as 
decimal(3, 0)) FROM t --- !query 213 schema +-- !query schema struct<> --- !query 213 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 214 +-- !query SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(5, 0)) FROM t --- !query 214 schema +-- !query schema struct<> --- !query 214 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 215 +-- !query SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(10, 0)) FROM t --- !query 215 schema +-- !query schema struct<> --- !query 215 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 216 +-- !query SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(20, 0)) FROM t --- !query 216 schema +-- !query schema struct<> --- !query 216 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 217 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as tinyint) FROM t --- !query 217 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) * CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(7,0)> --- 
!query 217 output +-- !query output 1 --- !query 218 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as tinyint) FROM t --- !query 218 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) * CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(9,0)> --- !query 218 output +-- !query output 1 --- !query 219 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as tinyint) FROM t --- !query 219 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 219 output +-- !query output 1 --- !query 220 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as tinyint) FROM t --- !query 220 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(24,0)> --- !query 220 output +-- !query output 1 --- !query 221 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as smallint) FROM t --- !query 221 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,0)> --- !query 221 output +-- !query output 1 --- !query 222 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as smallint) FROM t --- !query 222 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) * CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(11,0)> --- !query 222 output +-- !query output 1 --- !query 223 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as smallint) FROM t --- !query 223 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 223 output +-- !query output 1 --- !query 224 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as smallint) FROM t --- !query 224 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS 
DECIMAL(20,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,0)> --- !query 224 output +-- !query output 1 --- !query 225 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as int) FROM t --- !query 225 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 225 output +-- !query output 1 --- !query 226 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as int) FROM t --- !query 226 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 226 output +-- !query output 1 --- !query 227 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as int) FROM t --- !query 227 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) * CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,0)> --- !query 227 output +-- !query output 1 --- !query 228 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as int) FROM t --- !query 228 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 228 output +-- !query output 1 --- !query 229 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as bigint) FROM t --- !query 229 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,0)> --- !query 229 output +-- !query output 1 --- !query 230 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as bigint) FROM t --- !query 230 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,0)> --- !query 230 output +-- !query output 1 --- !query 231 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as bigint) FROM t --- !query 231 
schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 231 output +-- !query output 1 --- !query 232 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as bigint) FROM t --- !query 232 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) * CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,0)> --- !query 232 output +-- !query output 1 --- !query 233 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as float) FROM t --- !query 233 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 233 output +-- !query output 1.0 --- !query 234 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as float) FROM t --- !query 234 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 234 output +-- !query output 1.0 --- !query 235 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as float) FROM t --- !query 235 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 235 output +-- !query output 1.0 --- !query 236 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as float) FROM t --- !query 236 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 236 output +-- !query output 1.0 --- !query 237 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as double) FROM t --- !query 237 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 237 output +-- !query output 1.0 --- !query 238 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as double) FROM t --- !query 238 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 238 output +-- !query output 
1.0 --- !query 239 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as double) FROM t --- !query 239 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 239 output +-- !query output 1.0 --- !query 240 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as double) FROM t --- !query 240 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 240 output +-- !query output 1.0 --- !query 241 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as decimal(10, 0)) FROM t --- !query 241 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> --- !query 241 output +-- !query output 1 --- !query 242 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as decimal(10, 0)) FROM t --- !query 242 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> --- !query 242 output +-- !query output 1 --- !query 243 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t --- !query 243 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> --- !query 243 output +-- !query output 1 --- !query 244 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as decimal(10, 0)) FROM t --- !query 244 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> --- !query 244 output +-- !query output 1 --- !query 245 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as string) FROM t --- !query 245 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 245 output +-- !query output 1.0 --- !query 246 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as string) FROM t --- !query 
246 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 246 output +-- !query output 1.0 --- !query 247 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as string) FROM t --- !query 247 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 247 output +-- !query output 1.0 --- !query 248 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as string) FROM t --- !query 248 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 248 output +-- !query output 1.0 --- !query 249 +-- !query SELECT cast(1 as decimal(3, 0)) * cast('1' as binary) FROM t --- !query 249 schema +-- !query schema struct<> --- !query 249 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 250 +-- !query SELECT cast(1 as decimal(5, 0)) * cast('1' as binary) FROM t --- !query 250 schema +-- !query schema struct<> --- !query 250 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 251 +-- !query SELECT cast(1 as decimal(10, 0)) * cast('1' as binary) FROM t --- !query 251 schema +-- !query schema struct<> --- !query 251 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 252 +-- !query SELECT cast(1 as 
decimal(20, 0)) * cast('1' as binary) FROM t --- !query 252 schema +-- !query schema struct<> --- !query 252 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 253 +-- !query SELECT cast(1 as decimal(3, 0)) * cast(1 as boolean) FROM t --- !query 253 schema +-- !query schema struct<> --- !query 253 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 254 +-- !query SELECT cast(1 as decimal(5, 0)) * cast(1 as boolean) FROM t --- !query 254 schema +-- !query schema struct<> --- !query 254 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 255 +-- !query SELECT cast(1 as decimal(10, 0)) * cast(1 as boolean) FROM t --- !query 255 schema +-- !query schema struct<> --- !query 255 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 256 +-- !query SELECT cast(1 as decimal(20, 0)) * cast(1 as boolean) FROM t --- !query 256 schema +-- !query schema struct<> --- !query 256 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST(1 AS 
BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 257 +-- !query SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t --- !query 257 schema +-- !query schema struct<> --- !query 257 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 258 +-- !query SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t --- !query 258 schema +-- !query schema struct<> --- !query 258 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 259 +-- !query SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t --- !query 259 schema +-- !query schema struct<> --- !query 259 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 260 +-- !query SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t --- !query 260 schema +-- !query schema struct<> --- !query 260 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and 
timestamp).; line 1 pos 7 --- !query 261 +-- !query SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00' as date) FROM t --- !query 261 schema +-- !query schema struct<> --- !query 261 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 262 +-- !query SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00' as date) FROM t --- !query 262 schema +-- !query schema struct<> --- !query 262 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 263 +-- !query SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00' as date) FROM t --- !query 263 schema +-- !query schema struct<> --- !query 263 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 264 +-- !query SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00' as date) FROM t --- !query 264 schema +-- !query schema struct<> --- !query 264 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 265 +-- !query SELECT cast(1 as tinyint) / cast(1 as decimal(3, 0)) FROM t --- !query 265 schema +-- 
!query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) / CAST(1 AS DECIMAL(3,0))):decimal(9,6)> --- !query 265 output -1 +-- !query output +1.000000 --- !query 266 +-- !query SELECT cast(1 as tinyint) / cast(1 as decimal(5, 0)) FROM t --- !query 266 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,6)> --- !query 266 output -1 +-- !query output +1.000000 --- !query 267 +-- !query SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t --- !query 267 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> --- !query 267 output -1 +-- !query output +1.00000000000 --- !query 268 +-- !query SELECT cast(1 as tinyint) / cast(1 as decimal(20, 0)) FROM t --- !query 268 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,21)> --- !query 268 output -1 +-- !query output +1.000000000000000000000 --- !query 269 +-- !query SELECT cast(1 as smallint) / cast(1 as decimal(3, 0)) FROM t --- !query 269 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(11,6)> --- !query 269 output -1 +-- !query output +1.000000 --- !query 270 +-- !query SELECT cast(1 as smallint) / cast(1 as decimal(5, 0)) FROM t --- !query 270 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) / CAST(1 AS DECIMAL(5,0))):decimal(11,6)> --- !query 270 output -1 +-- !query output +1.000000 --- !query 271 +-- !query SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t --- !query 271 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> --- !query 271 
output -1 +-- !query output +1.00000000000 --- !query 272 +-- !query SELECT cast(1 as smallint) / cast(1 as decimal(20, 0)) FROM t --- !query 272 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,21)> --- !query 272 output -1 +-- !query output +1.000000000000000000000 --- !query 273 +-- !query SELECT cast(1 as int) / cast(1 as decimal(3, 0)) FROM t --- !query 273 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 273 output -1 +-- !query output +1.000000 --- !query 274 +-- !query SELECT cast(1 as int) / cast(1 as decimal(5, 0)) FROM t --- !query 274 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 274 output -1 +-- !query output +1.000000 --- !query 275 +-- !query SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t --- !query 275 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> --- !query 275 output -1 +-- !query output +1.00000000000 --- !query 276 +-- !query SELECT cast(1 as int) / cast(1 as decimal(20, 0)) FROM t --- !query 276 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> --- !query 276 output -1 +-- !query output +1.000000000000000000000 --- !query 277 +-- !query SELECT cast(1 as bigint) / cast(1 as decimal(3, 0)) FROM t --- !query 277 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(26,6)> --- !query 277 output -1 +-- !query output +1.000000 --- !query 278 +-- !query SELECT cast(1 as bigint) / cast(1 as decimal(5, 0)) FROM t --- 
!query 278 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,6)> --- !query 278 output -1 +-- !query output +1.000000 --- !query 279 +-- !query SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t --- !query 279 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> --- !query 279 output -1 +-- !query output +1.00000000000 --- !query 280 +-- !query SELECT cast(1 as bigint) / cast(1 as decimal(20, 0)) FROM t --- !query 280 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,18)> --- !query 280 output -1 +-- !query output +1.000000000000000000 --- !query 281 +-- !query SELECT cast(1 as float) / cast(1 as decimal(3, 0)) FROM t --- !query 281 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) AS DOUBLE)):double> --- !query 281 output +-- !query output 1.0 --- !query 282 +-- !query SELECT cast(1 as float) / cast(1 as decimal(5, 0)) FROM t --- !query 282 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) AS DOUBLE)):double> --- !query 282 output +-- !query output 1.0 --- !query 283 +-- !query SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t --- !query 283 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) AS DOUBLE)):double> --- !query 283 output +-- !query output 1.0 --- !query 284 +-- !query SELECT cast(1 as float) / cast(1 as decimal(20, 0)) FROM t --- !query 284 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) AS DOUBLE)):double> --- !query 284 output +-- !query output 1.0 --- !query 285 +-- !query SELECT cast(1 as double) / 
cast(1 as decimal(3, 0)) FROM t --- !query 285 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 285 output +-- !query output 1.0 --- !query 286 +-- !query SELECT cast(1 as double) / cast(1 as decimal(5, 0)) FROM t --- !query 286 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 286 output +-- !query output 1.0 --- !query 287 +-- !query SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t --- !query 287 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 287 output +-- !query output 1.0 --- !query 288 +-- !query SELECT cast(1 as double) / cast(1 as decimal(20, 0)) FROM t --- !query 288 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 288 output +-- !query output 1.0 --- !query 289 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(3, 0)) FROM t --- !query 289 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 289 output -1 +-- !query output +1.000000 --- !query 290 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(5, 0)) FROM t --- !query 290 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 290 output -1 +-- !query output +1.000000 --- !query 291 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 291 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> --- !query 291 output -1 +-- !query output +1.00000000000 --- !query 292 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(20, 0)) FROM t --- !query 292 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS 
DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> --- !query 292 output -1 +-- !query output +1.000000000000000000000 --- !query 293 +-- !query SELECT cast('1' as binary) / cast(1 as decimal(3, 0)) FROM t --- !query 293 schema +-- !query schema struct<> --- !query 293 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 294 +-- !query SELECT cast('1' as binary) / cast(1 as decimal(5, 0)) FROM t --- !query 294 schema +-- !query schema struct<> --- !query 294 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 295 +-- !query SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t --- !query 295 schema +-- !query schema struct<> --- !query 295 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 296 +-- !query SELECT cast('1' as binary) / cast(1 as decimal(20, 0)) FROM t --- !query 296 schema +-- !query schema struct<> --- !query 296 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 297 +-- !query SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(3, 0)) FROM t --- !query 297 schema +-- !query schema struct<> --- !query 297 
output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 298 +-- !query SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(5, 0)) FROM t --- !query 298 schema +-- !query schema struct<> --- !query 298 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 299 +-- !query SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t --- !query 299 schema +-- !query schema struct<> --- !query 299 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 300 +-- !query SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(20, 0)) FROM t --- !query 300 schema +-- !query schema struct<> --- !query 300 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 301 +-- !query SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(3, 0)) FROM t --- !query 301 schema +-- !query schema struct<> --- !query 301 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 302 +-- !query SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(5, 0)) FROM t --- !query 302 schema +-- !query schema struct<> --- !query 302 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 303 +-- !query SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t --- !query 303 schema +-- !query schema struct<> --- !query 303 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 304 +-- !query SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(20, 0)) FROM t --- !query 304 schema +-- !query schema struct<> --- !query 304 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 305 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as tinyint) FROM t --- !query 305 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) / CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(9,6)> --- !query 305 output -1 +-- !query output +1.000000 --- !query 306 +-- !query SELECT cast(1 as decimal(5, 0)) / 
cast(1 as tinyint) FROM t --- !query 306 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(11,6)> --- !query 306 output -1 +-- !query output +1.000000 --- !query 307 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t --- !query 307 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 307 output -1 +-- !query output +1.000000 --- !query 308 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as tinyint) FROM t --- !query 308 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(26,6)> --- !query 308 output -1 +-- !query output +1.000000 --- !query 309 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as smallint) FROM t --- !query 309 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,6)> --- !query 309 output -1 +-- !query output +1.000000 --- !query 310 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as smallint) FROM t --- !query 310 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) / CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(11,6)> --- !query 310 output -1 +-- !query output +1.000000 --- !query 311 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t --- !query 311 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 311 output -1 +-- !query output +1.000000 --- !query 312 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as smallint) FROM t --- !query 312 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 
AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,6)> --- !query 312 output -1 +-- !query output +1.000000 --- !query 313 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as int) FROM t --- !query 313 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> --- !query 313 output -1 +-- !query output +1.00000000000 --- !query 314 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as int) FROM t --- !query 314 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> --- !query 314 output -1 +-- !query output +1.00000000000 --- !query 315 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM t --- !query 315 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) / CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,11)> --- !query 315 output -1 +-- !query output +1.00000000000 --- !query 316 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as int) FROM t --- !query 316 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> --- !query 316 output -1 +-- !query output +1.00000000000 --- !query 317 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as bigint) FROM t --- !query 317 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,21)> --- !query 317 output -1 +-- !query output +1.000000000000000000000 --- !query 318 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as bigint) FROM t --- !query 318 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,21)> --- !query 318 output -1 +-- !query output +1.000000000000000000000 
--- !query 319 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t --- !query 319 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> --- !query 319 output -1 +-- !query output +1.000000000000000000000 --- !query 320 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as bigint) FROM t --- !query 320 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,18)> --- !query 320 output -1 +-- !query output +1.000000000000000000 --- !query 321 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as float) FROM t --- !query 321 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 321 output +-- !query output 1.0 --- !query 322 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as float) FROM t --- !query 322 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 322 output +-- !query output 1.0 --- !query 323 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t --- !query 323 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 323 output +-- !query output 1.0 --- !query 324 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as float) FROM t --- !query 324 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 324 output +-- !query output 1.0 --- !query 325 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as double) FROM t --- !query 325 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 325 output +-- !query output 1.0 --- !query 326 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as double) FROM t --- 
!query 326 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 326 output +-- !query output 1.0 --- !query 327 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t --- !query 327 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 327 output +-- !query output 1.0 --- !query 328 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as double) FROM t --- !query 328 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 328 output +-- !query output 1.0 --- !query 329 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 329 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> --- !query 329 output -1 +-- !query output +1.00000000000 --- !query 330 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 330 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> --- !query 330 output -1 +-- !query output +1.00000000000 --- !query 331 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 331 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> --- !query 331 output -1 +-- !query output +1.00000000000 --- !query 332 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 332 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> --- !query 332 output -1 +-- !query output +1.00000000000 --- !query 333 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as string) FROM t --- !query 333 schema +-- !query schema 
struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 333 output +-- !query output 1.0 --- !query 334 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as string) FROM t --- !query 334 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 334 output +-- !query output 1.0 --- !query 335 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t --- !query 335 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 335 output +-- !query output 1.0 --- !query 336 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as string) FROM t --- !query 336 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 336 output +-- !query output 1.0 --- !query 337 +-- !query SELECT cast(1 as decimal(3, 0)) / cast('1' as binary) FROM t --- !query 337 schema +-- !query schema struct<> --- !query 337 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 338 +-- !query SELECT cast(1 as decimal(5, 0)) / cast('1' as binary) FROM t --- !query 338 schema +-- !query schema struct<> --- !query 338 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 339 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t --- !query 339 schema +-- !query schema struct<> --- !query 339 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 340 +-- !query SELECT cast(1 as decimal(20, 0)) / cast('1' as binary) FROM t --- !query 340 schema +-- !query schema struct<> --- !query 340 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 341 +-- !query SELECT cast(1 as decimal(3, 0)) / cast(1 as boolean) FROM t --- !query 341 schema +-- !query schema struct<> --- !query 341 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 342 +-- !query SELECT cast(1 as decimal(5, 0)) / cast(1 as boolean) FROM t --- !query 342 schema +-- !query schema struct<> --- !query 342 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 343 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t --- !query 343 schema +-- !query schema struct<> --- !query 343 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 344 +-- !query SELECT cast(1 as decimal(20, 0)) / cast(1 as boolean) FROM t --- !query 344 schema +-- !query schema struct<> --- 
!query 344 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 345 +-- !query SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t --- !query 345 schema +-- !query schema struct<> --- !query 345 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 346 +-- !query SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t --- !query 346 schema +-- !query schema struct<> --- !query 346 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 347 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t --- !query 347 schema +-- !query schema struct<> --- !query 347 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 348 +-- !query SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t --- !query 348 schema +-- !query schema struct<> --- !query 348 output +-- !query output org.apache.spark.sql.AnalysisException cannot 
resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 349 +-- !query SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00' as date) FROM t --- !query 349 schema +-- !query schema struct<> --- !query 349 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 350 +-- !query SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00' as date) FROM t --- !query 350 schema +-- !query schema struct<> --- !query 350 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 351 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00' as date) FROM t --- !query 351 schema +-- !query schema struct<> --- !query 351 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 352 +-- !query SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00' as date) FROM t --- !query 352 schema +-- !query schema struct<> --- !query 352 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in 
'(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 353 +-- !query SELECT cast(1 as tinyint) % cast(1 as decimal(3, 0)) FROM t --- !query 353 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) % CAST(1 AS DECIMAL(3,0))):decimal(3,0)> --- !query 353 output +-- !query output 0 --- !query 354 +-- !query SELECT cast(1 as tinyint) % cast(1 as decimal(5, 0)) FROM t --- !query 354 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(3,0)> --- !query 354 output +-- !query output 0 --- !query 355 +-- !query SELECT cast(1 as tinyint) % cast(1 as decimal(10, 0)) FROM t --- !query 355 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 355 output +-- !query output 0 --- !query 356 +-- !query SELECT cast(1 as tinyint) % cast(1 as decimal(20, 0)) FROM t --- !query 356 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(3,0)> --- !query 356 output +-- !query output 0 --- !query 357 +-- !query SELECT cast(1 as smallint) % cast(1 as decimal(3, 0)) FROM t --- !query 357 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(3,0)> --- !query 357 output +-- !query output 0 --- !query 358 +-- !query SELECT cast(1 as smallint) % cast(1 as decimal(5, 0)) FROM t --- !query 358 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) % CAST(1 AS DECIMAL(5,0))):decimal(5,0)> --- !query 358 output +-- !query output 0 --- !query 359 +-- !query SELECT cast(1 as smallint) % cast(1 as decimal(10, 0)) FROM t --- !query 359 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS 
SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 359 output +-- !query output 0 --- !query 360 +-- !query SELECT cast(1 as smallint) % cast(1 as decimal(20, 0)) FROM t --- !query 360 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(5,0)> --- !query 360 output +-- !query output 0 --- !query 361 +-- !query SELECT cast(1 as int) % cast(1 as decimal(3, 0)) FROM t --- !query 361 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 361 output +-- !query output 0 --- !query 362 +-- !query SELECT cast(1 as int) % cast(1 as decimal(5, 0)) FROM t --- !query 362 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 362 output +-- !query output 0 --- !query 363 +-- !query SELECT cast(1 as int) % cast(1 as decimal(10, 0)) FROM t --- !query 363 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> --- !query 363 output +-- !query output 0 --- !query 364 +-- !query SELECT cast(1 as int) % cast(1 as decimal(20, 0)) FROM t --- !query 364 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 364 output +-- !query output 0 --- !query 365 +-- !query SELECT cast(1 as bigint) % cast(1 as decimal(3, 0)) FROM t --- !query 365 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> --- !query 365 output +-- !query output 0 --- !query 366 +-- !query SELECT cast(1 as bigint) % cast(1 as decimal(5, 0)) FROM t --- 
!query 366 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(5,0)> --- !query 366 output +-- !query output 0 --- !query 367 +-- !query SELECT cast(1 as bigint) % cast(1 as decimal(10, 0)) FROM t --- !query 367 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 367 output +-- !query output 0 --- !query 368 +-- !query SELECT cast(1 as bigint) % cast(1 as decimal(20, 0)) FROM t --- !query 368 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) % CAST(1 AS DECIMAL(20,0))):decimal(20,0)> --- !query 368 output +-- !query output 0 --- !query 369 +-- !query SELECT cast(1 as float) % cast(1 as decimal(3, 0)) FROM t --- !query 369 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> --- !query 369 output +-- !query output 0.0 --- !query 370 +-- !query SELECT cast(1 as float) % cast(1 as decimal(5, 0)) FROM t --- !query 370 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 370 output +-- !query output 0.0 --- !query 371 +-- !query SELECT cast(1 as float) % cast(1 as decimal(10, 0)) FROM t --- !query 371 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 371 output +-- !query output 0.0 --- !query 372 +-- !query SELECT cast(1 as float) % cast(1 as decimal(20, 0)) FROM t --- !query 372 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 372 output +-- !query output 0.0 --- !query 373 +-- !query SELECT cast(1 as double) % cast(1 as decimal(3, 0)) FROM t --- !query 373 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS 
DECIMAL(3,0)) AS DOUBLE)):double> --- !query 373 output +-- !query output 0.0 --- !query 374 +-- !query SELECT cast(1 as double) % cast(1 as decimal(5, 0)) FROM t --- !query 374 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> --- !query 374 output +-- !query output 0.0 --- !query 375 +-- !query SELECT cast(1 as double) % cast(1 as decimal(10, 0)) FROM t --- !query 375 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 375 output +-- !query output 0.0 --- !query 376 +-- !query SELECT cast(1 as double) % cast(1 as decimal(20, 0)) FROM t --- !query 376 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> --- !query 376 output +-- !query output 0.0 --- !query 377 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(3, 0)) FROM t --- !query 377 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 377 output +-- !query output 0 --- !query 378 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(5, 0)) FROM t --- !query 378 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 378 output +-- !query output 0 --- !query 379 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t --- !query 379 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> --- !query 379 output +-- !query output 0 --- !query 380 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(20, 0)) FROM t --- !query 380 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 380 output +-- !query output 0 --- !query 381 +-- !query SELECT 
cast('1' as binary) % cast(1 as decimal(3, 0)) FROM t --- !query 381 schema +-- !query schema struct<> --- !query 381 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 382 +-- !query SELECT cast('1' as binary) % cast(1 as decimal(5, 0)) FROM t --- !query 382 schema +-- !query schema struct<> --- !query 382 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 383 +-- !query SELECT cast('1' as binary) % cast(1 as decimal(10, 0)) FROM t --- !query 383 schema +-- !query schema struct<> --- !query 383 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 384 +-- !query SELECT cast('1' as binary) % cast(1 as decimal(20, 0)) FROM t --- !query 384 schema +-- !query schema struct<> --- !query 384 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 385 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(3, 0)) FROM t --- !query 385 schema +-- !query schema struct<> --- !query 385 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: 
differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 386 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(5, 0)) FROM t --- !query 386 schema +-- !query schema struct<> --- !query 386 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 387 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(10, 0)) FROM t --- !query 387 schema +-- !query schema struct<> --- !query 387 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 388 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(20, 0)) FROM t --- !query 388 schema +-- !query schema struct<> --- !query 388 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 389 +-- !query SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(3, 0)) FROM t --- !query 389 schema +-- !query schema struct<> --- !query 389 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' 
AS DATE) % CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 390 +-- !query SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(5, 0)) FROM t --- !query 390 schema +-- !query schema struct<> --- !query 390 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 391 +-- !query SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(10, 0)) FROM t --- !query 391 schema +-- !query schema struct<> --- !query 391 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 392 +-- !query SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(20, 0)) FROM t --- !query 392 schema +-- !query schema struct<> --- !query 392 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 393 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as tinyint) FROM t --- !query 393 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) % CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(3,0)> --- !query 393 output +-- !query output 0 --- !query 394 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as tinyint) FROM t --- !query 394 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(3,0)> --- 
!query 394 output +-- !query output 0 --- !query 395 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as tinyint) FROM t --- !query 395 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 395 output +-- !query output 0 --- !query 396 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as tinyint) FROM t --- !query 396 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> --- !query 396 output +-- !query output 0 --- !query 397 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as smallint) FROM t --- !query 397 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(3,0)> --- !query 397 output +-- !query output 0 --- !query 398 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as smallint) FROM t --- !query 398 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) % CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(5,0)> --- !query 398 output +-- !query output 0 --- !query 399 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as smallint) FROM t --- !query 399 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 399 output +-- !query output 0 --- !query 400 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as smallint) FROM t --- !query 400 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(5,0)> --- !query 400 output +-- !query output 0 --- !query 401 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as int) FROM t --- !query 401 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS 
DECIMAL(10,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 401 output +-- !query output 0 --- !query 402 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as int) FROM t --- !query 402 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 402 output +-- !query output 0 --- !query 403 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as int) FROM t --- !query 403 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) % CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(10,0)> --- !query 403 output +-- !query output 0 --- !query 404 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as int) FROM t --- !query 404 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 404 output +-- !query output 0 --- !query 405 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as bigint) FROM t --- !query 405 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(3,0)> --- !query 405 output +-- !query output 0 --- !query 406 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as bigint) FROM t --- !query 406 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(5,0)> --- !query 406 output +-- !query output 0 --- !query 407 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as bigint) FROM t --- !query 407 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 407 output +-- !query output 0 --- !query 408 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as bigint) FROM t --- !query 408 
schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) % CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(20,0)> --- !query 408 output +-- !query output 0 --- !query 409 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as float) FROM t --- !query 409 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 409 output +-- !query output 0.0 --- !query 410 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as float) FROM t --- !query 410 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 410 output +-- !query output 0.0 --- !query 411 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as float) FROM t --- !query 411 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 411 output +-- !query output 0.0 --- !query 412 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as float) FROM t --- !query 412 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 412 output +-- !query output 0.0 --- !query 413 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as double) FROM t --- !query 413 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 413 output +-- !query output 0.0 --- !query 414 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as double) FROM t --- !query 414 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 414 output +-- !query output 0.0 --- !query 415 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as double) FROM t --- !query 415 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 415 output +-- !query output 0.0 --- !query 416 +-- !query SELECT cast(1 as decimal(20, 
0)) % cast(1 as double) FROM t --- !query 416 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 416 output +-- !query output 0.0 --- !query 417 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as decimal(10, 0)) FROM t --- !query 417 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> --- !query 417 output +-- !query output 0 --- !query 418 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as decimal(10, 0)) FROM t --- !query 418 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> --- !query 418 output +-- !query output 0 --- !query 419 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t --- !query 419 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> --- !query 419 output +-- !query output 0 --- !query 420 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as decimal(10, 0)) FROM t --- !query 420 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> --- !query 420 output +-- !query output 0 --- !query 421 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as string) FROM t --- !query 421 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 421 output +-- !query output 0.0 --- !query 422 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as string) FROM t --- !query 422 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 422 output +-- !query output 0.0 --- !query 423 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as string) FROM t --- !query 423 schema +-- !query schema 
struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 423 output +-- !query output 0.0 --- !query 424 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as string) FROM t --- !query 424 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 424 output +-- !query output 0.0 --- !query 425 +-- !query SELECT cast(1 as decimal(3, 0)) % cast('1' as binary) FROM t --- !query 425 schema +-- !query schema struct<> --- !query 425 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 426 +-- !query SELECT cast(1 as decimal(5, 0)) % cast('1' as binary) FROM t --- !query 426 schema +-- !query schema struct<> --- !query 426 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 427 +-- !query SELECT cast(1 as decimal(10, 0)) % cast('1' as binary) FROM t --- !query 427 schema +-- !query schema struct<> --- !query 427 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 428 +-- !query SELECT cast(1 as decimal(20, 0)) % cast('1' as binary) FROM t --- !query 428 schema +-- !query schema struct<> --- !query 428 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS 
DECIMAL(20,0)) % CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 429 +-- !query SELECT cast(1 as decimal(3, 0)) % cast(1 as boolean) FROM t --- !query 429 schema +-- !query schema struct<> --- !query 429 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 430 +-- !query SELECT cast(1 as decimal(5, 0)) % cast(1 as boolean) FROM t --- !query 430 schema +-- !query schema struct<> --- !query 430 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 431 +-- !query SELECT cast(1 as decimal(10, 0)) % cast(1 as boolean) FROM t --- !query 431 schema +-- !query schema struct<> --- !query 431 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 432 +-- !query SELECT cast(1 as decimal(20, 0)) % cast(1 as boolean) FROM t --- !query 432 schema +-- !query schema struct<> --- !query 432 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 433 +-- !query SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 433 schema +-- !query schema struct<> --- !query 433 output +-- !query output org.apache.spark.sql.AnalysisException cannot 
resolve '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 434 +-- !query SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 434 schema +-- !query schema struct<> --- !query 434 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 435 +-- !query SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 435 schema +-- !query schema struct<> --- !query 435 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 436 +-- !query SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 436 schema +-- !query schema struct<> --- !query 436 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 437 +-- !query SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 09:30:00' as date) FROM t --- !query 437 schema +-- !query schema struct<> --- !query 437 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) % 
CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 438 +-- !query SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00' as date) FROM t --- !query 438 schema +-- !query schema struct<> --- !query 438 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 439 +-- !query SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00' as date) FROM t --- !query 439 schema +-- !query schema struct<> --- !query 439 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 440 +-- !query SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00' as date) FROM t --- !query 440 schema +-- !query schema struct<> --- !query 440 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 441 +-- !query SELECT pmod(cast(1 as tinyint), cast(1 as decimal(3, 0))) FROM t --- !query 441 schema +-- !query schema struct --- !query 441 output +-- !query output 0 --- !query 442 +-- !query SELECT pmod(cast(1 as tinyint), cast(1 as decimal(5, 0))) FROM t --- !query 442 schema +-- !query schema struct --- !query 442 output +-- !query output 0 --- !query 443 +-- !query SELECT pmod(cast(1 as 
tinyint), cast(1 as decimal(10, 0))) FROM t --- !query 443 schema +-- !query schema struct --- !query 443 output +-- !query output 0 --- !query 444 +-- !query SELECT pmod(cast(1 as tinyint), cast(1 as decimal(20, 0))) FROM t --- !query 444 schema +-- !query schema struct --- !query 444 output +-- !query output 0 --- !query 445 +-- !query SELECT pmod(cast(1 as smallint), cast(1 as decimal(3, 0))) FROM t --- !query 445 schema +-- !query schema struct --- !query 445 output +-- !query output 0 --- !query 446 +-- !query SELECT pmod(cast(1 as smallint), cast(1 as decimal(5, 0))) FROM t --- !query 446 schema +-- !query schema struct --- !query 446 output +-- !query output 0 --- !query 447 +-- !query SELECT pmod(cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t --- !query 447 schema +-- !query schema struct --- !query 447 output +-- !query output 0 --- !query 448 +-- !query SELECT pmod(cast(1 as smallint), cast(1 as decimal(20, 0))) FROM t --- !query 448 schema +-- !query schema struct --- !query 448 output +-- !query output 0 --- !query 449 +-- !query SELECT pmod(cast(1 as int), cast(1 as decimal(3, 0))) FROM t --- !query 449 schema +-- !query schema struct --- !query 449 output +-- !query output 0 --- !query 450 +-- !query SELECT pmod(cast(1 as int), cast(1 as decimal(5, 0))) FROM t --- !query 450 schema +-- !query schema struct --- !query 450 output +-- !query output 0 --- !query 451 +-- !query SELECT pmod(cast(1 as int), cast(1 as decimal(10, 0))) FROM t --- !query 451 schema +-- !query schema struct --- !query 451 output +-- !query output 0 --- !query 452 +-- !query SELECT pmod(cast(1 as int), cast(1 as decimal(20, 0))) FROM t --- !query 452 schema +-- !query schema struct --- !query 452 output +-- !query output 0 --- !query 453 +-- !query SELECT pmod(cast(1 as bigint), cast(1 as decimal(3, 0))) FROM t --- !query 453 schema +-- !query schema struct --- !query 453 output +-- !query output 0 --- !query 454 +-- !query SELECT pmod(cast(1 as bigint), cast(1 as 
decimal(5, 0))) FROM t --- !query 454 schema +-- !query schema struct --- !query 454 output +-- !query output 0 --- !query 455 +-- !query SELECT pmod(cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t --- !query 455 schema +-- !query schema struct --- !query 455 output +-- !query output 0 --- !query 456 +-- !query SELECT pmod(cast(1 as bigint), cast(1 as decimal(20, 0))) FROM t --- !query 456 schema +-- !query schema struct --- !query 456 output +-- !query output 0 --- !query 457 +-- !query SELECT pmod(cast(1 as float), cast(1 as decimal(3, 0))) FROM t --- !query 457 schema +-- !query schema struct --- !query 457 output +-- !query output 0.0 --- !query 458 +-- !query SELECT pmod(cast(1 as float), cast(1 as decimal(5, 0))) FROM t --- !query 458 schema +-- !query schema struct --- !query 458 output +-- !query output 0.0 --- !query 459 +-- !query SELECT pmod(cast(1 as float), cast(1 as decimal(10, 0))) FROM t --- !query 459 schema +-- !query schema struct --- !query 459 output +-- !query output 0.0 --- !query 460 +-- !query SELECT pmod(cast(1 as float), cast(1 as decimal(20, 0))) FROM t --- !query 460 schema +-- !query schema struct --- !query 460 output +-- !query output 0.0 --- !query 461 +-- !query SELECT pmod(cast(1 as double), cast(1 as decimal(3, 0))) FROM t --- !query 461 schema +-- !query schema struct --- !query 461 output +-- !query output 0.0 --- !query 462 +-- !query SELECT pmod(cast(1 as double), cast(1 as decimal(5, 0))) FROM t --- !query 462 schema +-- !query schema struct --- !query 462 output +-- !query output 0.0 --- !query 463 +-- !query SELECT pmod(cast(1 as double), cast(1 as decimal(10, 0))) FROM t --- !query 463 schema +-- !query schema struct --- !query 463 output +-- !query output 0.0 --- !query 464 +-- !query SELECT pmod(cast(1 as double), cast(1 as decimal(20, 0))) FROM t --- !query 464 schema +-- !query schema struct --- !query 464 output +-- !query output 0.0 --- !query 465 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as 
decimal(3, 0))) FROM t --- !query 465 schema +-- !query schema struct --- !query 465 output +-- !query output 0 --- !query 466 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(5, 0))) FROM t --- !query 466 schema +-- !query schema struct --- !query 466 output +-- !query output 0 --- !query 467 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t --- !query 467 schema +-- !query schema struct --- !query 467 output +-- !query output 0 --- !query 468 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(20, 0))) FROM t --- !query 468 schema +-- !query schema struct --- !query 468 output +-- !query output 0 --- !query 469 +-- !query SELECT pmod(cast('1' as binary), cast(1 as decimal(3, 0))) FROM t --- !query 469 schema +-- !query schema struct<> --- !query 469 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 470 +-- !query SELECT pmod(cast('1' as binary), cast(1 as decimal(5, 0))) FROM t --- !query 470 schema +-- !query schema struct<> --- !query 470 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 471 +-- !query SELECT pmod(cast('1' as binary), cast(1 as decimal(10, 0))) FROM t --- !query 471 schema +-- !query schema struct<> --- !query 471 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 472 +-- !query SELECT 
pmod(cast('1' as binary), cast(1 as decimal(20, 0))) FROM t --- !query 472 schema +-- !query schema struct<> --- !query 472 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 473 +-- !query SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(3, 0))) FROM t --- !query 473 schema +-- !query schema struct<> --- !query 473 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 474 +-- !query SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(5, 0))) FROM t --- !query 474 schema +-- !query schema struct<> --- !query 474 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 475 +-- !query SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t --- !query 475 schema +-- !query schema struct<> --- !query 475 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 476 +-- !query SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as 
decimal(20, 0))) FROM t --- !query 476 schema +-- !query schema struct<> --- !query 476 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 477 +-- !query SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(3, 0))) FROM t --- !query 477 schema +-- !query schema struct<> --- !query 477 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 478 +-- !query SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(5, 0))) FROM t --- !query 478 schema +-- !query schema struct<> --- !query 478 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 479 +-- !query SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(10, 0))) FROM t --- !query 479 schema +-- !query schema struct<> --- !query 479 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 480 +-- !query SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(20, 0))) FROM t --- !query 480 schema +-- !query schema struct<> --- 
!query 480 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 481 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as tinyint)) FROM t --- !query 481 schema +-- !query schema struct --- !query 481 output +-- !query output 0 --- !query 482 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as tinyint)) FROM t --- !query 482 schema +-- !query schema struct --- !query 482 output +-- !query output 0 --- !query 483 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t --- !query 483 schema +-- !query schema struct --- !query 483 output +-- !query output 0 --- !query 484 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as tinyint)) FROM t --- !query 484 schema +-- !query schema struct --- !query 484 output +-- !query output 0 --- !query 485 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as smallint)) FROM t --- !query 485 schema +-- !query schema struct --- !query 485 output +-- !query output 0 --- !query 486 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as smallint)) FROM t --- !query 486 schema +-- !query schema struct --- !query 486 output +-- !query output 0 --- !query 487 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t --- !query 487 schema +-- !query schema struct --- !query 487 output +-- !query output 0 --- !query 488 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as smallint)) FROM t --- !query 488 schema +-- !query schema struct --- !query 488 output +-- !query output 0 --- !query 489 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as int)) FROM t --- !query 489 schema +-- !query schema struct --- !query 489 output +-- !query output 0 --- !query 490 +-- !query SELECT pmod(cast(1 as 
decimal(5, 0)) , cast(1 as int)) FROM t --- !query 490 schema +-- !query schema struct --- !query 490 output +-- !query output 0 --- !query 491 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as int)) FROM t --- !query 491 schema +-- !query schema struct --- !query 491 output +-- !query output 0 --- !query 492 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as int)) FROM t --- !query 492 schema +-- !query schema struct --- !query 492 output +-- !query output 0 --- !query 493 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as bigint)) FROM t --- !query 493 schema +-- !query schema struct --- !query 493 output +-- !query output 0 --- !query 494 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as bigint)) FROM t --- !query 494 schema +-- !query schema struct --- !query 494 output +-- !query output 0 --- !query 495 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t --- !query 495 schema +-- !query schema struct --- !query 495 output +-- !query output 0 --- !query 496 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as bigint)) FROM t --- !query 496 schema +-- !query schema struct --- !query 496 output +-- !query output 0 --- !query 497 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as float)) FROM t --- !query 497 schema +-- !query schema struct --- !query 497 output +-- !query output 0.0 --- !query 498 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as float)) FROM t --- !query 498 schema +-- !query schema struct --- !query 498 output +-- !query output 0.0 --- !query 499 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as float)) FROM t --- !query 499 schema +-- !query schema struct --- !query 499 output +-- !query output 0.0 --- !query 500 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as float)) FROM t --- !query 500 schema +-- !query schema struct --- !query 500 output +-- !query output 0.0 --- !query 501 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 
as double)) FROM t --- !query 501 schema +-- !query schema struct --- !query 501 output +-- !query output 0.0 --- !query 502 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as double)) FROM t --- !query 502 schema +-- !query schema struct --- !query 502 output +-- !query output 0.0 --- !query 503 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as double)) FROM t --- !query 503 schema +-- !query schema struct --- !query 503 output +-- !query output 0.0 --- !query 504 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as double)) FROM t --- !query 504 schema +-- !query schema struct --- !query 504 output +-- !query output 0.0 --- !query 505 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as decimal(10, 0))) FROM t --- !query 505 schema +-- !query schema struct --- !query 505 output +-- !query output 0 --- !query 506 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as decimal(10, 0))) FROM t --- !query 506 schema +-- !query schema struct --- !query 506 output +-- !query output 0 --- !query 507 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t --- !query 507 schema +-- !query schema struct --- !query 507 output +-- !query output 0 --- !query 508 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as decimal(10, 0))) FROM t --- !query 508 schema +-- !query schema struct --- !query 508 output +-- !query output 0 --- !query 509 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as string)) FROM t --- !query 509 schema +-- !query schema struct --- !query 509 output +-- !query output 0.0 --- !query 510 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as string)) FROM t --- !query 510 schema +-- !query schema struct --- !query 510 output +-- !query output 0.0 --- !query 511 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as string)) FROM t --- !query 511 schema +-- !query schema struct --- !query 511 output +-- !query output 0.0 --- !query 512 +-- !query SELECT pmod(cast(1 
as decimal(20, 0)), cast(1 as string)) FROM t --- !query 512 schema +-- !query schema struct --- !query 512 output +-- !query output 0.0 --- !query 513 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast('1' as binary)) FROM t --- !query 513 schema +-- !query schema struct<> --- !query 513 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 514 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast('1' as binary)) FROM t --- !query 514 schema +-- !query schema struct<> --- !query 514 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 515 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t --- !query 515 schema +-- !query schema struct<> --- !query 515 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 516 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast('1' as binary)) FROM t --- !query 516 schema +-- !query schema struct<> --- !query 516 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 517 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as boolean)) FROM t --- !query 517 schema +-- !query schema struct<> 
--- !query 517 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 518 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as boolean)) FROM t --- !query 518 schema +-- !query schema struct<> --- !query 518 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 519 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t --- !query 519 schema +-- !query schema struct<> --- !query 519 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 520 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as boolean)) FROM t --- !query 520 schema +-- !query schema struct<> --- !query 520 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 521 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 521 schema +-- !query schema struct<> --- !query 521 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), 
CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 522 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 522 schema +-- !query schema struct<> --- !query 522 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 523 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 523 schema +-- !query schema struct<> --- !query 523 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 524 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 524 schema +-- !query schema struct<> --- !query 524 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 525 +-- !query SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t --- !query 525 schema +-- !query schema struct<> --- !query 525 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), 
CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 526 +-- !query SELECT pmod(cast(1 as decimal(5, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t --- !query 526 schema +-- !query schema struct<> --- !query 526 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 527 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 527 schema +-- !query schema struct<> --- !query 527 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 528 +-- !query SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 528 schema +-- !query schema struct<> --- !query 528 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 529 +-- !query SELECT cast(1 as tinyint) = cast(1 as decimal(3, 0)) FROM t --- !query 529 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) = CAST(1 AS DECIMAL(3,0))):boolean> --- !query 529 output +-- !query output true --- !query 530 +-- !query SELECT cast(1 as tinyint) = cast(1 as decimal(5, 0)) FROM t --- !query 530 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS 
DECIMAL(5,0))):boolean> --- !query 530 output +-- !query output true --- !query 531 +-- !query SELECT cast(1 as tinyint) = cast(1 as decimal(10, 0)) FROM t --- !query 531 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 531 output +-- !query output true --- !query 532 +-- !query SELECT cast(1 as tinyint) = cast(1 as decimal(20, 0)) FROM t --- !query 532 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 532 output +-- !query output true --- !query 533 +-- !query SELECT cast(1 as smallint) = cast(1 as decimal(3, 0)) FROM t --- !query 533 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 533 output +-- !query output true --- !query 534 +-- !query SELECT cast(1 as smallint) = cast(1 as decimal(5, 0)) FROM t --- !query 534 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) = CAST(1 AS DECIMAL(5,0))):boolean> --- !query 534 output +-- !query output true --- !query 535 +-- !query SELECT cast(1 as smallint) = cast(1 as decimal(10, 0)) FROM t --- !query 535 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 535 output +-- !query output true --- !query 536 +-- !query SELECT cast(1 as smallint) = cast(1 as decimal(20, 0)) FROM t --- !query 536 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 536 output +-- !query output true --- !query 537 +-- !query SELECT cast(1 as int) = cast(1 as decimal(3, 0)) FROM t --- !query 537 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS 
INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 537 output +-- !query output true --- !query 538 +-- !query SELECT cast(1 as int) = cast(1 as decimal(5, 0)) FROM t --- !query 538 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 538 output +-- !query output true --- !query 539 +-- !query SELECT cast(1 as int) = cast(1 as decimal(10, 0)) FROM t --- !query 539 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> --- !query 539 output +-- !query output true --- !query 540 +-- !query SELECT cast(1 as int) = cast(1 as decimal(20, 0)) FROM t --- !query 540 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 540 output +-- !query output true --- !query 541 +-- !query SELECT cast(1 as bigint) = cast(1 as decimal(3, 0)) FROM t --- !query 541 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 541 output +-- !query output true --- !query 542 +-- !query SELECT cast(1 as bigint) = cast(1 as decimal(5, 0)) FROM t --- !query 542 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 542 output +-- !query output true --- !query 543 +-- !query SELECT cast(1 as bigint) = cast(1 as decimal(10, 0)) FROM t --- !query 543 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 543 output +-- !query output true --- !query 544 +-- !query SELECT cast(1 as bigint) = cast(1 as decimal(20, 0)) FROM t --- !query 544 
schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) = CAST(1 AS DECIMAL(20,0))):boolean> --- !query 544 output +-- !query output true --- !query 545 +-- !query SELECT cast(1 as float) = cast(1 as decimal(3, 0)) FROM t --- !query 545 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 545 output +-- !query output true --- !query 546 +-- !query SELECT cast(1 as float) = cast(1 as decimal(5, 0)) FROM t --- !query 546 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 546 output +-- !query output true --- !query 547 +-- !query SELECT cast(1 as float) = cast(1 as decimal(10, 0)) FROM t --- !query 547 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 547 output +-- !query output true --- !query 548 +-- !query SELECT cast(1 as float) = cast(1 as decimal(20, 0)) FROM t --- !query 548 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 548 output +-- !query output true --- !query 549 +-- !query SELECT cast(1 as double) = cast(1 as decimal(3, 0)) FROM t --- !query 549 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 549 output +-- !query output true --- !query 550 +-- !query SELECT cast(1 as double) = cast(1 as decimal(5, 0)) FROM t --- !query 550 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 550 output +-- !query output true --- !query 551 +-- !query SELECT cast(1 as double) = cast(1 as decimal(10, 0)) FROM t --- !query 551 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 551 output +-- !query output true --- !query 552 +-- !query SELECT cast(1 as 
double) = cast(1 as decimal(20, 0)) FROM t --- !query 552 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 552 output +-- !query output true --- !query 553 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(3, 0)) FROM t --- !query 553 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 553 output +-- !query output true --- !query 554 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(5, 0)) FROM t --- !query 554 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 554 output +-- !query output true --- !query 555 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t --- !query 555 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> --- !query 555 output +-- !query output true --- !query 556 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(20, 0)) FROM t --- !query 556 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 556 output +-- !query output true --- !query 557 +-- !query SELECT cast('1' as binary) = cast(1 as decimal(3, 0)) FROM t --- !query 557 schema +-- !query schema struct<> --- !query 557 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 558 +-- !query SELECT cast('1' as binary) = cast(1 as decimal(5, 0)) FROM t --- !query 558 schema +-- !query schema struct<> --- !query 558 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS 
BINARY) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 559 +-- !query SELECT cast('1' as binary) = cast(1 as decimal(10, 0)) FROM t --- !query 559 schema +-- !query schema struct<> --- !query 559 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 560 +-- !query SELECT cast('1' as binary) = cast(1 as decimal(20, 0)) FROM t --- !query 560 schema +-- !query schema struct<> --- !query 560 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 561 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(3, 0)) FROM t --- !query 561 schema +-- !query schema struct<> --- !query 561 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 562 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(5, 0)) FROM t --- !query 562 schema +-- !query schema struct<> --- !query 562 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- 
!query 563 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(10, 0)) FROM t --- !query 563 schema +-- !query schema struct<> --- !query 563 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 564 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(20, 0)) FROM t --- !query 564 schema +-- !query schema struct<> --- !query 564 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 565 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(3, 0)) FROM t --- !query 565 schema +-- !query schema struct<> --- !query 565 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 566 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(5, 0)) FROM t --- !query 566 schema +-- !query schema struct<> --- !query 566 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 567 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as 
decimal(10, 0)) FROM t --- !query 567 schema +-- !query schema struct<> --- !query 567 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 568 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(20, 0)) FROM t --- !query 568 schema +-- !query schema struct<> --- !query 568 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 569 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as tinyint) FROM t --- !query 569 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> --- !query 569 output +-- !query output true --- !query 570 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as tinyint) FROM t --- !query 570 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 570 output +-- !query output true --- !query 571 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as tinyint) FROM t --- !query 571 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 571 output +-- !query output true --- !query 572 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as tinyint) FROM t --- !query 572 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 572 output +-- 
!query output true --- !query 573 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as smallint) FROM t --- !query 573 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 573 output +-- !query output true --- !query 574 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as smallint) FROM t --- !query 574 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 574 output +-- !query output true --- !query 575 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as smallint) FROM t --- !query 575 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 575 output +-- !query output true --- !query 576 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as smallint) FROM t --- !query 576 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 576 output +-- !query output true --- !query 577 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as int) FROM t --- !query 577 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 577 output +-- !query output true --- !query 578 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as int) FROM t --- !query 578 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 578 output +-- !query output true --- !query 579 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as int) FROM t --- !query 579 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 
579 output +-- !query output true --- !query 580 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as int) FROM t --- !query 580 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 580 output +-- !query output true --- !query 581 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as bigint) FROM t --- !query 581 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 581 output +-- !query output true --- !query 582 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as bigint) FROM t --- !query 582 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 582 output +-- !query output true --- !query 583 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as bigint) FROM t --- !query 583 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 583 output +-- !query output true --- !query 584 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as bigint) FROM t --- !query 584 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 584 output +-- !query output true --- !query 585 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as float) FROM t --- !query 585 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 585 output +-- !query output true --- !query 586 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as float) FROM t --- !query 586 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 586 output +-- 
!query output true --- !query 587 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as float) FROM t --- !query 587 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 587 output +-- !query output true --- !query 588 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as float) FROM t --- !query 588 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 588 output +-- !query output true --- !query 589 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as double) FROM t --- !query 589 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 589 output +-- !query output true --- !query 590 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as double) FROM t --- !query 590 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 590 output +-- !query output true --- !query 591 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as double) FROM t --- !query 591 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 591 output +-- !query output true --- !query 592 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as double) FROM t --- !query 592 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 592 output +-- !query output true --- !query 593 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as decimal(10, 0)) FROM t --- !query 593 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 593 output +-- !query output true --- !query 594 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as decimal(10, 0)) FROM t --- !query 594 schema +-- !query schema struct<(CAST(CAST(1 AS 
DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 594 output +-- !query output true --- !query 595 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t --- !query 595 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> --- !query 595 output +-- !query output true --- !query 596 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as decimal(10, 0)) FROM t --- !query 596 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 596 output +-- !query output true --- !query 597 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as string) FROM t --- !query 597 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 597 output +-- !query output true --- !query 598 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as string) FROM t --- !query 598 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 598 output +-- !query output true --- !query 599 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 as string) FROM t --- !query 599 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 599 output +-- !query output true --- !query 600 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as string) FROM t --- !query 600 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 600 output +-- !query output true --- !query 601 +-- !query SELECT cast(1 as decimal(3, 0)) = cast('1' as binary) FROM t --- !query 601 schema +-- !query schema struct<> --- !query 601 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = 
CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 602 +-- !query SELECT cast(1 as decimal(5, 0)) = cast('1' as binary) FROM t --- !query 602 schema +-- !query schema struct<> --- !query 602 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 603 +-- !query SELECT cast(1 as decimal(10, 0)) = cast('1' as binary) FROM t --- !query 603 schema +-- !query schema struct<> --- !query 603 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 604 +-- !query SELECT cast(1 as decimal(20, 0)) = cast('1' as binary) FROM t --- !query 604 schema +-- !query schema struct<> --- !query 604 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 605 +-- !query SELECT cast(1 as decimal(3, 0)) = cast(1 as boolean) FROM t --- !query 605 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0))):boolean> --- !query 605 output +-- !query output true --- !query 606 +-- !query SELECT cast(1 as decimal(5, 0)) = cast(1 as boolean) FROM t --- !query 606 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0))):boolean> --- !query 606 output +-- !query output true --- !query 607 +-- !query SELECT cast(1 as decimal(10, 0)) = cast(1 
as boolean) FROM t --- !query 607 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0))):boolean> --- !query 607 output +-- !query output true --- !query 608 +-- !query SELECT cast(1 as decimal(20, 0)) = cast(1 as boolean) FROM t --- !query 608 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0))):boolean> --- !query 608 output +-- !query output true --- !query 609 +-- !query SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 609 schema +-- !query schema struct<> --- !query 609 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 610 +-- !query SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 610 schema +-- !query schema struct<> --- !query 610 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 611 +-- !query SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 611 schema +-- !query schema struct<> --- !query 611 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 612 +-- !query SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 
09:30:00.0' as timestamp) FROM t --- !query 612 schema +-- !query schema struct<> --- !query 612 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 613 +-- !query SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00' as date) FROM t --- !query 613 schema +-- !query schema struct<> --- !query 613 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 614 +-- !query SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00' as date) FROM t --- !query 614 schema +-- !query schema struct<> --- !query 614 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 615 +-- !query SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00' as date) FROM t --- !query 615 schema +-- !query schema struct<> --- !query 615 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 616 +-- !query SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 09:30:00' as date) FROM t --- !query 616 schema +-- !query schema struct<> --- !query 616 output +-- !query 
output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 617 +-- !query SELECT cast(1 as tinyint) <=> cast(1 as decimal(3, 0)) FROM t --- !query 617 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) <=> CAST(1 AS DECIMAL(3,0))):boolean> --- !query 617 output +-- !query output true --- !query 618 +-- !query SELECT cast(1 as tinyint) <=> cast(1 as decimal(5, 0)) FROM t --- !query 618 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 618 output +-- !query output true --- !query 619 +-- !query SELECT cast(1 as tinyint) <=> cast(1 as decimal(10, 0)) FROM t --- !query 619 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 619 output +-- !query output true --- !query 620 +-- !query SELECT cast(1 as tinyint) <=> cast(1 as decimal(20, 0)) FROM t --- !query 620 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 620 output +-- !query output true --- !query 621 +-- !query SELECT cast(1 as smallint) <=> cast(1 as decimal(3, 0)) FROM t --- !query 621 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 621 output +-- !query output true --- !query 622 +-- !query SELECT cast(1 as smallint) <=> cast(1 as decimal(5, 0)) FROM t --- !query 622 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) <=> CAST(1 AS DECIMAL(5,0))):boolean> --- !query 622 
output +-- !query output true --- !query 623 +-- !query SELECT cast(1 as smallint) <=> cast(1 as decimal(10, 0)) FROM t --- !query 623 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 623 output +-- !query output true --- !query 624 +-- !query SELECT cast(1 as smallint) <=> cast(1 as decimal(20, 0)) FROM t --- !query 624 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 624 output +-- !query output true --- !query 625 +-- !query SELECT cast(1 as int) <=> cast(1 as decimal(3, 0)) FROM t --- !query 625 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 625 output +-- !query output true --- !query 626 +-- !query SELECT cast(1 as int) <=> cast(1 as decimal(5, 0)) FROM t --- !query 626 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 626 output +-- !query output true --- !query 627 +-- !query SELECT cast(1 as int) <=> cast(1 as decimal(10, 0)) FROM t --- !query 627 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> --- !query 627 output +-- !query output true --- !query 628 +-- !query SELECT cast(1 as int) <=> cast(1 as decimal(20, 0)) FROM t --- !query 628 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 628 output +-- !query output true --- !query 629 +-- !query SELECT cast(1 as bigint) <=> cast(1 as decimal(3, 0)) FROM t --- !query 629 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS 
DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 629 output +-- !query output true --- !query 630 +-- !query SELECT cast(1 as bigint) <=> cast(1 as decimal(5, 0)) FROM t --- !query 630 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 630 output +-- !query output true --- !query 631 +-- !query SELECT cast(1 as bigint) <=> cast(1 as decimal(10, 0)) FROM t --- !query 631 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 631 output +-- !query output true --- !query 632 +-- !query SELECT cast(1 as bigint) <=> cast(1 as decimal(20, 0)) FROM t --- !query 632 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) <=> CAST(1 AS DECIMAL(20,0))):boolean> --- !query 632 output +-- !query output true --- !query 633 +-- !query SELECT cast(1 as float) <=> cast(1 as decimal(3, 0)) FROM t --- !query 633 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 633 output +-- !query output true --- !query 634 +-- !query SELECT cast(1 as float) <=> cast(1 as decimal(5, 0)) FROM t --- !query 634 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 634 output +-- !query output true --- !query 635 +-- !query SELECT cast(1 as float) <=> cast(1 as decimal(10, 0)) FROM t --- !query 635 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 635 output +-- !query output true --- !query 636 +-- !query SELECT cast(1 as float) <=> cast(1 as decimal(20, 0)) FROM t --- !query 636 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS 
DOUBLE)):boolean> --- !query 636 output +-- !query output true --- !query 637 +-- !query SELECT cast(1 as double) <=> cast(1 as decimal(3, 0)) FROM t --- !query 637 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 637 output +-- !query output true --- !query 638 +-- !query SELECT cast(1 as double) <=> cast(1 as decimal(5, 0)) FROM t --- !query 638 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 638 output +-- !query output true --- !query 639 +-- !query SELECT cast(1 as double) <=> cast(1 as decimal(10, 0)) FROM t --- !query 639 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 639 output +-- !query output true --- !query 640 +-- !query SELECT cast(1 as double) <=> cast(1 as decimal(20, 0)) FROM t --- !query 640 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 640 output +-- !query output true --- !query 641 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(3, 0)) FROM t --- !query 641 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 641 output +-- !query output true --- !query 642 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(5, 0)) FROM t --- !query 642 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 642 output +-- !query output true --- !query 643 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t --- !query 643 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> --- !query 643 output +-- !query output true --- !query 644 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as 
decimal(20, 0)) FROM t --- !query 644 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 644 output +-- !query output true --- !query 645 +-- !query SELECT cast('1' as binary) <=> cast(1 as decimal(3, 0)) FROM t --- !query 645 schema +-- !query schema struct<> --- !query 645 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 646 +-- !query SELECT cast('1' as binary) <=> cast(1 as decimal(5, 0)) FROM t --- !query 646 schema +-- !query schema struct<> --- !query 646 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 647 +-- !query SELECT cast('1' as binary) <=> cast(1 as decimal(10, 0)) FROM t --- !query 647 schema +-- !query schema struct<> --- !query 647 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 648 +-- !query SELECT cast('1' as binary) <=> cast(1 as decimal(20, 0)) FROM t --- !query 648 schema +-- !query schema struct<> --- !query 648 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 649 +-- !query SELECT cast('2017-12-11 09:30:00.0' as 
timestamp) <=> cast(1 as decimal(3, 0)) FROM t --- !query 649 schema +-- !query schema struct<> --- !query 649 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 650 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(5, 0)) FROM t --- !query 650 schema +-- !query schema struct<> --- !query 650 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 651 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(10, 0)) FROM t --- !query 651 schema +-- !query schema struct<> --- !query 651 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 652 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(20, 0)) FROM t --- !query 652 schema +-- !query schema struct<> --- !query 652 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 653 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> 
cast(1 as decimal(3, 0)) FROM t --- !query 653 schema +-- !query schema struct<> --- !query 653 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 654 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(5, 0)) FROM t --- !query 654 schema +-- !query schema struct<> --- !query 654 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 655 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(10, 0)) FROM t --- !query 655 schema +-- !query schema struct<> --- !query 655 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 656 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(20, 0)) FROM t --- !query 656 schema +-- !query schema struct<> --- !query 656 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 657 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as tinyint) FROM t --- !query 657 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) <=> CAST(CAST(1 AS TINYINT) 
AS DECIMAL(3,0))):boolean> --- !query 657 output +-- !query output true --- !query 658 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as tinyint) FROM t --- !query 658 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 658 output +-- !query output true --- !query 659 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as tinyint) FROM t --- !query 659 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 659 output +-- !query output true --- !query 660 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as tinyint) FROM t --- !query 660 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 660 output +-- !query output true --- !query 661 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as smallint) FROM t --- !query 661 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 661 output +-- !query output true --- !query 662 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as smallint) FROM t --- !query 662 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) <=> CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 662 output +-- !query output true --- !query 663 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as smallint) FROM t --- !query 663 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 663 output +-- !query output true --- !query 664 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as smallint) FROM t --- !query 664 schema +-- !query schema 
struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 664 output +-- !query output true --- !query 665 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as int) FROM t --- !query 665 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 665 output +-- !query output true --- !query 666 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as int) FROM t --- !query 666 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 666 output +-- !query output true --- !query 667 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as int) FROM t --- !query 667 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 667 output +-- !query output true --- !query 668 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as int) FROM t --- !query 668 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 668 output +-- !query output true --- !query 669 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as bigint) FROM t --- !query 669 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 669 output +-- !query output true --- !query 670 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as bigint) FROM t --- !query 670 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 670 output +-- !query output true --- !query 671 +-- !query SELECT cast(1 as decimal(10, 
0)) <=> cast(1 as bigint) FROM t --- !query 671 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 671 output +-- !query output true --- !query 672 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as bigint) FROM t --- !query 672 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) <=> CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 672 output +-- !query output true --- !query 673 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as float) FROM t --- !query 673 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 673 output +-- !query output true --- !query 674 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as float) FROM t --- !query 674 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 674 output +-- !query output true --- !query 675 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as float) FROM t --- !query 675 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 675 output +-- !query output true --- !query 676 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as float) FROM t --- !query 676 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 676 output +-- !query output true --- !query 677 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as double) FROM t --- !query 677 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 677 output +-- !query output true --- !query 678 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as double) FROM t --- !query 678 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS 
DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 678 output +-- !query output true --- !query 679 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as double) FROM t --- !query 679 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 679 output +-- !query output true --- !query 680 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as double) FROM t --- !query 680 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 680 output +-- !query output true --- !query 681 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as decimal(10, 0)) FROM t --- !query 681 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 681 output +-- !query output true --- !query 682 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as decimal(10, 0)) FROM t --- !query 682 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 682 output +-- !query output true --- !query 683 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t --- !query 683 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> --- !query 683 output +-- !query output true --- !query 684 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as decimal(10, 0)) FROM t --- !query 684 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 684 output +-- !query output true --- !query 685 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as string) FROM t --- !query 685 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 685 output +-- !query 
output true --- !query 686 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as string) FROM t --- !query 686 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 686 output +-- !query output true --- !query 687 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as string) FROM t --- !query 687 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 687 output +-- !query output true --- !query 688 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as string) FROM t --- !query 688 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 688 output +-- !query output true --- !query 689 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast('1' as binary) FROM t --- !query 689 schema +-- !query schema struct<> --- !query 689 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 690 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast('1' as binary) FROM t --- !query 690 schema +-- !query schema struct<> --- !query 690 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 691 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast('1' as binary) FROM t --- !query 691 schema +-- !query schema struct<> --- !query 691 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in 
'(CAST(1 AS DECIMAL(10,0)) <=> CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 692 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast('1' as binary) FROM t --- !query 692 schema +-- !query schema struct<> --- !query 692 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <=> CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 693 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast(1 as boolean) FROM t --- !query 693 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0))):boolean> --- !query 693 output +-- !query output true --- !query 694 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast(1 as boolean) FROM t --- !query 694 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0))):boolean> --- !query 694 output +-- !query output true --- !query 695 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast(1 as boolean) FROM t --- !query 695 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0))):boolean> --- !query 695 output +-- !query output true --- !query 696 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast(1 as boolean) FROM t --- !query 696 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0))):boolean> --- !query 696 output +-- !query output true --- !query 697 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 697 schema +-- !query schema struct<> --- !query 697 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 
09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 698 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 698 schema +-- !query schema struct<> --- !query 698 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 699 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 699 schema +-- !query schema struct<> --- !query 699 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 700 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 700 schema +-- !query schema struct<> --- !query 700 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 701 +-- !query SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 701 schema +-- !query schema struct<> --- !query 701 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' 
(decimal(3,0) and date).; line 1 pos 7 --- !query 702 +-- !query SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 702 schema +-- !query schema struct<> --- !query 702 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 703 +-- !query SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 703 schema +-- !query schema struct<> --- !query 703 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 704 +-- !query SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 704 schema +-- !query schema struct<> --- !query 704 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 705 +-- !query SELECT cast(1 as tinyint) < cast(1 as decimal(3, 0)) FROM t --- !query 705 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) < CAST(1 AS DECIMAL(3,0))):boolean> --- !query 705 output +-- !query output false --- !query 706 +-- !query SELECT cast(1 as tinyint) < cast(1 as decimal(5, 0)) FROM t --- !query 706 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 706 output +-- !query 
output false --- !query 707 +-- !query SELECT cast(1 as tinyint) < cast(1 as decimal(10, 0)) FROM t --- !query 707 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 707 output +-- !query output false --- !query 708 +-- !query SELECT cast(1 as tinyint) < cast(1 as decimal(20, 0)) FROM t --- !query 708 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 708 output +-- !query output false --- !query 709 +-- !query SELECT cast(1 as smallint) < cast(1 as decimal(3, 0)) FROM t --- !query 709 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 709 output +-- !query output false --- !query 710 +-- !query SELECT cast(1 as smallint) < cast(1 as decimal(5, 0)) FROM t --- !query 710 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) < CAST(1 AS DECIMAL(5,0))):boolean> --- !query 710 output +-- !query output false --- !query 711 +-- !query SELECT cast(1 as smallint) < cast(1 as decimal(10, 0)) FROM t --- !query 711 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 711 output +-- !query output false --- !query 712 +-- !query SELECT cast(1 as smallint) < cast(1 as decimal(20, 0)) FROM t --- !query 712 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 712 output +-- !query output false --- !query 713 +-- !query SELECT cast(1 as int) < cast(1 as decimal(3, 0)) FROM t --- !query 713 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) < 
CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 713 output +-- !query output false --- !query 714 +-- !query SELECT cast(1 as int) < cast(1 as decimal(5, 0)) FROM t --- !query 714 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 714 output +-- !query output false --- !query 715 +-- !query SELECT cast(1 as int) < cast(1 as decimal(10, 0)) FROM t --- !query 715 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> --- !query 715 output +-- !query output false --- !query 716 +-- !query SELECT cast(1 as int) < cast(1 as decimal(20, 0)) FROM t --- !query 716 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 716 output +-- !query output false --- !query 717 +-- !query SELECT cast(1 as bigint) < cast(1 as decimal(3, 0)) FROM t --- !query 717 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 717 output +-- !query output false --- !query 718 +-- !query SELECT cast(1 as bigint) < cast(1 as decimal(5, 0)) FROM t --- !query 718 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 718 output +-- !query output false --- !query 719 +-- !query SELECT cast(1 as bigint) < cast(1 as decimal(10, 0)) FROM t --- !query 719 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 719 output +-- !query output false --- !query 720 +-- !query SELECT cast(1 as bigint) < cast(1 as decimal(20, 0)) FROM t --- !query 720 schema +-- !query schema 
struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) < CAST(1 AS DECIMAL(20,0))):boolean> --- !query 720 output +-- !query output false --- !query 721 +-- !query SELECT cast(1 as float) < cast(1 as decimal(3, 0)) FROM t --- !query 721 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 721 output +-- !query output false --- !query 722 +-- !query SELECT cast(1 as float) < cast(1 as decimal(5, 0)) FROM t --- !query 722 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 722 output +-- !query output false --- !query 723 +-- !query SELECT cast(1 as float) < cast(1 as decimal(10, 0)) FROM t --- !query 723 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 723 output +-- !query output false --- !query 724 +-- !query SELECT cast(1 as float) < cast(1 as decimal(20, 0)) FROM t --- !query 724 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 724 output +-- !query output false --- !query 725 +-- !query SELECT cast(1 as double) < cast(1 as decimal(3, 0)) FROM t --- !query 725 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 725 output +-- !query output false --- !query 726 +-- !query SELECT cast(1 as double) < cast(1 as decimal(5, 0)) FROM t --- !query 726 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 726 output +-- !query output false --- !query 727 +-- !query SELECT cast(1 as double) < cast(1 as decimal(10, 0)) FROM t --- !query 727 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 727 output +-- !query output false --- !query 728 +-- !query SELECT cast(1 as double) < cast(1 as 
decimal(20, 0)) FROM t --- !query 728 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 728 output +-- !query output false --- !query 729 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(3, 0)) FROM t --- !query 729 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 729 output +-- !query output false --- !query 730 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(5, 0)) FROM t --- !query 730 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 730 output +-- !query output false --- !query 731 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t --- !query 731 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> --- !query 731 output +-- !query output false --- !query 732 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(20, 0)) FROM t --- !query 732 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 732 output +-- !query output false --- !query 733 +-- !query SELECT cast('1' as binary) < cast(1 as decimal(3, 0)) FROM t --- !query 733 schema +-- !query schema struct<> --- !query 733 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 734 +-- !query SELECT cast('1' as binary) < cast(1 as decimal(5, 0)) FROM t --- !query 734 schema +-- !query schema struct<> --- !query 734 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) < CAST(1 
AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 735 +-- !query SELECT cast('1' as binary) < cast(1 as decimal(10, 0)) FROM t --- !query 735 schema +-- !query schema struct<> --- !query 735 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 736 +-- !query SELECT cast('1' as binary) < cast(1 as decimal(20, 0)) FROM t --- !query 736 schema +-- !query schema struct<> --- !query 736 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 737 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(3, 0)) FROM t --- !query 737 schema +-- !query schema struct<> --- !query 737 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 738 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(5, 0)) FROM t --- !query 738 schema +-- !query schema struct<> --- !query 738 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 739 +-- 
!query SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(10, 0)) FROM t --- !query 739 schema +-- !query schema struct<> --- !query 739 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 740 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(20, 0)) FROM t --- !query 740 schema +-- !query schema struct<> --- !query 740 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 741 +-- !query SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(3, 0)) FROM t --- !query 741 schema +-- !query schema struct<> --- !query 741 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 742 +-- !query SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(5, 0)) FROM t --- !query 742 schema +-- !query schema struct<> --- !query 742 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 743 +-- !query SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(10, 0)) FROM t 
--- !query 743 schema +-- !query schema struct<> --- !query 743 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 744 +-- !query SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(20, 0)) FROM t --- !query 744 schema +-- !query schema struct<> --- !query 744 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 745 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as tinyint) FROM t --- !query 745 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) < CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> --- !query 745 output +-- !query output false --- !query 746 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as tinyint) FROM t --- !query 746 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 746 output +-- !query output false --- !query 747 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as tinyint) FROM t --- !query 747 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 747 output +-- !query output false --- !query 748 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as tinyint) FROM t --- !query 748 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 748 output +-- !query output false 
--- !query 749 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as smallint) FROM t --- !query 749 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 749 output +-- !query output false --- !query 750 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as smallint) FROM t --- !query 750 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) < CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 750 output +-- !query output false --- !query 751 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as smallint) FROM t --- !query 751 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 751 output +-- !query output false --- !query 752 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as smallint) FROM t --- !query 752 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 752 output +-- !query output false --- !query 753 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as int) FROM t --- !query 753 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 753 output +-- !query output false --- !query 754 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as int) FROM t --- !query 754 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 754 output +-- !query output false --- !query 755 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as int) FROM t --- !query 755 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) < CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 755 output +-- 
!query output false --- !query 756 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as int) FROM t --- !query 756 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 756 output +-- !query output false --- !query 757 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as bigint) FROM t --- !query 757 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 757 output +-- !query output false --- !query 758 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as bigint) FROM t --- !query 758 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 758 output +-- !query output false --- !query 759 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as bigint) FROM t --- !query 759 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 759 output +-- !query output false --- !query 760 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as bigint) FROM t --- !query 760 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) < CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 760 output +-- !query output false --- !query 761 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as float) FROM t --- !query 761 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 761 output +-- !query output false --- !query 762 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as float) FROM t --- !query 762 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 762 output +-- !query 
output false --- !query 763 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as float) FROM t --- !query 763 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 763 output +-- !query output false --- !query 764 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as float) FROM t --- !query 764 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 764 output +-- !query output false --- !query 765 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as double) FROM t --- !query 765 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 765 output +-- !query output false --- !query 766 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as double) FROM t --- !query 766 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 766 output +-- !query output false --- !query 767 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as double) FROM t --- !query 767 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 767 output +-- !query output false --- !query 768 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as double) FROM t --- !query 768 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 768 output +-- !query output false --- !query 769 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as decimal(10, 0)) FROM t --- !query 769 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 769 output +-- !query output false --- !query 770 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as decimal(10, 0)) FROM t --- !query 770 schema +-- !query schema struct<(CAST(CAST(1 AS 
DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 770 output +-- !query output false --- !query 771 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t --- !query 771 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> --- !query 771 output +-- !query output false --- !query 772 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as decimal(10, 0)) FROM t --- !query 772 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 772 output +-- !query output false --- !query 773 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as string) FROM t --- !query 773 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 773 output +-- !query output false --- !query 774 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as string) FROM t --- !query 774 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 774 output +-- !query output false --- !query 775 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as string) FROM t --- !query 775 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 775 output +-- !query output false --- !query 776 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as string) FROM t --- !query 776 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 776 output +-- !query output false --- !query 777 +-- !query SELECT cast(1 as decimal(3, 0)) < cast('1' as binary) FROM t --- !query 777 schema +-- !query schema struct<> --- !query 777 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS 
DECIMAL(3,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 778 +-- !query SELECT cast(1 as decimal(5, 0)) < cast('1' as binary) FROM t --- !query 778 schema +-- !query schema struct<> --- !query 778 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 779 +-- !query SELECT cast(1 as decimal(10, 0)) < cast('1' as binary) FROM t --- !query 779 schema +-- !query schema struct<> --- !query 779 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 780 +-- !query SELECT cast(1 as decimal(20, 0)) < cast('1' as binary) FROM t --- !query 780 schema +-- !query schema struct<> --- !query 780 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 781 +-- !query SELECT cast(1 as decimal(3, 0)) < cast(1 as boolean) FROM t --- !query 781 schema +-- !query schema struct<> --- !query 781 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 782 +-- !query SELECT cast(1 as decimal(5, 0)) < cast(1 as boolean) FROM t --- !query 782 schema +-- !query schema struct<> --- !query 782 
output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 783 +-- !query SELECT cast(1 as decimal(10, 0)) < cast(1 as boolean) FROM t --- !query 783 schema +-- !query schema struct<> --- !query 783 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 784 +-- !query SELECT cast(1 as decimal(20, 0)) < cast(1 as boolean) FROM t --- !query 784 schema +-- !query schema struct<> --- !query 784 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 785 +-- !query SELECT cast(1 as decimal(3, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 785 schema +-- !query schema struct<> --- !query 785 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 786 +-- !query SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 786 schema +-- !query schema struct<> --- !query 786 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < 
CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 787 +-- !query SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 787 schema +-- !query schema struct<> --- !query 787 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 788 +-- !query SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 788 schema +-- !query schema struct<> --- !query 788 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 789 +-- !query SELECT cast(1 as decimal(3, 0)) < cast('2017-12-11 09:30:00' as date) FROM t --- !query 789 schema +-- !query schema struct<> --- !query 789 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 790 +-- !query SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00' as date) FROM t --- !query 790 schema +-- !query schema struct<> --- !query 790 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 
7 --- !query 791 +-- !query SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00' as date) FROM t --- !query 791 schema +-- !query schema struct<> --- !query 791 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 792 +-- !query SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00' as date) FROM t --- !query 792 schema +-- !query schema struct<> --- !query 792 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 793 +-- !query SELECT cast(1 as tinyint) <= cast(1 as decimal(3, 0)) FROM t --- !query 793 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) <= CAST(1 AS DECIMAL(3,0))):boolean> --- !query 793 output +-- !query output true --- !query 794 +-- !query SELECT cast(1 as tinyint) <= cast(1 as decimal(5, 0)) FROM t --- !query 794 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 794 output +-- !query output true --- !query 795 +-- !query SELECT cast(1 as tinyint) <= cast(1 as decimal(10, 0)) FROM t --- !query 795 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 795 output +-- !query output true --- !query 796 +-- !query SELECT cast(1 as tinyint) <= cast(1 as decimal(20, 0)) FROM t --- !query 796 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS 
DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 796 output +-- !query output true --- !query 797 +-- !query SELECT cast(1 as smallint) <= cast(1 as decimal(3, 0)) FROM t --- !query 797 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 797 output +-- !query output true --- !query 798 +-- !query SELECT cast(1 as smallint) <= cast(1 as decimal(5, 0)) FROM t --- !query 798 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) <= CAST(1 AS DECIMAL(5,0))):boolean> --- !query 798 output +-- !query output true --- !query 799 +-- !query SELECT cast(1 as smallint) <= cast(1 as decimal(10, 0)) FROM t --- !query 799 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 799 output +-- !query output true --- !query 800 +-- !query SELECT cast(1 as smallint) <= cast(1 as decimal(20, 0)) FROM t --- !query 800 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 800 output +-- !query output true --- !query 801 +-- !query SELECT cast(1 as int) <= cast(1 as decimal(3, 0)) FROM t --- !query 801 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 801 output +-- !query output true --- !query 802 +-- !query SELECT cast(1 as int) <= cast(1 as decimal(5, 0)) FROM t --- !query 802 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 802 output +-- !query output true --- !query 803 +-- !query SELECT cast(1 as int) <= cast(1 as decimal(10, 0)) FROM t --- !query 803 
schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 803 output +-- !query output true --- !query 804 +-- !query SELECT cast(1 as int) <= cast(1 as decimal(20, 0)) FROM t --- !query 804 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 804 output +-- !query output true --- !query 805 +-- !query SELECT cast(1 as bigint) <= cast(1 as decimal(3, 0)) FROM t --- !query 805 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 805 output +-- !query output true --- !query 806 +-- !query SELECT cast(1 as bigint) <= cast(1 as decimal(5, 0)) FROM t --- !query 806 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 806 output +-- !query output true --- !query 807 +-- !query SELECT cast(1 as bigint) <= cast(1 as decimal(10, 0)) FROM t --- !query 807 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 807 output +-- !query output true --- !query 808 +-- !query SELECT cast(1 as bigint) <= cast(1 as decimal(20, 0)) FROM t --- !query 808 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) <= CAST(1 AS DECIMAL(20,0))):boolean> --- !query 808 output +-- !query output true --- !query 809 +-- !query SELECT cast(1 as float) <= cast(1 as decimal(3, 0)) FROM t --- !query 809 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 809 output +-- !query output true --- !query 810 +-- !query SELECT cast(1 as float) <= cast(1 as decimal(5, 0)) FROM t --- !query 810 schema +-- 
!query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 810 output +-- !query output true --- !query 811 +-- !query SELECT cast(1 as float) <= cast(1 as decimal(10, 0)) FROM t --- !query 811 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 811 output +-- !query output true --- !query 812 +-- !query SELECT cast(1 as float) <= cast(1 as decimal(20, 0)) FROM t --- !query 812 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 812 output +-- !query output true --- !query 813 +-- !query SELECT cast(1 as double) <= cast(1 as decimal(3, 0)) FROM t --- !query 813 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 813 output +-- !query output true --- !query 814 +-- !query SELECT cast(1 as double) <= cast(1 as decimal(5, 0)) FROM t --- !query 814 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 814 output +-- !query output true --- !query 815 +-- !query SELECT cast(1 as double) <= cast(1 as decimal(10, 0)) FROM t --- !query 815 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 815 output +-- !query output true --- !query 816 +-- !query SELECT cast(1 as double) <= cast(1 as decimal(20, 0)) FROM t --- !query 816 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 816 output +-- !query output true --- !query 817 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(3, 0)) FROM t --- !query 817 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 817 output +-- !query output true --- !query 818 
+-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(5, 0)) FROM t --- !query 818 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 818 output +-- !query output true --- !query 819 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t --- !query 819 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 819 output +-- !query output true --- !query 820 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(20, 0)) FROM t --- !query 820 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 820 output +-- !query output true --- !query 821 +-- !query SELECT cast('1' as binary) <= cast(1 as decimal(3, 0)) FROM t --- !query 821 schema +-- !query schema struct<> --- !query 821 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 822 +-- !query SELECT cast('1' as binary) <= cast(1 as decimal(5, 0)) FROM t --- !query 822 schema +-- !query schema struct<> --- !query 822 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 823 +-- !query SELECT cast('1' as binary) <= cast(1 as decimal(10, 0)) FROM t --- !query 823 schema +-- !query schema struct<> --- !query 823 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 
'(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 824 +-- !query SELECT cast('1' as binary) <= cast(1 as decimal(20, 0)) FROM t --- !query 824 schema +-- !query schema struct<> --- !query 824 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 825 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(3, 0)) FROM t --- !query 825 schema +-- !query schema struct<> --- !query 825 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 826 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(5, 0)) FROM t --- !query 826 schema +-- !query schema struct<> --- !query 826 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 827 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(10, 0)) FROM t --- !query 827 schema +-- !query schema struct<> --- !query 827 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 
--- !query 828 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(20, 0)) FROM t --- !query 828 schema +-- !query schema struct<> --- !query 828 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 829 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(3, 0)) FROM t --- !query 829 schema +-- !query schema struct<> --- !query 829 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 830 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(5, 0)) FROM t --- !query 830 schema +-- !query schema struct<> --- !query 830 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 831 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(10, 0)) FROM t --- !query 831 schema +-- !query schema struct<> --- !query 831 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 832 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(20, 0)) 
FROM t --- !query 832 schema +-- !query schema struct<> --- !query 832 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 833 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as tinyint) FROM t --- !query 833 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) <= CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> --- !query 833 output +-- !query output true --- !query 834 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as tinyint) FROM t --- !query 834 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 834 output +-- !query output true --- !query 835 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as tinyint) FROM t --- !query 835 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 835 output +-- !query output true --- !query 836 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as tinyint) FROM t --- !query 836 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 836 output +-- !query output true --- !query 837 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as smallint) FROM t --- !query 837 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 837 output +-- !query output true --- !query 838 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as smallint) FROM t --- !query 838 schema +-- !query schema struct<(CAST(1 AS 
DECIMAL(5,0)) <= CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 838 output +-- !query output true --- !query 839 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as smallint) FROM t --- !query 839 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 839 output +-- !query output true --- !query 840 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as smallint) FROM t --- !query 840 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 840 output +-- !query output true --- !query 841 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as int) FROM t --- !query 841 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 841 output +-- !query output true --- !query 842 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as int) FROM t --- !query 842 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 842 output +-- !query output true --- !query 843 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as int) FROM t --- !query 843 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 843 output +-- !query output true --- !query 844 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as int) FROM t --- !query 844 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 844 output +-- !query output true --- !query 845 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as bigint) FROM t --- !query 845 schema +-- !query 
schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 845 output +-- !query output true --- !query 846 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as bigint) FROM t --- !query 846 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 846 output +-- !query output true --- !query 847 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as bigint) FROM t --- !query 847 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 847 output +-- !query output true --- !query 848 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as bigint) FROM t --- !query 848 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) <= CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 848 output +-- !query output true --- !query 849 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as float) FROM t --- !query 849 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 849 output +-- !query output true --- !query 850 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as float) FROM t --- !query 850 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 850 output +-- !query output true --- !query 851 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as float) FROM t --- !query 851 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 851 output +-- !query output true --- !query 852 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as float) FROM t --- !query 852 schema +-- !query schema struct<(CAST(CAST(1 AS 
DECIMAL(20,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 852 output +-- !query output true --- !query 853 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as double) FROM t --- !query 853 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 853 output +-- !query output true --- !query 854 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as double) FROM t --- !query 854 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 854 output +-- !query output true --- !query 855 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as double) FROM t --- !query 855 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 855 output +-- !query output true --- !query 856 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as double) FROM t --- !query 856 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 856 output +-- !query output true --- !query 857 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as decimal(10, 0)) FROM t --- !query 857 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 857 output +-- !query output true --- !query 858 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as decimal(10, 0)) FROM t --- !query 858 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 858 output +-- !query output true --- !query 859 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t --- !query 859 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 859 output +-- !query output true --- !query 860 +-- !query SELECT 
cast(1 as decimal(20, 0)) <= cast(1 as decimal(10, 0)) FROM t --- !query 860 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 860 output +-- !query output true --- !query 861 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as string) FROM t --- !query 861 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 861 output +-- !query output true --- !query 862 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as string) FROM t --- !query 862 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 862 output +-- !query output true --- !query 863 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as string) FROM t --- !query 863 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 863 output +-- !query output true --- !query 864 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as string) FROM t --- !query 864 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 864 output +-- !query output true --- !query 865 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast('1' as binary) FROM t --- !query 865 schema +-- !query schema struct<> --- !query 865 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 866 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast('1' as binary) FROM t --- !query 866 schema +-- !query schema struct<> --- !query 866 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS 
DECIMAL(5,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 867 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast('1' as binary) FROM t --- !query 867 schema +-- !query schema struct<> --- !query 867 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 868 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast('1' as binary) FROM t --- !query 868 schema +-- !query schema struct<> --- !query 868 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 869 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast(1 as boolean) FROM t --- !query 869 schema +-- !query schema struct<> --- !query 869 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 870 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast(1 as boolean) FROM t --- !query 870 schema +-- !query schema struct<> --- !query 870 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 871 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast(1 as boolean) FROM t --- !query 871 schema +-- !query schema struct<> 
--- !query 871 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 872 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast(1 as boolean) FROM t --- !query 872 schema +-- !query schema struct<> --- !query 872 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 873 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 873 schema +-- !query schema struct<> --- !query 873 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 874 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 874 schema +-- !query schema struct<> --- !query 874 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 875 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 875 schema +-- !query schema struct<> --- !query 875 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 
09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 876 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 876 schema +-- !query schema struct<> --- !query 876 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 877 +-- !query SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t --- !query 877 schema +-- !query schema struct<> --- !query 877 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 878 +-- !query SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t --- !query 878 schema +-- !query schema struct<> --- !query 878 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 879 +-- !query SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t --- !query 879 schema +-- !query schema struct<> --- !query 879 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS 
DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 880 +-- !query SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t --- !query 880 schema +-- !query schema struct<> --- !query 880 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 881 +-- !query SELECT cast(1 as tinyint) > cast(1 as decimal(3, 0)) FROM t --- !query 881 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) > CAST(1 AS DECIMAL(3,0))):boolean> --- !query 881 output +-- !query output false --- !query 882 +-- !query SELECT cast(1 as tinyint) > cast(1 as decimal(5, 0)) FROM t --- !query 882 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 882 output +-- !query output false --- !query 883 +-- !query SELECT cast(1 as tinyint) > cast(1 as decimal(10, 0)) FROM t --- !query 883 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 883 output +-- !query output false --- !query 884 +-- !query SELECT cast(1 as tinyint) > cast(1 as decimal(20, 0)) FROM t --- !query 884 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 884 output +-- !query output false --- !query 885 +-- !query SELECT cast(1 as smallint) > cast(1 as decimal(3, 0)) FROM t --- !query 885 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- 
!query 885 output +-- !query output false --- !query 886 +-- !query SELECT cast(1 as smallint) > cast(1 as decimal(5, 0)) FROM t --- !query 886 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) > CAST(1 AS DECIMAL(5,0))):boolean> --- !query 886 output +-- !query output false --- !query 887 +-- !query SELECT cast(1 as smallint) > cast(1 as decimal(10, 0)) FROM t --- !query 887 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 887 output +-- !query output false --- !query 888 +-- !query SELECT cast(1 as smallint) > cast(1 as decimal(20, 0)) FROM t --- !query 888 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 888 output +-- !query output false --- !query 889 +-- !query SELECT cast(1 as int) > cast(1 as decimal(3, 0)) FROM t --- !query 889 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 889 output +-- !query output false --- !query 890 +-- !query SELECT cast(1 as int) > cast(1 as decimal(5, 0)) FROM t --- !query 890 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 890 output +-- !query output false --- !query 891 +-- !query SELECT cast(1 as int) > cast(1 as decimal(10, 0)) FROM t --- !query 891 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> --- !query 891 output +-- !query output false --- !query 892 +-- !query SELECT cast(1 as int) > cast(1 as decimal(20, 0)) FROM t --- !query 892 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS 
DECIMAL(20,0))):boolean> --- !query 892 output +-- !query output false --- !query 893 +-- !query SELECT cast(1 as bigint) > cast(1 as decimal(3, 0)) FROM t --- !query 893 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 893 output +-- !query output false --- !query 894 +-- !query SELECT cast(1 as bigint) > cast(1 as decimal(5, 0)) FROM t --- !query 894 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 894 output +-- !query output false --- !query 895 +-- !query SELECT cast(1 as bigint) > cast(1 as decimal(10, 0)) FROM t --- !query 895 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 895 output +-- !query output false --- !query 896 +-- !query SELECT cast(1 as bigint) > cast(1 as decimal(20, 0)) FROM t --- !query 896 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) > CAST(1 AS DECIMAL(20,0))):boolean> --- !query 896 output +-- !query output false --- !query 897 +-- !query SELECT cast(1 as float) > cast(1 as decimal(3, 0)) FROM t --- !query 897 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 897 output +-- !query output false --- !query 898 +-- !query SELECT cast(1 as float) > cast(1 as decimal(5, 0)) FROM t --- !query 898 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 898 output +-- !query output false --- !query 899 +-- !query SELECT cast(1 as float) > cast(1 as decimal(10, 0)) FROM t --- !query 899 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 899 
output +-- !query output false --- !query 900 +-- !query SELECT cast(1 as float) > cast(1 as decimal(20, 0)) FROM t --- !query 900 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 900 output +-- !query output false --- !query 901 +-- !query SELECT cast(1 as double) > cast(1 as decimal(3, 0)) FROM t --- !query 901 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 901 output +-- !query output false --- !query 902 +-- !query SELECT cast(1 as double) > cast(1 as decimal(5, 0)) FROM t --- !query 902 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 902 output +-- !query output false --- !query 903 +-- !query SELECT cast(1 as double) > cast(1 as decimal(10, 0)) FROM t --- !query 903 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 903 output +-- !query output false --- !query 904 +-- !query SELECT cast(1 as double) > cast(1 as decimal(20, 0)) FROM t --- !query 904 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 904 output +-- !query output false --- !query 905 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(3, 0)) FROM t --- !query 905 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 905 output +-- !query output false --- !query 906 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(5, 0)) FROM t --- !query 906 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 906 output +-- !query output false --- !query 907 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t --- !query 907 schema 
+-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> --- !query 907 output +-- !query output false --- !query 908 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(20, 0)) FROM t --- !query 908 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 908 output +-- !query output false --- !query 909 +-- !query SELECT cast('1' as binary) > cast(1 as decimal(3, 0)) FROM t --- !query 909 schema +-- !query schema struct<> --- !query 909 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 910 +-- !query SELECT cast('1' as binary) > cast(1 as decimal(5, 0)) FROM t --- !query 910 schema +-- !query schema struct<> --- !query 910 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 911 +-- !query SELECT cast('1' as binary) > cast(1 as decimal(10, 0)) FROM t --- !query 911 schema +-- !query schema struct<> --- !query 911 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 912 +-- !query SELECT cast('1' as binary) > cast(1 as decimal(20, 0)) FROM t --- !query 912 schema +-- !query schema struct<> --- !query 912 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: 
differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 913 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(3, 0)) FROM t --- !query 913 schema +-- !query schema struct<> --- !query 913 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 914 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(5, 0)) FROM t --- !query 914 schema +-- !query schema struct<> --- !query 914 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 915 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(10, 0)) FROM t --- !query 915 schema +-- !query schema struct<> --- !query 915 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 916 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(20, 0)) FROM t --- !query 916 schema +-- !query schema struct<> --- !query 916 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS 
TIMESTAMP) > CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 917 +-- !query SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(3, 0)) FROM t --- !query 917 schema +-- !query schema struct<> --- !query 917 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 918 +-- !query SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(5, 0)) FROM t --- !query 918 schema +-- !query schema struct<> --- !query 918 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 919 +-- !query SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(10, 0)) FROM t --- !query 919 schema +-- !query schema struct<> --- !query 919 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 920 +-- !query SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(20, 0)) FROM t --- !query 920 schema +-- !query schema struct<> --- !query 920 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 921 +-- !query SELECT cast(1 as decimal(3, 
0)) > cast(1 as tinyint) FROM t --- !query 921 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) > CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> --- !query 921 output +-- !query output false --- !query 922 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as tinyint) FROM t --- !query 922 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 922 output +-- !query output false --- !query 923 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as tinyint) FROM t --- !query 923 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 923 output +-- !query output false --- !query 924 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as tinyint) FROM t --- !query 924 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 924 output +-- !query output false --- !query 925 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as smallint) FROM t --- !query 925 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 925 output +-- !query output false --- !query 926 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as smallint) FROM t --- !query 926 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) > CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 926 output +-- !query output false --- !query 927 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as smallint) FROM t --- !query 927 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 927 output +-- !query output false --- !query 928 
+-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as smallint) FROM t --- !query 928 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 928 output +-- !query output false --- !query 929 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as int) FROM t --- !query 929 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 929 output +-- !query output false --- !query 930 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as int) FROM t --- !query 930 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 930 output +-- !query output false --- !query 931 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as int) FROM t --- !query 931 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) > CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 931 output +-- !query output false --- !query 932 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as int) FROM t --- !query 932 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 932 output +-- !query output false --- !query 933 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as bigint) FROM t --- !query 933 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 933 output +-- !query output false --- !query 934 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as bigint) FROM t --- !query 934 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- 
!query 934 output +-- !query output false --- !query 935 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as bigint) FROM t --- !query 935 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 935 output +-- !query output false --- !query 936 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as bigint) FROM t --- !query 936 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) > CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 936 output +-- !query output false --- !query 937 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as float) FROM t --- !query 937 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 937 output +-- !query output false --- !query 938 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as float) FROM t --- !query 938 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 938 output +-- !query output false --- !query 939 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as float) FROM t --- !query 939 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 939 output +-- !query output false --- !query 940 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as float) FROM t --- !query 940 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 940 output +-- !query output false --- !query 941 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as double) FROM t --- !query 941 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 941 output +-- !query output false --- !query 942 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as double) FROM t --- 
!query 942 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 942 output +-- !query output false --- !query 943 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as double) FROM t --- !query 943 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 943 output +-- !query output false --- !query 944 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as double) FROM t --- !query 944 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 944 output +-- !query output false --- !query 945 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as decimal(10, 0)) FROM t --- !query 945 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 945 output +-- !query output false --- !query 946 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as decimal(10, 0)) FROM t --- !query 946 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 946 output +-- !query output false --- !query 947 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t --- !query 947 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> --- !query 947 output +-- !query output false --- !query 948 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as decimal(10, 0)) FROM t --- !query 948 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 948 output +-- !query output false --- !query 949 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as string) FROM t --- !query 949 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS 
DOUBLE)):boolean> --- !query 949 output +-- !query output false --- !query 950 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as string) FROM t --- !query 950 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 950 output +-- !query output false --- !query 951 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as string) FROM t --- !query 951 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 951 output +-- !query output false --- !query 952 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as string) FROM t --- !query 952 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 952 output +-- !query output false --- !query 953 +-- !query SELECT cast(1 as decimal(3, 0)) > cast('1' as binary) FROM t --- !query 953 schema +-- !query schema struct<> --- !query 953 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 954 +-- !query SELECT cast(1 as decimal(5, 0)) > cast('1' as binary) FROM t --- !query 954 schema +-- !query schema struct<> --- !query 954 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 955 +-- !query SELECT cast(1 as decimal(10, 0)) > cast('1' as binary) FROM t --- !query 955 schema +-- !query schema struct<> --- !query 955 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('1' AS BINARY))' due to data type 
mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 956 +-- !query SELECT cast(1 as decimal(20, 0)) > cast('1' as binary) FROM t --- !query 956 schema +-- !query schema struct<> --- !query 956 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 957 +-- !query SELECT cast(1 as decimal(3, 0)) > cast(1 as boolean) FROM t --- !query 957 schema +-- !query schema struct<> --- !query 957 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 958 +-- !query SELECT cast(1 as decimal(5, 0)) > cast(1 as boolean) FROM t --- !query 958 schema +-- !query schema struct<> --- !query 958 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 959 +-- !query SELECT cast(1 as decimal(10, 0)) > cast(1 as boolean) FROM t --- !query 959 schema +-- !query schema struct<> --- !query 959 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 960 +-- !query SELECT cast(1 as decimal(20, 0)) > cast(1 as boolean) FROM t --- !query 960 schema +-- !query schema struct<> --- !query 960 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 961 +-- !query SELECT cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 961 schema +-- !query schema struct<> --- !query 961 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 962 +-- !query SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 962 schema +-- !query schema struct<> --- !query 962 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 963 +-- !query SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 963 schema +-- !query schema struct<> --- !query 963 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 964 +-- !query SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 964 schema +-- !query schema struct<> --- !query 964 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) > 
CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 965 +-- !query SELECT cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00' as date) FROM t --- !query 965 schema +-- !query schema struct<> --- !query 965 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 966 +-- !query SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00' as date) FROM t --- !query 966 schema +-- !query schema struct<> --- !query 966 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 967 +-- !query SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00' as date) FROM t --- !query 967 schema +-- !query schema struct<> --- !query 967 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 968 +-- !query SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00' as date) FROM t --- !query 968 schema +-- !query schema struct<> --- !query 968 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > 
CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 969 +-- !query SELECT cast(1 as tinyint) >= cast(1 as decimal(3, 0)) FROM t --- !query 969 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) >= CAST(1 AS DECIMAL(3,0))):boolean> --- !query 969 output +-- !query output true --- !query 970 +-- !query SELECT cast(1 as tinyint) >= cast(1 as decimal(5, 0)) FROM t --- !query 970 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 970 output +-- !query output true --- !query 971 +-- !query SELECT cast(1 as tinyint) >= cast(1 as decimal(10, 0)) FROM t --- !query 971 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 971 output +-- !query output true --- !query 972 +-- !query SELECT cast(1 as tinyint) >= cast(1 as decimal(20, 0)) FROM t --- !query 972 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 972 output +-- !query output true --- !query 973 +-- !query SELECT cast(1 as smallint) >= cast(1 as decimal(3, 0)) FROM t --- !query 973 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 973 output +-- !query output true --- !query 974 +-- !query SELECT cast(1 as smallint) >= cast(1 as decimal(5, 0)) FROM t --- !query 974 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) >= CAST(1 AS DECIMAL(5,0))):boolean> --- !query 974 output +-- !query output true --- !query 975 +-- !query SELECT cast(1 as smallint) >= cast(1 as decimal(10, 0)) FROM t --- !query 975 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) 
AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 975 output +-- !query output true --- !query 976 +-- !query SELECT cast(1 as smallint) >= cast(1 as decimal(20, 0)) FROM t --- !query 976 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 976 output +-- !query output true --- !query 977 +-- !query SELECT cast(1 as int) >= cast(1 as decimal(3, 0)) FROM t --- !query 977 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 977 output +-- !query output true --- !query 978 +-- !query SELECT cast(1 as int) >= cast(1 as decimal(5, 0)) FROM t --- !query 978 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 978 output +-- !query output true --- !query 979 +-- !query SELECT cast(1 as int) >= cast(1 as decimal(10, 0)) FROM t --- !query 979 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 979 output +-- !query output true --- !query 980 +-- !query SELECT cast(1 as int) >= cast(1 as decimal(20, 0)) FROM t --- !query 980 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 980 output +-- !query output true --- !query 981 +-- !query SELECT cast(1 as bigint) >= cast(1 as decimal(3, 0)) FROM t --- !query 981 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 981 output +-- !query output true --- !query 982 +-- !query SELECT cast(1 as bigint) >= cast(1 as decimal(5, 0)) FROM t --- !query 982 schema +-- !query 
schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 982 output +-- !query output true --- !query 983 +-- !query SELECT cast(1 as bigint) >= cast(1 as decimal(10, 0)) FROM t --- !query 983 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 983 output +-- !query output true --- !query 984 +-- !query SELECT cast(1 as bigint) >= cast(1 as decimal(20, 0)) FROM t --- !query 984 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) >= CAST(1 AS DECIMAL(20,0))):boolean> --- !query 984 output +-- !query output true --- !query 985 +-- !query SELECT cast(1 as float) >= cast(1 as decimal(3, 0)) FROM t --- !query 985 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> --- !query 985 output +-- !query output true --- !query 986 +-- !query SELECT cast(1 as float) >= cast(1 as decimal(5, 0)) FROM t --- !query 986 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 986 output +-- !query output true --- !query 987 +-- !query SELECT cast(1 as float) >= cast(1 as decimal(10, 0)) FROM t --- !query 987 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 987 output +-- !query output true --- !query 988 +-- !query SELECT cast(1 as float) >= cast(1 as decimal(20, 0)) FROM t --- !query 988 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 988 output +-- !query output true --- !query 989 +-- !query SELECT cast(1 as double) >= cast(1 as decimal(3, 0)) FROM t --- !query 989 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(3,0)) AS 
DOUBLE)):boolean> --- !query 989 output +-- !query output true --- !query 990 +-- !query SELECT cast(1 as double) >= cast(1 as decimal(5, 0)) FROM t --- !query 990 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> --- !query 990 output +-- !query output true --- !query 991 +-- !query SELECT cast(1 as double) >= cast(1 as decimal(10, 0)) FROM t --- !query 991 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 991 output +-- !query output true --- !query 992 +-- !query SELECT cast(1 as double) >= cast(1 as decimal(20, 0)) FROM t --- !query 992 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> --- !query 992 output +-- !query output true --- !query 993 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(3, 0)) FROM t --- !query 993 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 993 output +-- !query output true --- !query 994 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(5, 0)) FROM t --- !query 994 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 994 output +-- !query output true --- !query 995 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t --- !query 995 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 995 output +-- !query output true --- !query 996 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(20, 0)) FROM t --- !query 996 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 996 output +-- !query output true --- !query 997 +-- !query SELECT cast('1' as 
binary) >= cast(1 as decimal(3, 0)) FROM t --- !query 997 schema +-- !query schema struct<> --- !query 997 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 998 +-- !query SELECT cast('1' as binary) >= cast(1 as decimal(5, 0)) FROM t --- !query 998 schema +-- !query schema struct<> --- !query 998 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 999 +-- !query SELECT cast('1' as binary) >= cast(1 as decimal(10, 0)) FROM t --- !query 999 schema +-- !query schema struct<> --- !query 999 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 1000 +-- !query SELECT cast('1' as binary) >= cast(1 as decimal(20, 0)) FROM t --- !query 1000 schema +-- !query schema struct<> --- !query 1000 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 1001 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(3, 0)) FROM t --- !query 1001 schema +-- !query schema struct<> --- !query 1001 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(3,0)))' due to data type 
mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 1002 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(5, 0)) FROM t --- !query 1002 schema +-- !query schema struct<> --- !query 1002 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 1003 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(10, 0)) FROM t --- !query 1003 schema +-- !query schema struct<> --- !query 1003 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 1004 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(20, 0)) FROM t --- !query 1004 schema +-- !query schema struct<> --- !query 1004 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 1005 +-- !query SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(3, 0)) FROM t --- !query 1005 schema +-- !query schema struct<> --- !query 1005 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types 
in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 1006 +-- !query SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(5, 0)) FROM t --- !query 1006 schema +-- !query schema struct<> --- !query 1006 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 1007 +-- !query SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(10, 0)) FROM t --- !query 1007 schema +-- !query schema struct<> --- !query 1007 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 1008 +-- !query SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(20, 0)) FROM t --- !query 1008 schema +-- !query schema struct<> --- !query 1008 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 1009 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as tinyint) FROM t --- !query 1009 schema +-- !query schema struct<(CAST(1 AS DECIMAL(3,0)) >= CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> --- !query 1009 output +-- !query output true --- !query 1010 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as tinyint) FROM t --- !query 1010 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) >= CAST(CAST(CAST(1 AS TINYINT) 
AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> --- !query 1010 output +-- !query output true --- !query 1011 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as tinyint) FROM t --- !query 1011 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> --- !query 1011 output +-- !query output true --- !query 1012 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as tinyint) FROM t --- !query 1012 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> --- !query 1012 output +-- !query output true --- !query 1013 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as smallint) FROM t --- !query 1013 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> --- !query 1013 output +-- !query output true --- !query 1014 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as smallint) FROM t --- !query 1014 schema +-- !query schema struct<(CAST(1 AS DECIMAL(5,0)) >= CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> --- !query 1014 output +-- !query output true --- !query 1015 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as smallint) FROM t --- !query 1015 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> --- !query 1015 output +-- !query output true --- !query 1016 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as smallint) FROM t --- !query 1016 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> --- !query 1016 output +-- !query output true --- !query 1017 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as int) FROM t --- !query 1017 
schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 1017 output +-- !query output true --- !query 1018 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as int) FROM t --- !query 1018 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 1018 output +-- !query output true --- !query 1019 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as int) FROM t --- !query 1019 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> --- !query 1019 output +-- !query output true --- !query 1020 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as int) FROM t --- !query 1020 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 1020 output +-- !query output true --- !query 1021 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as bigint) FROM t --- !query 1021 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 1021 output +-- !query output true --- !query 1022 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as bigint) FROM t --- !query 1022 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 1022 output +-- !query output true --- !query 1023 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as bigint) FROM t --- !query 1023 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> --- !query 1023 output +-- !query output true --- !query 1024 +-- 
!query SELECT cast(1 as decimal(20, 0)) >= cast(1 as bigint) FROM t --- !query 1024 schema +-- !query schema struct<(CAST(1 AS DECIMAL(20,0)) >= CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> --- !query 1024 output +-- !query output true --- !query 1025 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as float) FROM t --- !query 1025 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 1025 output +-- !query output true --- !query 1026 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as float) FROM t --- !query 1026 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 1026 output +-- !query output true --- !query 1027 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as float) FROM t --- !query 1027 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 1027 output +-- !query output true --- !query 1028 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as float) FROM t --- !query 1028 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> --- !query 1028 output +-- !query output true --- !query 1029 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as double) FROM t --- !query 1029 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 1029 output +-- !query output true --- !query 1030 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as double) FROM t --- !query 1030 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 1030 output +-- !query output true --- !query 1031 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as double) FROM t --- !query 1031 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) 
>= CAST(1 AS DOUBLE)):boolean> --- !query 1031 output +-- !query output true --- !query 1032 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as double) FROM t --- !query 1032 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 1032 output +-- !query output true --- !query 1033 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as decimal(10, 0)) FROM t --- !query 1033 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 1033 output +-- !query output true --- !query 1034 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as decimal(10, 0)) FROM t --- !query 1034 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> --- !query 1034 output +-- !query output true --- !query 1035 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t --- !query 1035 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> --- !query 1035 output +-- !query output true --- !query 1036 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as decimal(10, 0)) FROM t --- !query 1036 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> --- !query 1036 output +-- !query output true --- !query 1037 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as string) FROM t --- !query 1037 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 1037 output +-- !query output true --- !query 1038 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as string) FROM t --- !query 1038 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 1038 output 
+-- !query output true --- !query 1039 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as string) FROM t --- !query 1039 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 1039 output +-- !query output true --- !query 1040 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as string) FROM t --- !query 1040 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> --- !query 1040 output +-- !query output true --- !query 1041 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast('1' as binary) FROM t --- !query 1041 schema +-- !query schema struct<> --- !query 1041 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 1042 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast('1' as binary) FROM t --- !query 1042 schema +-- !query schema struct<> --- !query 1042 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 1043 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast('1' as binary) FROM t --- !query 1043 schema +-- !query schema struct<> --- !query 1043 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 1044 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast('1' as binary) FROM t --- !query 1044 schema +-- !query schema struct<> --- !query 1044 
output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 1045 +-- !query SELECT cast(1 as decimal(3, 0)) >= cast(1 as boolean) FROM t --- !query 1045 schema +-- !query schema struct<> --- !query 1045 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 --- !query 1046 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast(1 as boolean) FROM t --- !query 1046 schema +-- !query schema struct<> --- !query 1046 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 --- !query 1047 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast(1 as boolean) FROM t --- !query 1047 schema +-- !query schema struct<> --- !query 1047 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 1048 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast(1 as boolean) FROM t --- !query 1048 schema +-- !query schema struct<> --- !query 1048 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 --- !query 1049 +-- !query SELECT 
cast(1 as decimal(3, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1049 schema +-- !query schema struct<> --- !query 1049 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 --- !query 1050 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1050 schema +-- !query schema struct<> --- !query 1050 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 1051 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1051 schema +-- !query schema struct<> --- !query 1051 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 1052 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1052 schema +-- !query schema struct<> --- !query 1052 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 1053 +-- !query SELECT cast(1 as 
decimal(3, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t --- !query 1053 schema +-- !query schema struct<> --- !query 1053 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 --- !query 1054 +-- !query SELECT cast(1 as decimal(5, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t --- !query 1054 schema +-- !query schema struct<> --- !query 1054 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 1055 +-- !query SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t --- !query 1055 schema +-- !query schema struct<> --- !query 1055 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 1056 +-- !query SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t --- !query 1056 schema +-- !query schema struct<> --- !query 1056 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 --- !query 1057 +-- !query SELECT cast(1 as tinyint) <> cast(1 as decimal(3, 0)) FROM t --- !query 1057 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS 
TINYINT) AS DECIMAL(3,0)) = CAST(1 AS DECIMAL(3,0)))):boolean> --- !query 1057 output +-- !query output false --- !query 1058 +-- !query SELECT cast(1 as tinyint) <> cast(1 as decimal(5, 0)) FROM t --- !query 1058 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)))):boolean> --- !query 1058 output +-- !query output false --- !query 1059 +-- !query SELECT cast(1 as tinyint) <> cast(1 as decimal(10, 0)) FROM t --- !query 1059 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1059 output +-- !query output false --- !query 1060 +-- !query SELECT cast(1 as tinyint) <> cast(1 as decimal(20, 0)) FROM t --- !query 1060 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1060 output +-- !query output false --- !query 1061 +-- !query SELECT cast(1 as smallint) <> cast(1 as decimal(3, 0)) FROM t --- !query 1061 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)))):boolean> --- !query 1061 output +-- !query output false --- !query 1062 +-- !query SELECT cast(1 as smallint) <> cast(1 as decimal(5, 0)) FROM t --- !query 1062 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) = CAST(1 AS DECIMAL(5,0)))):boolean> --- !query 1062 output +-- !query output false --- !query 1063 +-- !query SELECT cast(1 as smallint) <> cast(1 as decimal(10, 0)) FROM t --- !query 1063 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1063 output +-- !query output false --- !query 1064 +-- !query SELECT cast(1 as 
smallint) <> cast(1 as decimal(20, 0)) FROM t --- !query 1064 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1064 output +-- !query output false --- !query 1065 +-- !query SELECT cast(1 as int) <> cast(1 as decimal(3, 0)) FROM t --- !query 1065 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> --- !query 1065 output +-- !query output false --- !query 1066 +-- !query SELECT cast(1 as int) <> cast(1 as decimal(5, 0)) FROM t --- !query 1066 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> --- !query 1066 output +-- !query output false --- !query 1067 +-- !query SELECT cast(1 as int) <> cast(1 as decimal(10, 0)) FROM t --- !query 1067 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS INT) AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> --- !query 1067 output +-- !query output false --- !query 1068 +-- !query SELECT cast(1 as int) <> cast(1 as decimal(20, 0)) FROM t --- !query 1068 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1068 output +-- !query output false --- !query 1069 +-- !query SELECT cast(1 as bigint) <> cast(1 as decimal(3, 0)) FROM t --- !query 1069 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)))):boolean> --- !query 1069 output +-- !query output false --- !query 1070 +-- !query SELECT cast(1 as bigint) <> cast(1 as decimal(5, 0)) FROM t --- !query 1070 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS 
DECIMAL(5,0)) AS DECIMAL(20,0)))):boolean> --- !query 1070 output +-- !query output false --- !query 1071 +-- !query SELECT cast(1 as bigint) <> cast(1 as decimal(10, 0)) FROM t --- !query 1071 schema +-- !query schema struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> --- !query 1071 output +-- !query output false --- !query 1072 +-- !query SELECT cast(1 as bigint) <> cast(1 as decimal(20, 0)) FROM t --- !query 1072 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) = CAST(1 AS DECIMAL(20,0)))):boolean> --- !query 1072 output +-- !query output false --- !query 1073 +-- !query SELECT cast(1 as float) <> cast(1 as decimal(3, 0)) FROM t --- !query 1073 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE))):boolean> --- !query 1073 output +-- !query output false --- !query 1074 +-- !query SELECT cast(1 as float) <> cast(1 as decimal(5, 0)) FROM t --- !query 1074 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE))):boolean> --- !query 1074 output +-- !query output false --- !query 1075 +-- !query SELECT cast(1 as float) <> cast(1 as decimal(10, 0)) FROM t --- !query 1075 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 1075 output +-- !query output false --- !query 1076 +-- !query SELECT cast(1 as float) <> cast(1 as decimal(20, 0)) FROM t --- !query 1076 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE))):boolean> --- !query 1076 output +-- !query output false --- !query 1077 +-- !query SELECT cast(1 as double) <> cast(1 as decimal(3, 0)) FROM t --- !query 1077 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE))):boolean> --- !query 1077 
output +-- !query output false --- !query 1078 +-- !query SELECT cast(1 as double) <> cast(1 as decimal(5, 0)) FROM t --- !query 1078 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE))):boolean> --- !query 1078 output +-- !query output false --- !query 1079 +-- !query SELECT cast(1 as double) <> cast(1 as decimal(10, 0)) FROM t --- !query 1079 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 1079 output +-- !query output false --- !query 1080 +-- !query SELECT cast(1 as double) <> cast(1 as decimal(20, 0)) FROM t --- !query 1080 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE))):boolean> --- !query 1080 output +-- !query output false --- !query 1081 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(3, 0)) FROM t --- !query 1081 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> --- !query 1081 output +-- !query output false --- !query 1082 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(5, 0)) FROM t --- !query 1082 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> --- !query 1082 output +-- !query output false --- !query 1083 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t --- !query 1083 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> --- !query 1083 output +-- !query output false --- !query 1084 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(20, 0)) FROM t --- !query 1084 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1084 output +-- !query output false --- !query 
1085 +-- !query SELECT cast('1' as binary) <> cast(1 as decimal(3, 0)) FROM t --- !query 1085 schema +-- !query schema struct<> --- !query 1085 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 --- !query 1086 +-- !query SELECT cast('1' as binary) <> cast(1 as decimal(5, 0)) FROM t --- !query 1086 schema +-- !query schema struct<> --- !query 1086 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 --- !query 1087 +-- !query SELECT cast('1' as binary) <> cast(1 as decimal(10, 0)) FROM t --- !query 1087 schema +-- !query schema struct<> --- !query 1087 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 1088 +-- !query SELECT cast('1' as binary) <> cast(1 as decimal(20, 0)) FROM t --- !query 1088 schema +-- !query schema struct<> --- !query 1088 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 --- !query 1089 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(3, 0)) FROM t --- !query 1089 schema +-- !query schema struct<> --- !query 1089 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS 
DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 --- !query 1090 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(5, 0)) FROM t --- !query 1090 schema +-- !query schema struct<> --- !query 1090 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 --- !query 1091 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(10, 0)) FROM t --- !query 1091 schema +-- !query schema struct<> --- !query 1091 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 1092 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(20, 0)) FROM t --- !query 1092 schema +-- !query schema struct<> --- !query 1092 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 --- !query 1093 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(3, 0)) FROM t --- !query 1093 schema +-- !query schema struct<> --- !query 1093 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' due to data type 
mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 --- !query 1094 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(5, 0)) FROM t --- !query 1094 schema +-- !query schema struct<> --- !query 1094 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 --- !query 1095 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(10, 0)) FROM t --- !query 1095 schema +-- !query schema struct<> --- !query 1095 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 1096 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(20, 0)) FROM t --- !query 1096 schema +-- !query schema struct<> --- !query 1096 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 --- !query 1097 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as tinyint) FROM t --- !query 1097 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)))):boolean> --- !query 1097 output +-- !query output false --- !query 1098 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as tinyint) FROM t --- !query 1098 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) = 
CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)))):boolean> --- !query 1098 output +-- !query output false --- !query 1099 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as tinyint) FROM t --- !query 1099 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> --- !query 1099 output +-- !query output false --- !query 1100 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as tinyint) FROM t --- !query 1100 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)))):boolean> --- !query 1100 output +-- !query output false --- !query 1101 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as smallint) FROM t --- !query 1101 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)))):boolean> --- !query 1101 output +-- !query output false --- !query 1102 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as smallint) FROM t --- !query 1102 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)))):boolean> --- !query 1102 output +-- !query output false --- !query 1103 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as smallint) FROM t --- !query 1103 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> --- !query 1103 output +-- !query output false --- !query 1104 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as smallint) FROM t --- !query 1104 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)))):boolean> --- !query 1104 output +-- !query output false --- !query 1105 +-- !query 
SELECT cast(1 as decimal(3, 0)) <> cast(1 as int) FROM t --- !query 1105 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1105 output +-- !query output false --- !query 1106 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as int) FROM t --- !query 1106 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1106 output +-- !query output false --- !query 1107 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as int) FROM t --- !query 1107 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> --- !query 1107 output +-- !query output false --- !query 1108 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as int) FROM t --- !query 1108 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> --- !query 1108 output +-- !query output false --- !query 1109 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as bigint) FROM t --- !query 1109 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1109 output +-- !query output false --- !query 1110 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as bigint) FROM t --- !query 1110 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1110 output +-- !query output false --- !query 1111 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as bigint) FROM t --- !query 1111 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) 
AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> --- !query 1111 output +-- !query output false --- !query 1112 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as bigint) FROM t --- !query 1112 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> --- !query 1112 output +-- !query output false --- !query 1113 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as float) FROM t --- !query 1113 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 1113 output +-- !query output false --- !query 1114 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as float) FROM t --- !query 1114 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 1114 output +-- !query output false --- !query 1115 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as float) FROM t --- !query 1115 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 1115 output +-- !query output false --- !query 1116 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as float) FROM t --- !query 1116 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 1116 output +-- !query output false --- !query 1117 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as double) FROM t --- !query 1117 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 1117 output +-- !query output false --- !query 1118 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as double) FROM t --- !query 1118 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 1118 output +-- !query output false --- !query 1119 +-- 
!query SELECT cast(1 as decimal(10, 0)) <> cast(1 as double) FROM t --- !query 1119 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 1119 output +-- !query output false --- !query 1120 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as double) FROM t --- !query 1120 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 1120 output +-- !query output false --- !query 1121 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as decimal(10, 0)) FROM t --- !query 1121 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1121 output +-- !query output false --- !query 1122 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as decimal(10, 0)) FROM t --- !query 1122 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 1122 output +-- !query output false --- !query 1123 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t --- !query 1123 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> --- !query 1123 output +-- !query output false --- !query 1124 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as decimal(10, 0)) FROM t --- !query 1124 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> --- !query 1124 output +-- !query output false --- !query 1125 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as string) FROM t --- !query 1125 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> --- !query 1125 output +-- !query output false --- !query 1126 +-- !query SELECT cast(1 as 
decimal(5, 0)) <> cast(1 as string) FROM t --- !query 1126 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> --- !query 1126 output +-- !query output false --- !query 1127 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as string) FROM t --- !query 1127 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> --- !query 1127 output +-- !query output false --- !query 1128 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as string) FROM t --- !query 1128 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> --- !query 1128 output +-- !query output false --- !query 1129 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast('1' as binary) FROM t --- !query 1129 schema +-- !query schema struct<> --- !query 1129 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 --- !query 1130 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast('1' as binary) FROM t --- !query 1130 schema +-- !query schema struct<> --- !query 1130 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 --- !query 1131 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast('1' as binary) FROM t --- !query 1131 schema +-- !query schema struct<> --- !query 1131 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS 
BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 1132 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast('1' as binary) FROM t --- !query 1132 schema +-- !query schema struct<> --- !query 1132 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 --- !query 1133 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast(1 as boolean) FROM t --- !query 1133 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0)))):boolean> --- !query 1133 output +-- !query output false --- !query 1134 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast(1 as boolean) FROM t --- !query 1134 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0)))):boolean> --- !query 1134 output +-- !query output false --- !query 1135 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast(1 as boolean) FROM t --- !query 1135 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0)))):boolean> --- !query 1135 output +-- !query output false --- !query 1136 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast(1 as boolean) FROM t --- !query 1136 schema +-- !query schema struct<(NOT (CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0)))):boolean> --- !query 1136 output +-- !query output false --- !query 1137 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1137 schema +-- !query schema struct<> --- !query 1137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' 
(decimal(3,0) and timestamp).; line 1 pos 7 --- !query 1138 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1138 schema +-- !query schema struct<> --- !query 1138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 --- !query 1139 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1139 schema +-- !query schema struct<> --- !query 1139 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 1140 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 1140 schema +-- !query schema struct<> --- !query 1140 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 --- !query 1141 +-- !query SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t --- !query 1141 schema +-- !query schema struct<> --- !query 1141 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 
--- !query 1142 +-- !query SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t --- !query 1142 schema +-- !query schema struct<> --- !query 1142 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 --- !query 1143 +-- !query SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t --- !query 1143 schema +-- !query schema struct<> --- !query 1143 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 1144 +-- !query SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t --- !query 1144 schema +-- !query schema struct<> --- !query 1144 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out index 017e0fea30e90..ae933da59f63f 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out @@ -2,1241 +2,1241 @@ -- Number of queries: 145 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query 
output --- !query 1 +-- !query SELECT cast(1 as tinyint) / cast(1 as tinyint) FROM t --- !query 1 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 1 output +-- !query output 1.0 --- !query 2 +-- !query SELECT cast(1 as tinyint) / cast(1 as smallint) FROM t --- !query 2 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 2 output +-- !query output 1.0 --- !query 3 +-- !query SELECT cast(1 as tinyint) / cast(1 as int) FROM t --- !query 3 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 3 output +-- !query output 1.0 --- !query 4 +-- !query SELECT cast(1 as tinyint) / cast(1 as bigint) FROM t --- !query 4 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 4 output +-- !query output 1.0 --- !query 5 +-- !query SELECT cast(1 as tinyint) / cast(1 as float) FROM t --- !query 5 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 5 output +-- !query output 1.0 --- !query 6 +-- !query SELECT cast(1 as tinyint) / cast(1 as double) FROM t --- !query 6 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 6 output +-- !query output 1.0 --- !query 7 +-- !query SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t --- !query 7 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> --- !query 7 output -1 +-- !query output +1.00000000000 --- !query 8 +-- !query SELECT cast(1 as tinyint) / cast(1 as string) FROM t --- !query 8 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS 
DOUBLE)):double> --- !query 8 output +-- !query output 1.0 --- !query 9 +-- !query SELECT cast(1 as tinyint) / cast('1' as binary) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('1' AS BINARY))' (tinyint and binary).; line 1 pos 7 --- !query 10 +-- !query SELECT cast(1 as tinyint) / cast(1 as boolean) FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST(1 AS BOOLEAN))' (tinyint and boolean).; line 1 pos 7 --- !query 11 +-- !query SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (tinyint and timestamp).; line 1 pos 7 --- !query 12 +-- !query SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00' AS DATE))' (tinyint and date).; line 1 pos 7 --- !query 13 +-- !query SELECT cast(1 as smallint) / cast(1 as tinyint) FROM t --- !query 13 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 13 output +-- !query output 1.0 --- 
!query 14 +-- !query SELECT cast(1 as smallint) / cast(1 as smallint) FROM t --- !query 14 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 14 output +-- !query output 1.0 --- !query 15 +-- !query SELECT cast(1 as smallint) / cast(1 as int) FROM t --- !query 15 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 15 output +-- !query output 1.0 --- !query 16 +-- !query SELECT cast(1 as smallint) / cast(1 as bigint) FROM t --- !query 16 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 16 output +-- !query output 1.0 --- !query 17 +-- !query SELECT cast(1 as smallint) / cast(1 as float) FROM t --- !query 17 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 17 output +-- !query output 1.0 --- !query 18 +-- !query SELECT cast(1 as smallint) / cast(1 as double) FROM t --- !query 18 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 18 output +-- !query output 1.0 --- !query 19 +-- !query SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t --- !query 19 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> --- !query 19 output -1 +-- !query output +1.00000000000 --- !query 20 +-- !query SELECT cast(1 as smallint) / cast(1 as string) FROM t --- !query 20 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> --- !query 20 output +-- !query output 1.0 --- !query 21 +-- !query SELECT cast(1 as smallint) / cast('1' as binary) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('1' AS BINARY))' (smallint and binary).; line 1 pos 7 --- !query 22 +-- !query SELECT cast(1 as smallint) / cast(1 as boolean) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST(1 AS BOOLEAN))' (smallint and boolean).; line 1 pos 7 --- !query 23 +-- !query SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (smallint and timestamp).; line 1 pos 7 --- !query 24 +-- !query SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00' AS DATE))' (smallint and date).; line 1 pos 7 --- !query 25 +-- !query SELECT cast(1 as int) / cast(1 as tinyint) FROM t --- !query 25 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 25 output +-- !query output 1.0 --- !query 26 +-- !query SELECT cast(1 as int) / cast(1 as smallint) FROM t --- !query 26 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 26 output 
+-- !query output 1.0 --- !query 27 +-- !query SELECT cast(1 as int) / cast(1 as int) FROM t --- !query 27 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 27 output +-- !query output 1.0 --- !query 28 +-- !query SELECT cast(1 as int) / cast(1 as bigint) FROM t --- !query 28 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 28 output +-- !query output 1.0 --- !query 29 +-- !query SELECT cast(1 as int) / cast(1 as float) FROM t --- !query 29 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 29 output +-- !query output 1.0 --- !query 30 +-- !query SELECT cast(1 as int) / cast(1 as double) FROM t --- !query 30 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 30 output +-- !query output 1.0 --- !query 31 +-- !query SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t --- !query 31 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> --- !query 31 output -1 +-- !query output +1.00000000000 --- !query 32 +-- !query SELECT cast(1 as int) / cast(1 as string) FROM t --- !query 32 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> --- !query 32 output +-- !query output 1.0 --- !query 33 +-- !query SELECT cast(1 as int) / cast('1' as binary) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('1' AS BINARY))' (int and binary).; line 1 pos 7 --- !query 34 +-- !query SELECT cast(1 as int) / cast(1 as boolean) FROM t --- !query 34 schema +-- !query schema struct<> 
--- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST(1 AS BOOLEAN))' (int and boolean).; line 1 pos 7 --- !query 35 +-- !query SELECT cast(1 as int) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (int and timestamp).; line 1 pos 7 --- !query 36 +-- !query SELECT cast(1 as int) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00' AS DATE))' (int and date).; line 1 pos 7 --- !query 37 +-- !query SELECT cast(1 as bigint) / cast(1 as tinyint) FROM t --- !query 37 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 37 output +-- !query output 1.0 --- !query 38 +-- !query SELECT cast(1 as bigint) / cast(1 as smallint) FROM t --- !query 38 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 38 output +-- !query output 1.0 --- !query 39 +-- !query SELECT cast(1 as bigint) / cast(1 as int) FROM t --- !query 39 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 39 output +-- !query output 1.0 --- !query 40 +-- !query SELECT cast(1 as bigint) / cast(1 as bigint) FROM t --- !query 40 schema +-- !query schema struct<(CAST(CAST(1 
AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 40 output +-- !query output 1.0 --- !query 41 +-- !query SELECT cast(1 as bigint) / cast(1 as float) FROM t --- !query 41 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 41 output +-- !query output 1.0 --- !query 42 +-- !query SELECT cast(1 as bigint) / cast(1 as double) FROM t --- !query 42 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 42 output +-- !query output 1.0 --- !query 43 +-- !query SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t --- !query 43 schema +-- !query schema struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> --- !query 43 output -1 +-- !query output +1.00000000000 --- !query 44 +-- !query SELECT cast(1 as bigint) / cast(1 as string) FROM t --- !query 44 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> --- !query 44 output +-- !query output 1.0 --- !query 45 +-- !query SELECT cast(1 as bigint) / cast('1' as binary) FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('1' AS BINARY))' (bigint and binary).; line 1 pos 7 --- !query 46 +-- !query SELECT cast(1 as bigint) / cast(1 as boolean) FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST(1 AS BOOLEAN))' (bigint and boolean).; line 1 pos 7 --- !query 47 +-- !query SELECT cast(1 as 
bigint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (bigint and timestamp).; line 1 pos 7 --- !query 48 +-- !query SELECT cast(1 as bigint) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00' AS DATE))' (bigint and date).; line 1 pos 7 --- !query 49 +-- !query SELECT cast(1 as float) / cast(1 as tinyint) FROM t --- !query 49 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 49 output +-- !query output 1.0 --- !query 50 +-- !query SELECT cast(1 as float) / cast(1 as smallint) FROM t --- !query 50 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 50 output +-- !query output 1.0 --- !query 51 +-- !query SELECT cast(1 as float) / cast(1 as int) FROM t --- !query 51 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 51 output +-- !query output 1.0 --- !query 52 +-- !query SELECT cast(1 as float) / cast(1 as bigint) FROM t --- !query 52 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 52 output +-- !query output 1.0 --- !query 53 +-- !query SELECT cast(1 as float) / cast(1 as float) FROM t --- !query 53 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 
AS FLOAT) AS DOUBLE)):double> --- !query 53 output +-- !query output 1.0 --- !query 54 +-- !query SELECT cast(1 as float) / cast(1 as double) FROM t --- !query 54 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 54 output +-- !query output 1.0 --- !query 55 +-- !query SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t --- !query 55 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) AS DOUBLE)):double> --- !query 55 output +-- !query output 1.0 --- !query 56 +-- !query SELECT cast(1 as float) / cast(1 as string) FROM t --- !query 56 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> --- !query 56 output +-- !query output 1.0 --- !query 57 +-- !query SELECT cast(1 as float) / cast('1' as binary) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST('1' AS BINARY))' (float and binary).; line 1 pos 7 --- !query 58 +-- !query SELECT cast(1 as float) / cast(1 as boolean) FROM t --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST(1 AS BOOLEAN))' (float and boolean).; line 1 pos 7 --- !query 59 +-- !query SELECT cast(1 as float) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / 
CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (float and timestamp).; line 1 pos 7 --- !query 60 +-- !query SELECT cast(1 as float) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00' AS DATE))' (float and date).; line 1 pos 7 --- !query 61 +-- !query SELECT cast(1 as double) / cast(1 as tinyint) FROM t --- !query 61 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 61 output +-- !query output 1.0 --- !query 62 +-- !query SELECT cast(1 as double) / cast(1 as smallint) FROM t --- !query 62 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 62 output +-- !query output 1.0 --- !query 63 +-- !query SELECT cast(1 as double) / cast(1 as int) FROM t --- !query 63 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 63 output +-- !query output 1.0 --- !query 64 +-- !query SELECT cast(1 as double) / cast(1 as bigint) FROM t --- !query 64 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 64 output +-- !query output 1.0 --- !query 65 +-- !query SELECT cast(1 as double) / cast(1 as float) FROM t --- !query 65 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 65 output +-- !query output 1.0 --- !query 66 +-- !query SELECT cast(1 as double) / cast(1 as double) FROM t --- !query 66 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 66 output +-- !query output 1.0 --- !query 67 +-- !query SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t --- !query 67 schema +-- !query schema 
struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 67 output +-- !query output 1.0 --- !query 68 +-- !query SELECT cast(1 as double) / cast(1 as string) FROM t --- !query 68 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 68 output +-- !query output 1.0 --- !query 69 +-- !query SELECT cast(1 as double) / cast('1' as binary) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 70 +-- !query SELECT cast(1 as double) / cast(1 as boolean) FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 71 +-- !query SELECT cast(1 as double) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 72 +-- !query SELECT cast(1 as double) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' (double 
and date).; line 1 pos 7 --- !query 73 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t --- !query 73 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 73 output -1 +-- !query output +1.000000 --- !query 74 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t --- !query 74 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> --- !query 74 output -1 +-- !query output +1.000000 --- !query 75 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM t --- !query 75 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) / CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,11)> --- !query 75 output -1 +-- !query output +1.00000000000 --- !query 76 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t --- !query 76 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> --- !query 76 output -1 +-- !query output +1.000000000000000000000 --- !query 77 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t --- !query 77 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 77 output +-- !query output 1.0 --- !query 78 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t --- !query 78 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 78 output +-- !query output 1.0 --- !query 79 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t --- !query 79 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> --- !query 79 output -1 +-- 
!query output +1.00000000000 --- !query 80 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t --- !query 80 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> --- !query 80 output +-- !query output 1.0 --- !query 81 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 82 +-- !query SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 83 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 84 +-- !query SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / 
CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 --- !query 85 +-- !query SELECT cast(1 as string) / cast(1 as tinyint) FROM t --- !query 85 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 85 output +-- !query output 1.0 --- !query 86 +-- !query SELECT cast(1 as string) / cast(1 as smallint) FROM t --- !query 86 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 86 output +-- !query output 1.0 --- !query 87 +-- !query SELECT cast(1 as string) / cast(1 as int) FROM t --- !query 87 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 87 output +-- !query output 1.0 --- !query 88 +-- !query SELECT cast(1 as string) / cast(1 as bigint) FROM t --- !query 88 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 88 output +-- !query output 1.0 --- !query 89 +-- !query SELECT cast(1 as string) / cast(1 as float) FROM t --- !query 89 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 89 output +-- !query output 1.0 --- !query 90 +-- !query SELECT cast(1 as string) / cast(1 as double) FROM t --- !query 90 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 90 output +-- !query output 1.0 --- !query 91 +-- !query SELECT cast(1 as string) / cast(1 as decimal(10, 0)) FROM t --- !query 91 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 91 output +-- !query output 1.0 --- !query 92 +-- !query SELECT cast(1 as string) / cast(1 as string) FROM t --- !query 92 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> 
--- !query 92 output +-- !query output 1.0 --- !query 93 +-- !query SELECT cast(1 as string) / cast('1' as binary) FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 94 +-- !query SELECT cast(1 as string) / cast(1 as boolean) FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 95 +-- !query SELECT cast(1 as string) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 95 schema +-- !query schema struct<> --- !query 95 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 96 +-- !query SELECT cast(1 as string) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 96 schema +-- !query schema struct<> --- !query 96 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 --- !query 97 +-- !query SELECT cast('1' as binary) / cast(1 as tinyint) FROM t --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- 
!query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS TINYINT))' (binary and tinyint).; line 1 pos 7 --- !query 98 +-- !query SELECT cast('1' as binary) / cast(1 as smallint) FROM t --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS SMALLINT))' (binary and smallint).; line 1 pos 7 --- !query 99 +-- !query SELECT cast('1' as binary) / cast(1 as int) FROM t --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS INT))' (binary and int).; line 1 pos 7 --- !query 100 +-- !query SELECT cast('1' as binary) / cast(1 as bigint) FROM t --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS BIGINT))' (binary and bigint).; line 1 pos 7 --- !query 101 +-- !query SELECT cast('1' as binary) / cast(1 as float) FROM t --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS FLOAT))' (binary and float).; line 1 pos 7 --- !query 102 +-- !query SELECT cast('1' as binary) / cast(1 as double) FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 103 +-- !query SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 104 +-- !query SELECT cast('1' as binary) / cast(1 as string) FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(CAST(1 AS STRING) AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 105 +-- !query SELECT cast('1' as binary) / cast('1' as binary) FROM t --- !query 105 schema +-- !query schema struct<> --- !query 105 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST('1' AS BINARY))' due to data type mismatch: '(CAST('1' AS BINARY) / CAST('1' AS BINARY))' requires (double or decimal) type, not binary; line 1 pos 7 --- !query 106 +-- !query SELECT cast('1' as binary) / cast(1 as boolean) FROM t --- !query 106 schema +-- !query schema struct<> --- !query 106 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS BOOLEAN))' (binary and boolean).; line 1 pos 7 --- !query 107 +-- !query SELECT cast('1' as binary) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 107 schema +-- 
!query schema struct<> --- !query 107 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (binary and timestamp).; line 1 pos 7 --- !query 108 +-- !query SELECT cast('1' as binary) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00' AS DATE))' (binary and date).; line 1 pos 7 --- !query 109 +-- !query SELECT cast(1 as boolean) / cast(1 as tinyint) FROM t --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS TINYINT))' (boolean and tinyint).; line 1 pos 7 --- !query 110 +-- !query SELECT cast(1 as boolean) / cast(1 as smallint) FROM t --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS SMALLINT))' (boolean and smallint).; line 1 pos 7 --- !query 111 +-- !query SELECT cast(1 as boolean) / cast(1 as int) FROM t --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS INT))' (boolean and int).; line 1 pos 7 --- !query 112 +-- !query SELECT cast(1 as 
boolean) / cast(1 as bigint) FROM t --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS BIGINT))' (boolean and bigint).; line 1 pos 7 --- !query 113 +-- !query SELECT cast(1 as boolean) / cast(1 as float) FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS FLOAT))' (boolean and float).; line 1 pos 7 --- !query 114 +-- !query SELECT cast(1 as boolean) / cast(1 as double) FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 115 +-- !query SELECT cast(1 as boolean) / cast(1 as decimal(10, 0)) FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS DECIMAL(10,0)))' (boolean and decimal(10,0)).; line 1 pos 7 --- !query 116 +-- !query SELECT cast(1 as boolean) / cast(1 as string) FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(CAST(1 AS STRING) AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 117 +-- !query SELECT 
cast(1 as boolean) / cast('1' as binary) FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 --- !query 118 +-- !query SELECT cast(1 as boolean) / cast(1 as boolean) FROM t --- !query 118 schema +-- !query schema struct<> --- !query 118 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS BOOLEAN))' due to data type mismatch: '(CAST(1 AS BOOLEAN) / CAST(1 AS BOOLEAN))' requires (double or decimal) type, not boolean; line 1 pos 7 --- !query 119 +-- !query SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 --- !query 120 +-- !query SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 --- !query 121 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as tinyint) FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS TINYINT))' due to data 
type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS TINYINT))' (timestamp and tinyint).; line 1 pos 7 --- !query 122 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as smallint) FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS SMALLINT))' (timestamp and smallint).; line 1 pos 7 --- !query 123 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as int) FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS INT))' (timestamp and int).; line 1 pos 7 --- !query 124 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as bigint) FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BIGINT))' (timestamp and bigint).; line 1 pos 7 --- !query 125 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as float) FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS FLOAT))' (timestamp and float).; line 1 pos 7 --- !query 
126 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as double) FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 127 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 128 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as string) FROM t --- !query 128 schema +-- !query schema struct<> --- !query 128 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(CAST(1 AS STRING) AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 129 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('1' as binary) FROM t --- !query 129 schema +-- !query schema struct<> --- !query 129 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS BINARY))' (timestamp and binary).; line 1 pos 7 --- !query 130 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as 
boolean) FROM t --- !query 130 schema +-- !query schema struct<> --- !query 130 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BOOLEAN))' (timestamp and boolean).; line 1 pos 7 --- !query 131 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 131 schema +-- !query schema struct<> --- !query 131 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' requires (double or decimal) type, not timestamp; line 1 pos 7 --- !query 132 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 132 schema +-- !query schema struct<> --- !query 132 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00' AS DATE))' (timestamp and date).; line 1 pos 7 --- !query 133 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as tinyint) FROM t --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS TINYINT))' (date and tinyint).; line 1 pos 7 --- !query 134 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as smallint) FROM t --- !query 134 
schema +-- !query schema struct<> --- !query 134 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS SMALLINT))' (date and smallint).; line 1 pos 7 --- !query 135 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as int) FROM t --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS INT))' (date and int).; line 1 pos 7 --- !query 136 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as bigint) FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BIGINT))' (date and bigint).; line 1 pos 7 --- !query 137 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as float) FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS FLOAT))' (date and float).; line 1 pos 7 --- !query 138 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as double) FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 
09:30:00' AS DATE) / CAST(1 AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 139 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t --- !query 139 schema +-- !query schema struct<> --- !query 139 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 --- !query 140 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as string) FROM t --- !query 140 schema +-- !query schema struct<> --- !query 140 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(CAST(1 AS STRING) AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 141 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast('1' as binary) FROM t --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS BINARY))' (date and binary).; line 1 pos 7 --- !query 142 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as boolean) FROM t --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BOOLEAN))' (date and boolean).; line 1 pos 7 --- !query 143 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00.0' as 
timestamp) FROM t --- !query 143 schema +-- !query schema struct<> --- !query 143 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (date and timestamp).; line 1 pos 7 --- !query 144 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00' as date) FROM t --- !query 144 schema +-- !query schema struct<> --- !query 144 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00' AS DATE))' requires (double or decimal) type, not date; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out index b62e1b6826045..5e335df904a3d 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 6 --- !query 0 +-- !query SELECT elt(2, col1, col2, col3, col4, col5) col FROM ( SELECT @@ -13,9 +13,9 @@ FROM ( CAST(id AS DOUBLE) col5 FROM range(10) ) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 0 1 2 @@ -28,7 +28,7 @@ struct 9 --- !query 1 +-- !query SELECT elt(3, col1, col2, col3, col4) col FROM ( SELECT @@ -38,9 +38,9 @@ FROM ( encode(string(id + 3), 'utf-8') col4 FROM range(10) ) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 10 11 2 @@ -53,15 +53,15 @@ struct 9 --- !query 2 +-- !query set spark.sql.function.eltOutputAsString=true --- !query 2 schema +-- !query schema struct --- 
!query 2 output +-- !query output spark.sql.function.eltOutputAsString true --- !query 3 +-- !query SELECT elt(1, col1, col2) col FROM ( SELECT @@ -69,9 +69,9 @@ FROM ( encode(string(id + 1), 'utf-8') col2 FROM range(10) ) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 0 1 2 @@ -84,15 +84,15 @@ struct 9 --- !query 4 +-- !query set spark.sql.function.eltOutputAsString=false --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output spark.sql.function.eltOutputAsString false --- !query 5 +-- !query SELECT elt(2, col1, col2) col FROM ( SELECT @@ -100,9 +100,9 @@ FROM ( encode(string(id + 1), 'utf-8') col2 FROM range(10) ) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 10 2 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out index 7097027872707..bb49d296eaada 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out @@ -2,1231 +2,1231 @@ -- Number of queries: 145 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as tinyint)) FROM t --- !query 1 schema +-- !query schema struct<(IF(true, CAST(1 AS TINYINT), CAST(2 AS TINYINT))):tinyint> --- !query 1 output +-- !query output 1 --- !query 2 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as smallint)) FROM t --- !query 2 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS SMALLINT), CAST(2 AS SMALLINT))):smallint> --- !query 2 output +-- !query output 1 --- !query 3 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as int)) FROM t --- !query 3 schema +-- !query schema 
struct<(IF(true, CAST(CAST(1 AS TINYINT) AS INT), CAST(2 AS INT))):int> --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as bigint)) FROM t --- !query 4 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS BIGINT), CAST(2 AS BIGINT))):bigint> --- !query 4 output +-- !query output 1 --- !query 5 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as float)) FROM t --- !query 5 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS FLOAT), CAST(2 AS FLOAT))):float> --- !query 5 output +-- !query output 1.0 --- !query 6 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as double)) FROM t --- !query 6 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 6 output +-- !query output 1.0 --- !query 7 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as decimal(10, 0))) FROM t --- !query 7 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> --- !query 7 output +-- !query output 1 --- !query 8 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as string)) FROM t --- !query 8 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS TINYINT) AS STRING), CAST(2 AS STRING))):string> --- !query 8 output +-- !query output 1 --- !query 9 +-- !query SELECT IF(true, cast(1 as tinyint), cast('2' as binary)) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2' AS BINARY)))' (tinyint and binary).; line 1 pos 7 --- !query 10 +-- !query SELECT IF(true, cast(1 as tinyint), cast(2 as boolean)) FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException 
cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST(2 AS BOOLEAN)))' (tinyint and boolean).; line 1 pos 7 --- !query 11 +-- !query SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (tinyint and timestamp).; line 1 pos 7 --- !query 12 +-- !query SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' (tinyint and date).; line 1 pos 7 --- !query 13 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as tinyint)) FROM t --- !query 13 schema +-- !query schema struct<(IF(true, CAST(1 AS SMALLINT), CAST(CAST(2 AS TINYINT) AS SMALLINT))):smallint> --- !query 13 output +-- !query output 1 --- !query 14 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as smallint)) FROM t --- !query 14 schema +-- !query schema struct<(IF(true, CAST(1 AS SMALLINT), CAST(2 AS SMALLINT))):smallint> --- !query 14 output +-- !query output 1 --- !query 15 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as int)) FROM t --- !query 15 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS INT), CAST(2 AS INT))):int> --- !query 15 output +-- !query output 1 --- !query 16 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as bigint)) FROM t --- !query 16 
schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS BIGINT), CAST(2 AS BIGINT))):bigint> --- !query 16 output +-- !query output 1 --- !query 17 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as float)) FROM t --- !query 17 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS FLOAT), CAST(2 AS FLOAT))):float> --- !query 17 output +-- !query output 1.0 --- !query 18 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as double)) FROM t --- !query 18 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 18 output +-- !query output 1.0 --- !query 19 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as decimal(10, 0))) FROM t --- !query 19 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> --- !query 19 output +-- !query output 1 --- !query 20 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as string)) FROM t --- !query 20 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS STRING), CAST(2 AS STRING))):string> --- !query 20 output +-- !query output 1 --- !query 21 +-- !query SELECT IF(true, cast(1 as smallint), cast('2' as binary)) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2' AS BINARY)))' (smallint and binary).; line 1 pos 7 --- !query 22 +-- !query SELECT IF(true, cast(1 as smallint), cast(2 as boolean)) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST(2 AS BOOLEAN)))' (smallint and 
boolean).; line 1 pos 7 --- !query 23 +-- !query SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (smallint and timestamp).; line 1 pos 7 --- !query 24 +-- !query SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' (smallint and date).; line 1 pos 7 --- !query 25 +-- !query SELECT IF(true, cast(1 as int), cast(2 as tinyint)) FROM t --- !query 25 schema +-- !query schema struct<(IF(true, CAST(1 AS INT), CAST(CAST(2 AS TINYINT) AS INT))):int> --- !query 25 output +-- !query output 1 --- !query 26 +-- !query SELECT IF(true, cast(1 as int), cast(2 as smallint)) FROM t --- !query 26 schema +-- !query schema struct<(IF(true, CAST(1 AS INT), CAST(CAST(2 AS SMALLINT) AS INT))):int> --- !query 26 output +-- !query output 1 --- !query 27 +-- !query SELECT IF(true, cast(1 as int), cast(2 as int)) FROM t --- !query 27 schema +-- !query schema struct<(IF(true, CAST(1 AS INT), CAST(2 AS INT))):int> --- !query 27 output +-- !query output 1 --- !query 28 +-- !query SELECT IF(true, cast(1 as int), cast(2 as bigint)) FROM t --- !query 28 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS INT) AS BIGINT), CAST(2 AS BIGINT))):bigint> --- !query 28 output +-- !query output 1 --- !query 29 +-- !query SELECT IF(true, cast(1 as int), cast(2 as float)) FROM 
t --- !query 29 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS INT) AS FLOAT), CAST(2 AS FLOAT))):float> --- !query 29 output +-- !query output 1.0 --- !query 30 +-- !query SELECT IF(true, cast(1 as int), cast(2 as double)) FROM t --- !query 30 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS INT) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 30 output +-- !query output 1.0 --- !query 31 +-- !query SELECT IF(true, cast(1 as int), cast(2 as decimal(10, 0))) FROM t --- !query 31 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS INT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> --- !query 31 output +-- !query output 1 --- !query 32 +-- !query SELECT IF(true, cast(1 as int), cast(2 as string)) FROM t --- !query 32 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS INT) AS STRING), CAST(2 AS STRING))):string> --- !query 32 output +-- !query output 1 --- !query 33 +-- !query SELECT IF(true, cast(1 as int), cast('2' as binary)) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS INT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2' AS BINARY)))' (int and binary).; line 1 pos 7 --- !query 34 +-- !query SELECT IF(true, cast(1 as int), cast(2 as boolean)) FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS INT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST(2 AS BOOLEAN)))' (int and boolean).; line 1 pos 7 --- !query 35 +-- !query SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS INT), 
CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (int and timestamp).; line 1 pos 7 --- !query 36 +-- !query SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' (int and date).; line 1 pos 7 --- !query 37 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as tinyint)) FROM t --- !query 37 schema +-- !query schema struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS TINYINT) AS BIGINT))):bigint> --- !query 37 output +-- !query output 1 --- !query 38 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as smallint)) FROM t --- !query 38 schema +-- !query schema struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS SMALLINT) AS BIGINT))):bigint> --- !query 38 output +-- !query output 1 --- !query 39 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as int)) FROM t --- !query 39 schema +-- !query schema struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS INT) AS BIGINT))):bigint> --- !query 39 output +-- !query output 1 --- !query 40 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as bigint)) FROM t --- !query 40 schema +-- !query schema struct<(IF(true, CAST(1 AS BIGINT), CAST(2 AS BIGINT))):bigint> --- !query 40 output +-- !query output 1 --- !query 41 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as float)) FROM t --- !query 41 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS BIGINT) AS FLOAT), CAST(2 AS FLOAT))):float> --- !query 41 output +-- !query output 1.0 --- !query 42 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as double)) FROM t --- !query 42 schema +-- !query schema 
struct<(IF(true, CAST(CAST(1 AS BIGINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 42 output +-- !query output 1.0 --- !query 43 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as decimal(10, 0))) FROM t --- !query 43 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)), CAST(CAST(2 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):decimal(20,0)> --- !query 43 output +-- !query output 1 --- !query 44 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as string)) FROM t --- !query 44 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS BIGINT) AS STRING), CAST(2 AS STRING))):string> --- !query 44 output +-- !query output 1 --- !query 45 +-- !query SELECT IF(true, cast(1 as bigint), cast('2' as binary)) FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2' AS BINARY)))' (bigint and binary).; line 1 pos 7 --- !query 46 +-- !query SELECT IF(true, cast(1 as bigint), cast(2 as boolean)) FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST(2 AS BOOLEAN)))' (bigint and boolean).; line 1 pos 7 --- !query 47 +-- !query SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (bigint and timestamp).; line 1 pos 7 --- !query 48 
+-- !query SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' (bigint and date).; line 1 pos 7 --- !query 49 +-- !query SELECT IF(true, cast(1 as float), cast(2 as tinyint)) FROM t --- !query 49 schema +-- !query schema struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS TINYINT) AS FLOAT))):float> --- !query 49 output +-- !query output 1.0 --- !query 50 +-- !query SELECT IF(true, cast(1 as float), cast(2 as smallint)) FROM t --- !query 50 schema +-- !query schema struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS SMALLINT) AS FLOAT))):float> --- !query 50 output +-- !query output 1.0 --- !query 51 +-- !query SELECT IF(true, cast(1 as float), cast(2 as int)) FROM t --- !query 51 schema +-- !query schema struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS INT) AS FLOAT))):float> --- !query 51 output +-- !query output 1.0 --- !query 52 +-- !query SELECT IF(true, cast(1 as float), cast(2 as bigint)) FROM t --- !query 52 schema +-- !query schema struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS BIGINT) AS FLOAT))):float> --- !query 52 output +-- !query output 1.0 --- !query 53 +-- !query SELECT IF(true, cast(1 as float), cast(2 as float)) FROM t --- !query 53 schema +-- !query schema struct<(IF(true, CAST(1 AS FLOAT), CAST(2 AS FLOAT))):float> --- !query 53 output +-- !query output 1.0 --- !query 54 +-- !query SELECT IF(true, cast(1 as float), cast(2 as double)) FROM t --- !query 54 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 54 output +-- !query output 1.0 --- !query 55 +-- !query SELECT IF(true, cast(1 as float), cast(2 as decimal(10, 0))) FROM t --- 
!query 55 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(2 AS DECIMAL(10,0)) AS DOUBLE))):double> --- !query 55 output +-- !query output 1.0 --- !query 56 +-- !query SELECT IF(true, cast(1 as float), cast(2 as string)) FROM t --- !query 56 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS FLOAT) AS STRING), CAST(2 AS STRING))):string> --- !query 56 output +-- !query output 1.0 --- !query 57 +-- !query SELECT IF(true, cast(1 as float), cast('2' as binary)) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2' AS BINARY)))' (float and binary).; line 1 pos 7 --- !query 58 +-- !query SELECT IF(true, cast(1 as float), cast(2 as boolean)) FROM t --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST(2 AS BOOLEAN)))' (float and boolean).; line 1 pos 7 --- !query 59 +-- !query SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (float and timestamp).; line 1 pos 7 --- !query 60 +-- !query SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS 
FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' (float and date).; line 1 pos 7 --- !query 61 +-- !query SELECT IF(true, cast(1 as double), cast(2 as tinyint)) FROM t --- !query 61 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS TINYINT) AS DOUBLE))):double> --- !query 61 output +-- !query output 1.0 --- !query 62 +-- !query SELECT IF(true, cast(1 as double), cast(2 as smallint)) FROM t --- !query 62 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS SMALLINT) AS DOUBLE))):double> --- !query 62 output +-- !query output 1.0 --- !query 63 +-- !query SELECT IF(true, cast(1 as double), cast(2 as int)) FROM t --- !query 63 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS INT) AS DOUBLE))):double> --- !query 63 output +-- !query output 1.0 --- !query 64 +-- !query SELECT IF(true, cast(1 as double), cast(2 as bigint)) FROM t --- !query 64 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS BIGINT) AS DOUBLE))):double> --- !query 64 output +-- !query output 1.0 --- !query 65 +-- !query SELECT IF(true, cast(1 as double), cast(2 as float)) FROM t --- !query 65 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS FLOAT) AS DOUBLE))):double> --- !query 65 output +-- !query output 1.0 --- !query 66 +-- !query SELECT IF(true, cast(1 as double), cast(2 as double)) FROM t --- !query 66 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 66 output +-- !query output 1.0 --- !query 67 +-- !query SELECT IF(true, cast(1 as double), cast(2 as decimal(10, 0))) FROM t --- !query 67 schema +-- !query schema struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS DECIMAL(10,0)) AS DOUBLE))):double> --- !query 67 output +-- !query output 1.0 --- !query 68 +-- !query SELECT IF(true, cast(1 as double), cast(2 as 
string)) FROM t --- !query 68 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS DOUBLE) AS STRING), CAST(2 AS STRING))):string> --- !query 68 output +-- !query output 1.0 --- !query 69 +-- !query SELECT IF(true, cast(1 as double), cast('2' as binary)) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST('2' AS BINARY)))' (double and binary).; line 1 pos 7 --- !query 70 +-- !query SELECT IF(true, cast(1 as double), cast(2 as boolean)) FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST(2 AS BOOLEAN)))' (double and boolean).; line 1 pos 7 --- !query 71 +-- !query SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (double and timestamp).; line 1 pos 7 --- !query 72 +-- !query SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' (double and date).; line 1 pos 7 --- !query 73 +-- !query SELECT IF(true, 
cast(1 as decimal(10, 0)), cast(2 as tinyint)) FROM t --- !query 73 schema +-- !query schema struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS TINYINT) AS DECIMAL(10,0)))):decimal(10,0)> --- !query 73 output +-- !query output 1 --- !query 74 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as smallint)) FROM t --- !query 74 schema +-- !query schema struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS SMALLINT) AS DECIMAL(10,0)))):decimal(10,0)> --- !query 74 output +-- !query output 1 --- !query 75 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as int)) FROM t --- !query 75 schema +-- !query schema struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS INT) AS DECIMAL(10,0)))):decimal(10,0)> --- !query 75 output +-- !query output 1 --- !query 76 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as bigint)) FROM t --- !query 76 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)), CAST(CAST(2 AS BIGINT) AS DECIMAL(20,0)))):decimal(20,0)> --- !query 76 output +-- !query output 1 --- !query 77 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as float)) FROM t --- !query 77 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(2 AS FLOAT) AS DOUBLE))):double> --- !query 77 output +-- !query output 1.0 --- !query 78 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as double)) FROM t --- !query 78 schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(2 AS DOUBLE))):double> --- !query 78 output +-- !query output 1.0 --- !query 79 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as decimal(10, 0))) FROM t --- !query 79 schema +-- !query schema struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> --- !query 79 output +-- !query output 1 --- !query 80 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as string)) FROM t --- !query 80 
schema +-- !query schema struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS STRING), CAST(2 AS STRING))):string> --- !query 80 output +-- !query output 1 --- !query 81 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast('2' as binary)) FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2' AS BINARY)))' (decimal(10,0) and binary).; line 1 pos 7 --- !query 82 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as boolean)) FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS BOOLEAN)))' (decimal(10,0) and boolean).; line 1 pos 7 --- !query 83 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (decimal(10,0) and timestamp).; line 1 pos 7 --- !query 84 +-- !query SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS 
DATE)))' (decimal(10,0) and date).; line 1 pos 7 --- !query 85 +-- !query SELECT IF(true, cast(1 as string), cast(2 as tinyint)) FROM t --- !query 85 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS TINYINT) AS STRING))):string> --- !query 85 output +-- !query output 1 --- !query 86 +-- !query SELECT IF(true, cast(1 as string), cast(2 as smallint)) FROM t --- !query 86 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS SMALLINT) AS STRING))):string> --- !query 86 output +-- !query output 1 --- !query 87 +-- !query SELECT IF(true, cast(1 as string), cast(2 as int)) FROM t --- !query 87 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS INT) AS STRING))):string> --- !query 87 output +-- !query output 1 --- !query 88 +-- !query SELECT IF(true, cast(1 as string), cast(2 as bigint)) FROM t --- !query 88 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS BIGINT) AS STRING))):string> --- !query 88 output +-- !query output 1 --- !query 89 +-- !query SELECT IF(true, cast(1 as string), cast(2 as float)) FROM t --- !query 89 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS FLOAT) AS STRING))):string> --- !query 89 output +-- !query output 1 --- !query 90 +-- !query SELECT IF(true, cast(1 as string), cast(2 as double)) FROM t --- !query 90 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS DOUBLE) AS STRING))):string> --- !query 90 output +-- !query output 1 --- !query 91 +-- !query SELECT IF(true, cast(1 as string), cast(2 as decimal(10, 0))) FROM t --- !query 91 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS DECIMAL(10,0)) AS STRING))):string> --- !query 91 output +-- !query output 1 --- !query 92 +-- !query SELECT IF(true, cast(1 as string), cast(2 as string)) FROM t --- !query 92 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(2 AS STRING))):string> --- !query 92 output +-- 
!query output 1 --- !query 93 +-- !query SELECT IF(true, cast(1 as string), cast('2' as binary)) FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS STRING), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS STRING), CAST('2' AS BINARY)))' (string and binary).; line 1 pos 7 --- !query 94 +-- !query SELECT IF(true, cast(1 as string), cast(2 as boolean)) FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS STRING), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS STRING), CAST(2 AS BOOLEAN)))' (string and boolean).; line 1 pos 7 --- !query 95 +-- !query SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 95 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):string> --- !query 95 output +-- !query output 1 --- !query 96 +-- !query SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 96 schema +-- !query schema struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):string> --- !query 96 output +-- !query output 1 --- !query 97 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as tinyint)) FROM t --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS TINYINT)))' (binary and tinyint).; line 1 pos 7 --- !query 98 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as smallint)) FROM t --- !query 98 schema +-- !query schema struct<> --- !query 98 output 
+-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS SMALLINT)))' (binary and smallint).; line 1 pos 7 --- !query 99 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as int)) FROM t --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS INT)))' (binary and int).; line 1 pos 7 --- !query 100 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as bigint)) FROM t --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS BIGINT)))' (binary and bigint).; line 1 pos 7 --- !query 101 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as float)) FROM t --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS FLOAT)))' (binary and float).; line 1 pos 7 --- !query 102 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as double)) FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS DOUBLE)))' (binary and double).; line 1 pos 7 --- !query 103 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as 
decimal(10, 0))) FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS DECIMAL(10,0))))' (binary and decimal(10,0)).; line 1 pos 7 --- !query 104 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as string)) FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS STRING)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS STRING)))' (binary and string).; line 1 pos 7 --- !query 105 +-- !query SELECT IF(true, cast('1' as binary), cast('2' as binary)) FROM t --- !query 105 schema +-- !query schema struct<(IF(true, CAST(1 AS BINARY), CAST(2 AS BINARY))):binary> --- !query 105 output +-- !query output 1 --- !query 106 +-- !query SELECT IF(true, cast('1' as binary), cast(2 as boolean)) FROM t --- !query 106 schema +-- !query schema struct<> --- !query 106 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS BOOLEAN)))' (binary and boolean).; line 1 pos 7 --- !query 107 +-- !query SELECT IF(true, cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 107 schema +-- !query schema struct<> --- !query 107 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (binary and timestamp).; line 1 pos 7 --- !query 108 +-- !query SELECT IF(true, 
cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' (binary and date).; line 1 pos 7 --- !query 109 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as tinyint)) FROM t --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS TINYINT)))' (boolean and tinyint).; line 1 pos 7 --- !query 110 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as smallint)) FROM t --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS SMALLINT)))' (boolean and smallint).; line 1 pos 7 --- !query 111 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as int)) FROM t --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS INT)))' (boolean and int).; line 1 pos 7 --- !query 112 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as bigint)) FROM t --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BIGINT)))' due to data type mismatch: differing 
types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BIGINT)))' (boolean and bigint).; line 1 pos 7 --- !query 113 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as float)) FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS FLOAT)))' (boolean and float).; line 1 pos 7 --- !query 114 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as double)) FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DOUBLE)))' (boolean and double).; line 1 pos 7 --- !query 115 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as decimal(10, 0))) FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DECIMAL(10,0))))' (boolean and decimal(10,0)).; line 1 pos 7 --- !query 116 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as string)) FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS STRING)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS STRING)))' (boolean and string).; line 1 pos 7 --- !query 117 +-- !query SELECT IF(true, cast(1 as boolean), cast('2' as binary)) FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2' AS BINARY)))' (boolean and binary).; line 1 pos 7 --- !query 118 +-- !query SELECT IF(true, cast(1 as boolean), cast(2 as boolean)) FROM t --- !query 118 schema +-- !query schema struct<(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BOOLEAN))):boolean> --- !query 118 output +-- !query output true --- !query 119 +-- !query SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (boolean and timestamp).; line 1 pos 7 --- !query 120 +-- !query SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' (boolean and date).; line 1 pos 7 --- !query 121 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as tinyint)) FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS TINYINT)))' (timestamp and tinyint).; line 1 pos 7 --- !query 122 +-- !query SELECT IF(true, 
cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as smallint)) FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS SMALLINT)))' (timestamp and smallint).; line 1 pos 7 --- !query 123 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as int)) FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS INT)))' (timestamp and int).; line 1 pos 7 --- !query 124 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as bigint)) FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BIGINT)))' (timestamp and bigint).; line 1 pos 7 --- !query 125 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as float)) FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS FLOAT)))' (timestamp and float).; line 1 pos 7 --- !query 126 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), 
cast(2 as double)) FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DOUBLE)))' (timestamp and double).; line 1 pos 7 --- !query 127 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as decimal(10, 0))) FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DECIMAL(10,0))))' (timestamp and decimal(10,0)).; line 1 pos 7 --- !query 128 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as string)) FROM t --- !query 128 schema +-- !query schema struct<(IF(true, CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING), CAST(2 AS STRING))):string> --- !query 128 output +-- !query output 2017-12-12 09:30:00 --- !query 129 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2' as binary)) FROM t --- !query 129 schema +-- !query schema struct<> --- !query 129 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('2' AS BINARY)))' (timestamp and binary).; line 1 pos 7 --- !query 130 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as boolean)) FROM t --- !query 130 schema +-- !query schema struct<> --- !query 130 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BOOLEAN)))' (timestamp and boolean).; line 1 pos 7 --- !query 131 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 131 schema +-- !query schema struct<(IF(true, CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):timestamp> --- !query 131 output +-- !query output 2017-12-12 09:30:00 --- !query 132 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 132 schema +-- !query schema struct<(IF(true, CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):timestamp> --- !query 132 output +-- !query output 2017-12-12 09:30:00 --- !query 133 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as tinyint)) FROM t --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS TINYINT)))' (date and tinyint).; line 1 pos 7 --- !query 134 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as smallint)) FROM t --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS SMALLINT)))' (date and smallint).; line 1 pos 7 --- !query 135 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as int)) FROM t --- !query 
135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS INT)))' (date and int).; line 1 pos 7 --- !query 136 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as bigint)) FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BIGINT)))' (date and bigint).; line 1 pos 7 --- !query 137 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as float)) FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS FLOAT)))' (date and float).; line 1 pos 7 --- !query 138 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as double)) FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DOUBLE)))' (date and double).; line 1 pos 7 --- !query 139 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as decimal(10, 0))) FROM t --- !query 139 schema +-- !query schema struct<> --- !query 139 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, 
CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DECIMAL(10,0))))' (date and decimal(10,0)).; line 1 pos 7 --- !query 140 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as string)) FROM t --- !query 140 schema +-- !query schema struct<(IF(true, CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING), CAST(2 AS STRING))):string> --- !query 140 output +-- !query output 2017-12-12 --- !query 141 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2' as binary)) FROM t --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST('2' AS BINARY)))' (date and binary).; line 1 pos 7 --- !query 142 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as boolean)) FROM t --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BOOLEAN)))' (date and boolean).; line 1 pos 7 --- !query 143 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 143 schema +-- !query schema struct<(IF(true, CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):timestamp> --- !query 143 output +-- !query output 2017-12-12 00:00:00 --- !query 144 +-- !query SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 144 schema +-- !query schema struct<(IF(true, 
CAST(2017-12-12 09:30:00 AS DATE), CAST(2017-12-11 09:30:00 AS DATE))):date> --- !query 144 output +-- !query output 2017-12-12 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out index 44fa48e2697b3..f841adf89612e 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out @@ -2,353 +2,353 @@ -- Number of queries: 44 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT 1 + '2' FROM t --- !query 1 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(2 AS DOUBLE)):double> --- !query 1 output +-- !query output 3.0 --- !query 2 +-- !query SELECT 1 - '2' FROM t --- !query 2 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(2 AS DOUBLE)):double> --- !query 2 output +-- !query output -1.0 --- !query 3 +-- !query SELECT 1 * '2' FROM t --- !query 3 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(2 AS DOUBLE)):double> --- !query 3 output +-- !query output 2.0 --- !query 4 +-- !query SELECT 4 / '2' FROM t --- !query 4 schema +-- !query schema struct<(CAST(4 AS DOUBLE) / CAST(CAST(2 AS DOUBLE) AS DOUBLE)):double> --- !query 4 output +-- !query output 2.0 --- !query 5 +-- !query SELECT 1.1 + '2' FROM t --- !query 5 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) + CAST(2 AS DOUBLE)):double> --- !query 5 output +-- !query output 3.1 --- !query 6 +-- !query SELECT 1.1 - '2' FROM t --- !query 6 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) - CAST(2 AS DOUBLE)):double> --- !query 6 output +-- !query output -0.8999999999999999 --- !query 7 +-- !query SELECT 1.1 * '2' FROM t --- !query 7 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) * 
CAST(2 AS DOUBLE)):double> --- !query 7 output +-- !query output 2.2 --- !query 8 +-- !query SELECT 4.4 / '2' FROM t --- !query 8 schema +-- !query schema struct<(CAST(4.4 AS DOUBLE) / CAST(2 AS DOUBLE)):double> --- !query 8 output +-- !query output 2.2 --- !query 9 +-- !query SELECT 1.1 + '2.2' FROM t --- !query 9 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) + CAST(2.2 AS DOUBLE)):double> --- !query 9 output +-- !query output 3.3000000000000003 --- !query 10 +-- !query SELECT 1.1 - '2.2' FROM t --- !query 10 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) - CAST(2.2 AS DOUBLE)):double> --- !query 10 output +-- !query output -1.1 --- !query 11 +-- !query SELECT 1.1 * '2.2' FROM t --- !query 11 schema +-- !query schema struct<(CAST(1.1 AS DOUBLE) * CAST(2.2 AS DOUBLE)):double> --- !query 11 output +-- !query output 2.4200000000000004 --- !query 12 +-- !query SELECT 4.4 / '2.2' FROM t --- !query 12 schema +-- !query schema struct<(CAST(4.4 AS DOUBLE) / CAST(2.2 AS DOUBLE)):double> --- !query 12 output +-- !query output 2.0 --- !query 13 +-- !query SELECT '$' || cast(1 as smallint) || '$' FROM t --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output $1$ --- !query 14 +-- !query SELECT '$' || 1 || '$' FROM t --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output $1$ --- !query 15 +-- !query SELECT '$' || cast(1 as bigint) || '$' FROM t --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output $1$ --- !query 16 +-- !query SELECT '$' || cast(1.1 as float) || '$' FROM t --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output $1.1$ --- !query 17 +-- !query SELECT '$' || cast(1.1 as double) || '$' FROM t --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output $1.1$ --- !query 18 +-- !query SELECT '$' || 1.1 || '$' FROM t --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output $1.1$ --- 
!query 19 +-- !query SELECT '$' || cast(1.1 as decimal(8,3)) || '$' FROM t --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output $1.100$ --- !query 20 +-- !query SELECT '$' || 'abcd' || '$' FROM t --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output $abcd$ --- !query 21 +-- !query SELECT '$' || date('1996-09-09') || '$' FROM t --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output $1996-09-09$ --- !query 22 +-- !query SELECT '$' || timestamp('1996-09-09 10:11:12.4' )|| '$' FROM t --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output $1996-09-09 10:11:12.4$ --- !query 23 +-- !query SELECT length(cast(1 as smallint)) FROM t --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 1 --- !query 24 +-- !query SELECT length(cast(1 as int)) FROM t --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1 --- !query 25 +-- !query SELECT length(cast(1 as bigint)) FROM t --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 --- !query 26 +-- !query SELECT length(cast(1.1 as float)) FROM t --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 3 --- !query 27 +-- !query SELECT length(cast(1.1 as double)) FROM t --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 3 --- !query 28 +-- !query SELECT length(1.1) FROM t --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 3 --- !query 29 +-- !query SELECT length(cast(1.1 as decimal(8,3))) FROM t --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 5 --- !query 30 +-- !query SELECT length('four') FROM t --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 4 --- !query 31 +-- !query SELECT length(date('1996-09-10')) FROM t --- !query 31 schema +-- !query schema struct 
--- !query 31 output +-- !query output 10 --- !query 32 +-- !query SELECT length(timestamp('1996-09-10 10:11:12.4')) FROM t --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 21 --- !query 33 +-- !query SELECT year( '1996-01-10') FROM t --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 1996 --- !query 34 +-- !query SELECT month( '1996-01-10') FROM t --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 1 --- !query 35 +-- !query SELECT day( '1996-01-10') FROM t --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 10 --- !query 36 +-- !query SELECT hour( '10:11:12') FROM t --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 10 --- !query 37 +-- !query SELECT minute( '10:11:12') FROM t --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 11 --- !query 38 +-- !query SELECT second( '10:11:12') FROM t --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 12 --- !query 39 +-- !query select 1 like '%' FROM t --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output true --- !query 40 +-- !query select date('1996-09-10') like '19%' FROM t --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output true --- !query 41 +-- !query select '1' like 1 FROM t --- !query 41 schema +-- !query schema struct<1 LIKE CAST(1 AS STRING):boolean> --- !query 41 output +-- !query output true --- !query 42 +-- !query select '1 ' like 1 FROM t --- !query 42 schema +-- !query schema struct<1 LIKE CAST(1 AS STRING):boolean> --- !query 42 output +-- !query output false --- !query 43 +-- !query select '1996-09-10' like date('1996-09-10') FROM t --- !query 43 schema +-- !query schema struct<1996-09-10 LIKE CAST(CAST(1996-09-10 AS DATE) AS STRING):boolean> --- !query 43 output +-- !query output true diff --git 
a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out index 875ccc1341ec4..21d0a0e0fef4e 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out @@ -2,2453 +2,2453 @@ -- Number of queries: 289 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint)) FROM t --- !query 1 schema +-- !query schema struct<(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT))):boolean> --- !query 1 output +-- !query output true --- !query 2 +-- !query SELECT cast(1 as tinyint) in (cast(1 as smallint)) FROM t --- !query 2 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS SMALLINT) IN (CAST(CAST(1 AS SMALLINT) AS SMALLINT))):boolean> --- !query 2 output +-- !query output true --- !query 3 +-- !query SELECT cast(1 as tinyint) in (cast(1 as int)) FROM t --- !query 3 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS INT) IN (CAST(CAST(1 AS INT) AS INT))):boolean> --- !query 3 output +-- !query output true --- !query 4 +-- !query SELECT cast(1 as tinyint) in (cast(1 as bigint)) FROM t --- !query 4 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 4 output +-- !query output true --- !query 5 +-- !query SELECT cast(1 as tinyint) in (cast(1 as float)) FROM t --- !query 5 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 5 output +-- !query output true --- !query 6 +-- !query SELECT cast(1 as tinyint) in (cast(1 as double)) FROM t --- !query 6 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) IN 
(CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 6 output +-- !query output true --- !query 7 +-- !query SELECT cast(1 as tinyint) in (cast(1 as decimal(10, 0))) FROM t --- !query 7 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 7 output +-- !query output true --- !query 8 +-- !query SELECT cast(1 as tinyint) in (cast(1 as string)) FROM t --- !query 8 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 8 output +-- !query output true --- !query 9 +-- !query SELECT cast(1 as tinyint) in (cast('1' as binary)) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: tinyint != binary; line 1 pos 26 --- !query 10 +-- !query SELECT cast(1 as tinyint) in (cast(1 as boolean)) FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: tinyint != boolean; line 1 pos 26 --- !query 11 +-- !query SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: tinyint != timestamp; line 1 pos 26 --- !query 12 +-- !query SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: tinyint != date; line 1 pos 26 --- !query 13 +-- !query SELECT cast(1 as smallint) in (cast(1 as tinyint)) FROM t --- !query 13 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS SMALLINT) IN (CAST(CAST(1 AS TINYINT) AS SMALLINT))):boolean> --- !query 13 output +-- !query output true --- !query 14 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint)) FROM t --- !query 14 schema +-- !query schema struct<(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT))):boolean> --- !query 14 output +-- !query output true --- !query 15 +-- !query SELECT cast(1 as smallint) in (cast(1 as int)) FROM t --- !query 15 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS INT) IN (CAST(CAST(1 AS INT) AS INT))):boolean> --- !query 15 output +-- !query output true --- !query 16 +-- !query SELECT cast(1 as smallint) in (cast(1 as bigint)) FROM t --- !query 16 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 16 output +-- !query output true --- !query 17 +-- !query SELECT cast(1 as smallint) in (cast(1 as float)) FROM t --- !query 17 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 17 output +-- !query output true --- !query 18 +-- !query SELECT cast(1 as smallint) in (cast(1 as double)) FROM t --- !query 18 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 18 output +-- !query output true --- !query 19 +-- !query SELECT cast(1 as smallint) in (cast(1 as decimal(10, 0))) FROM t --- !query 19 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 19 output +-- !query 
output true --- !query 20 +-- !query SELECT cast(1 as smallint) in (cast(1 as string)) FROM t --- !query 20 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 20 output +-- !query output true --- !query 21 +-- !query SELECT cast(1 as smallint) in (cast('1' as binary)) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: smallint != binary; line 1 pos 27 --- !query 22 +-- !query SELECT cast(1 as smallint) in (cast(1 as boolean)) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: smallint != boolean; line 1 pos 27 --- !query 23 +-- !query SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: smallint != timestamp; line 1 pos 27 --- !query 24 +-- !query SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: smallint != date; line 1 pos 27 --- !query 25 +-- !query SELECT cast(1 as int) in (cast(1 as tinyint)) FROM t --- !query 25 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS INT) IN 
(CAST(CAST(1 AS TINYINT) AS INT))):boolean> --- !query 25 output +-- !query output true --- !query 26 +-- !query SELECT cast(1 as int) in (cast(1 as smallint)) FROM t --- !query 26 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS SMALLINT) AS INT))):boolean> --- !query 26 output +-- !query output true --- !query 27 +-- !query SELECT cast(1 as int) in (cast(1 as int)) FROM t --- !query 27 schema +-- !query schema struct<(CAST(1 AS INT) IN (CAST(1 AS INT))):boolean> --- !query 27 output +-- !query output true --- !query 28 +-- !query SELECT cast(1 as int) in (cast(1 as bigint)) FROM t --- !query 28 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 28 output +-- !query output true --- !query 29 +-- !query SELECT cast(1 as int) in (cast(1 as float)) FROM t --- !query 29 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 29 output +-- !query output true --- !query 30 +-- !query SELECT cast(1 as int) in (cast(1 as double)) FROM t --- !query 30 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 30 output +-- !query output true --- !query 31 +-- !query SELECT cast(1 as int) in (cast(1 as decimal(10, 0))) FROM t --- !query 31 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 31 output +-- !query output true --- !query 32 +-- !query SELECT cast(1 as int) in (cast(1 as string)) FROM t --- !query 32 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 32 output +-- !query output true --- !query 33 +-- !query SELECT cast(1 as int) in (cast('1' as binary)) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: int != binary; line 1 pos 22 --- !query 34 +-- !query SELECT cast(1 as int) in (cast(1 as boolean)) FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: int != boolean; line 1 pos 22 --- !query 35 +-- !query SELECT cast(1 as int) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: int != timestamp; line 1 pos 22 --- !query 36 +-- !query SELECT cast(1 as int) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: int != date; line 1 pos 22 --- !query 37 +-- !query SELECT cast(1 as bigint) in (cast(1 as tinyint)) FROM t --- !query 37 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS TINYINT) AS BIGINT))):boolean> --- !query 37 output +-- !query output true --- !query 38 +-- !query SELECT cast(1 as bigint) in (cast(1 as smallint)) FROM t --- !query 38 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS SMALLINT) AS BIGINT))):boolean> --- !query 38 output +-- !query output true --- !query 39 +-- !query SELECT cast(1 as bigint) in (cast(1 as int)) FROM t --- !query 39 schema +-- !query schema struct<(CAST(CAST(1 AS 
BIGINT) AS BIGINT) IN (CAST(CAST(1 AS INT) AS BIGINT))):boolean> --- !query 39 output +-- !query output true --- !query 40 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint)) FROM t --- !query 40 schema +-- !query schema struct<(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT))):boolean> --- !query 40 output +-- !query output true --- !query 41 +-- !query SELECT cast(1 as bigint) in (cast(1 as float)) FROM t --- !query 41 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 41 output +-- !query output true --- !query 42 +-- !query SELECT cast(1 as bigint) in (cast(1 as double)) FROM t --- !query 42 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 42 output +-- !query output true --- !query 43 +-- !query SELECT cast(1 as bigint) in (cast(1 as decimal(10, 0))) FROM t --- !query 43 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> --- !query 43 output +-- !query output true --- !query 44 +-- !query SELECT cast(1 as bigint) in (cast(1 as string)) FROM t --- !query 44 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 44 output +-- !query output true --- !query 45 +-- !query SELECT cast(1 as bigint) in (cast('1' as binary)) FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: bigint != binary; line 1 pos 25 --- !query 46 +-- !query SELECT cast(1 as bigint) in (cast(1 as boolean)) FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS 
BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: bigint != boolean; line 1 pos 25 --- !query 47 +-- !query SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: bigint != timestamp; line 1 pos 25 --- !query 48 +-- !query SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: bigint != date; line 1 pos 25 --- !query 49 +-- !query SELECT cast(1 as float) in (cast(1 as tinyint)) FROM t --- !query 49 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS TINYINT) AS FLOAT))):boolean> --- !query 49 output +-- !query output true --- !query 50 +-- !query SELECT cast(1 as float) in (cast(1 as smallint)) FROM t --- !query 50 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS SMALLINT) AS FLOAT))):boolean> --- !query 50 output +-- !query output true --- !query 51 +-- !query SELECT cast(1 as float) in (cast(1 as int)) FROM t --- !query 51 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS INT) AS FLOAT))):boolean> --- !query 51 output +-- !query output true --- !query 52 +-- !query SELECT cast(1 as float) in (cast(1 as bigint)) FROM t --- !query 52 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS BIGINT) AS FLOAT))):boolean> --- !query 52 output +-- !query output true --- !query 53 +-- !query SELECT cast(1 as float) in (cast(1 as float)) FROM 
t --- !query 53 schema +-- !query schema struct<(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT))):boolean> --- !query 53 output +-- !query output true --- !query 54 +-- !query SELECT cast(1 as float) in (cast(1 as double)) FROM t --- !query 54 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 54 output +-- !query output true --- !query 55 +-- !query SELECT cast(1 as float) in (cast(1 as decimal(10, 0))) FROM t --- !query 55 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 55 output +-- !query output true --- !query 56 +-- !query SELECT cast(1 as float) in (cast(1 as string)) FROM t --- !query 56 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 56 output +-- !query output false --- !query 57 +-- !query SELECT cast(1 as float) in (cast('1' as binary)) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: float != binary; line 1 pos 24 --- !query 58 +-- !query SELECT cast(1 as float) in (cast(1 as boolean)) FROM t --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: float != boolean; line 1 pos 24 --- !query 59 +-- !query SELECT cast(1 as float) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be 
same type but were: float != timestamp; line 1 pos 24 --- !query 60 +-- !query SELECT cast(1 as float) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: float != date; line 1 pos 24 --- !query 61 +-- !query SELECT cast(1 as double) in (cast(1 as tinyint)) FROM t --- !query 61 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS TINYINT) AS DOUBLE))):boolean> --- !query 61 output +-- !query output true --- !query 62 +-- !query SELECT cast(1 as double) in (cast(1 as smallint)) FROM t --- !query 62 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS SMALLINT) AS DOUBLE))):boolean> --- !query 62 output +-- !query output true --- !query 63 +-- !query SELECT cast(1 as double) in (cast(1 as int)) FROM t --- !query 63 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS INT) AS DOUBLE))):boolean> --- !query 63 output +-- !query output true --- !query 64 +-- !query SELECT cast(1 as double) in (cast(1 as bigint)) FROM t --- !query 64 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS BIGINT) AS DOUBLE))):boolean> --- !query 64 output +-- !query output true --- !query 65 +-- !query SELECT cast(1 as double) in (cast(1 as float)) FROM t --- !query 65 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 65 output +-- !query output true --- !query 66 +-- !query SELECT cast(1 as double) in (cast(1 as double)) FROM t --- !query 66 schema +-- !query schema struct<(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE))):boolean> --- !query 66 output +-- !query output true --- !query 67 +-- !query SELECT cast(1 as double) in (cast(1 
as decimal(10, 0))) FROM t --- !query 67 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 67 output +-- !query output true --- !query 68 +-- !query SELECT cast(1 as double) in (cast(1 as string)) FROM t --- !query 68 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 68 output +-- !query output false --- !query 69 +-- !query SELECT cast(1 as double) in (cast('1' as binary)) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: double != binary; line 1 pos 25 --- !query 70 +-- !query SELECT cast(1 as double) in (cast(1 as boolean)) FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: double != boolean; line 1 pos 25 --- !query 71 +-- !query SELECT cast(1 as double) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: double != timestamp; line 1 pos 25 --- !query 72 +-- !query SELECT cast(1 as double) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: double != date; line 
1 pos 25 --- !query 73 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as tinyint)) FROM t --- !query 73 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)))):boolean> --- !query 73 output +-- !query output true --- !query 74 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as smallint)) FROM t --- !query 74 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)))):boolean> --- !query 74 output +-- !query output true --- !query 75 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as int)) FROM t --- !query 75 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> --- !query 75 output +-- !query output true --- !query 76 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as bigint)) FROM t --- !query 76 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> --- !query 76 output +-- !query output true --- !query 77 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as float)) FROM t --- !query 77 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 77 output +-- !query output true --- !query 78 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as double)) FROM t --- !query 78 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 78 output +-- !query output true --- !query 79 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0))) FROM t --- !query 79 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)))):boolean> --- !query 79 output +-- !query output true --- !query 80 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 
as string)) FROM t --- !query 80 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 80 output +-- !query output true --- !query 81 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast('1' as binary)) FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != binary; line 1 pos 33 --- !query 82 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as boolean)) FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != boolean; line 1 pos 33 --- !query 83 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != timestamp; line 1 pos 33 --- !query 84 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != date; line 1 pos 33 --- !query 85 +-- !query SELECT cast(1 as string) in (cast(1 as tinyint)) FROM t --- !query 85 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN 
(CAST(CAST(1 AS TINYINT) AS STRING))):boolean> --- !query 85 output +-- !query output true --- !query 86 +-- !query SELECT cast(1 as string) in (cast(1 as smallint)) FROM t --- !query 86 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS SMALLINT) AS STRING))):boolean> --- !query 86 output +-- !query output true --- !query 87 +-- !query SELECT cast(1 as string) in (cast(1 as int)) FROM t --- !query 87 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS INT) AS STRING))):boolean> --- !query 87 output +-- !query output true --- !query 88 +-- !query SELECT cast(1 as string) in (cast(1 as bigint)) FROM t --- !query 88 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS BIGINT) AS STRING))):boolean> --- !query 88 output +-- !query output true --- !query 89 +-- !query SELECT cast(1 as string) in (cast(1 as float)) FROM t --- !query 89 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS FLOAT) AS STRING))):boolean> --- !query 89 output +-- !query output false --- !query 90 +-- !query SELECT cast(1 as string) in (cast(1 as double)) FROM t --- !query 90 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS DOUBLE) AS STRING))):boolean> --- !query 90 output +-- !query output false --- !query 91 +-- !query SELECT cast(1 as string) in (cast(1 as decimal(10, 0))) FROM t --- !query 91 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS STRING))):boolean> --- !query 91 output +-- !query output true --- !query 92 +-- !query SELECT cast(1 as string) in (cast(1 as string)) FROM t --- !query 92 schema +-- !query schema struct<(CAST(1 AS STRING) IN (CAST(1 AS STRING))):boolean> --- !query 92 output +-- !query output true --- !query 93 +-- !query SELECT cast(1 as string) in (cast('1' as binary)) FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 
output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS STRING) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: string != binary; line 1 pos 25 --- !query 94 +-- !query SELECT cast(1 as string) in (cast(1 as boolean)) FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: string != boolean; line 1 pos 25 --- !query 95 +-- !query SELECT cast(1 as string) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 95 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):boolean> --- !query 95 output +-- !query output false --- !query 96 +-- !query SELECT cast(1 as string) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 96 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):boolean> --- !query 96 output +-- !query output false --- !query 97 +-- !query SELECT cast('1' as binary) in (cast(1 as tinyint)) FROM t --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: binary != tinyint; line 1 pos 27 --- !query 98 +-- !query SELECT cast('1' as binary) in (cast(1 as smallint)) FROM t --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: binary != smallint; line 1 pos 27 --- !query 99 +-- !query SELECT cast('1' as binary) in (cast(1 as int)) 
FROM t --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: binary != int; line 1 pos 27 --- !query 100 +-- !query SELECT cast('1' as binary) in (cast(1 as bigint)) FROM t --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: binary != bigint; line 1 pos 27 --- !query 101 +-- !query SELECT cast('1' as binary) in (cast(1 as float)) FROM t --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: binary != float; line 1 pos 27 --- !query 102 +-- !query SELECT cast('1' as binary) in (cast(1 as double)) FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: binary != double; line 1 pos 27 --- !query 103 +-- !query SELECT cast('1' as binary) in (cast(1 as decimal(10, 0))) FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: binary != decimal(10,0); line 1 pos 27 --- !query 104 +-- !query SELECT cast('1' as binary) in (cast(1 as string)) FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException 
cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: binary != string; line 1 pos 27 --- !query 105 +-- !query SELECT cast('1' as binary) in (cast('1' as binary)) FROM t --- !query 105 schema +-- !query schema struct<(CAST(1 AS BINARY) IN (CAST(1 AS BINARY))):boolean> --- !query 105 output +-- !query output true --- !query 106 +-- !query SELECT cast('1' as binary) in (cast(1 as boolean)) FROM t --- !query 106 schema +-- !query schema struct<> --- !query 106 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: binary != boolean; line 1 pos 27 --- !query 107 +-- !query SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 107 schema +-- !query schema struct<> --- !query 107 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: binary != timestamp; line 1 pos 27 --- !query 108 +-- !query SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: binary != date; line 1 pos 27 --- !query 109 +-- !query SELECT true in (cast(1 as tinyint)) FROM t --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: boolean != tinyint; line 1 pos 12 --- !query 110 +-- !query SELECT true in (cast(1 as smallint)) FROM 
t --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: boolean != smallint; line 1 pos 12 --- !query 111 +-- !query SELECT true in (cast(1 as int)) FROM t --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: boolean != int; line 1 pos 12 --- !query 112 +-- !query SELECT true in (cast(1 as bigint)) FROM t --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: boolean != bigint; line 1 pos 12 --- !query 113 +-- !query SELECT true in (cast(1 as float)) FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: boolean != float; line 1 pos 12 --- !query 114 +-- !query SELECT true in (cast(1 as double)) FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: boolean != double; line 1 pos 12 --- !query 115 +-- !query SELECT true in (cast(1 as decimal(10, 0))) FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: boolean != decimal(10,0); line 1 pos 12 --- 
!query 116 +-- !query SELECT true in (cast(1 as string)) FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: boolean != string; line 1 pos 12 --- !query 117 +-- !query SELECT true in (cast('1' as binary)) FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: boolean != binary; line 1 pos 12 --- !query 118 +-- !query SELECT true in (cast(1 as boolean)) FROM t --- !query 118 schema +-- !query schema struct<(true IN (CAST(1 AS BOOLEAN))):boolean> --- !query 118 output +-- !query output true --- !query 119 +-- !query SELECT true in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: boolean != timestamp; line 1 pos 12 --- !query 120 +-- !query SELECT true in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(true IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: boolean != date; line 1 pos 12 --- !query 121 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as tinyint)) FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS TINYINT)))' due to data type mismatch: 
Arguments must be same type but were: timestamp != tinyint; line 1 pos 50 --- !query 122 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as smallint)) FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != smallint; line 1 pos 50 --- !query 123 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as int)) FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: timestamp != int; line 1 pos 50 --- !query 124 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as bigint)) FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != bigint; line 1 pos 50 --- !query 125 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as float)) FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: timestamp != float; line 1 pos 50 --- !query 126 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as double)) FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: timestamp != double; line 1 pos 50 --- !query 127 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as decimal(10, 0))) FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: timestamp != decimal(10,0); line 1 pos 50 --- !query 128 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as string)) FROM t --- !query 128 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING) IN (CAST(CAST(2 AS STRING) AS STRING))):boolean> --- !query 128 output +-- !query output false --- !query 129 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2' as binary)) FROM t --- !query 129 schema +-- !query schema struct<> --- !query 129 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: timestamp != binary; line 1 pos 50 --- !query 130 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as boolean)) FROM t --- !query 130 schema +-- !query schema struct<> --- !query 130 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: timestamp != boolean; line 1 pos 50 --- !query 131 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 131 schema +-- !query schema struct<(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) IN 
(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> --- !query 131 output +-- !query output false --- !query 132 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 132 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP) IN (CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):boolean> --- !query 132 output +-- !query output false --- !query 133 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as tinyint)) FROM t --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: date != tinyint; line 1 pos 43 --- !query 134 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as smallint)) FROM t --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: date != smallint; line 1 pos 43 --- !query 135 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as int)) FROM t --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: date != int; line 1 pos 43 --- !query 136 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as bigint)) FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same 
type but were: date != bigint; line 1 pos 43 --- !query 137 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as float)) FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: date != float; line 1 pos 43 --- !query 138 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as double)) FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: date != double; line 1 pos 43 --- !query 139 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as decimal(10, 0))) FROM t --- !query 139 schema +-- !query schema struct<> --- !query 139 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: date != decimal(10,0); line 1 pos 43 --- !query 140 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as string)) FROM t --- !query 140 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING) IN (CAST(CAST(2 AS STRING) AS STRING))):boolean> --- !query 140 output +-- !query output false --- !query 141 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2' as binary)) FROM t --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: date != binary; line 1 pos 43 --- !query 142 +-- !query SELECT 
cast('2017-12-12 09:30:00' as date) in (cast(2 as boolean)) FROM t --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: date != boolean; line 1 pos 43 --- !query 143 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 143 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP) IN (CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP))):boolean> --- !query 143 output +-- !query output false --- !query 144 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00' as date)) FROM t --- !query 144 schema +-- !query schema struct<(CAST(2017-12-12 09:30:00 AS DATE) IN (CAST(2017-12-11 09:30:00 AS DATE))):boolean> --- !query 144 output +-- !query output false --- !query 145 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as tinyint)) FROM t --- !query 145 schema +-- !query schema struct<(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS TINYINT))):boolean> --- !query 145 output +-- !query output true --- !query 146 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as smallint)) FROM t --- !query 146 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS SMALLINT) IN (CAST(CAST(1 AS TINYINT) AS SMALLINT), CAST(CAST(1 AS SMALLINT) AS SMALLINT))):boolean> --- !query 146 output +-- !query output true --- !query 147 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as int)) FROM t --- !query 147 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS INT) IN (CAST(CAST(1 AS TINYINT) AS INT), CAST(CAST(1 AS INT) AS INT))):boolean> --- !query 147 output +-- !query output true --- !query 148 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as bigint)) 
FROM t --- !query 148 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS BIGINT) IN (CAST(CAST(1 AS TINYINT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 148 output +-- !query output true --- !query 149 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as float)) FROM t --- !query 149 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS FLOAT) IN (CAST(CAST(1 AS TINYINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 149 output +-- !query output true --- !query 150 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as double)) FROM t --- !query 150 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) IN (CAST(CAST(1 AS TINYINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 150 output +-- !query output true --- !query 151 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as decimal(10, 0))) FROM t --- !query 151 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 151 output +-- !query output true --- !query 152 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as string)) FROM t --- !query 152 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS STRING) IN (CAST(CAST(1 AS TINYINT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 152 output +-- !query output true --- !query 153 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('1' as binary)) FROM t --- !query 153 schema +-- !query schema struct<> --- !query 153 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: tinyint != binary; line 1 pos 26 --- !query 154 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), 
cast(1 as boolean)) FROM t --- !query 154 schema +-- !query schema struct<> --- !query 154 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: tinyint != boolean; line 1 pos 26 --- !query 155 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 155 schema +-- !query schema struct<> --- !query 155 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: tinyint != timestamp; line 1 pos 26 --- !query 156 +-- !query SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 156 schema +-- !query schema struct<> --- !query 156 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: tinyint != date; line 1 pos 26 --- !query 157 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as tinyint)) FROM t --- !query 157 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS SMALLINT) IN (CAST(CAST(1 AS SMALLINT) AS SMALLINT), CAST(CAST(1 AS TINYINT) AS SMALLINT))):boolean> --- !query 157 output +-- !query output true --- !query 158 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as smallint)) FROM t --- !query 158 schema +-- !query schema struct<(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS SMALLINT))):boolean> --- !query 158 output +-- !query output true --- !query 159 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as int)) FROM t --- !query 159 schema +-- !query schema 
struct<(CAST(CAST(1 AS SMALLINT) AS INT) IN (CAST(CAST(1 AS SMALLINT) AS INT), CAST(CAST(1 AS INT) AS INT))):boolean> --- !query 159 output +-- !query output true --- !query 160 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as bigint)) FROM t --- !query 160 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS BIGINT) IN (CAST(CAST(1 AS SMALLINT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 160 output +-- !query output true --- !query 161 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as float)) FROM t --- !query 161 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS FLOAT) IN (CAST(CAST(1 AS SMALLINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 161 output +-- !query output true --- !query 162 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as double)) FROM t --- !query 162 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) IN (CAST(CAST(1 AS SMALLINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 162 output +-- !query output true --- !query 163 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t --- !query 163 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 163 output +-- !query output true --- !query 164 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as string)) FROM t --- !query 164 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS STRING) IN (CAST(CAST(1 AS SMALLINT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 164 output +-- !query output true --- !query 165 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast('1' as binary)) FROM t --- !query 165 schema +-- !query schema struct<> --- !query 165 output +-- !query output 
org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: smallint != binary; line 1 pos 27 --- !query 166 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as boolean)) FROM t --- !query 166 schema +-- !query schema struct<> --- !query 166 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: smallint != boolean; line 1 pos 27 --- !query 167 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 167 schema +-- !query schema struct<> --- !query 167 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: smallint != timestamp; line 1 pos 27 --- !query 168 +-- !query SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 168 schema +-- !query schema struct<> --- !query 168 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: smallint != date; line 1 pos 27 --- !query 169 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as tinyint)) FROM t --- !query 169 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS INT) AS INT), CAST(CAST(1 AS TINYINT) AS INT))):boolean> --- !query 169 output +-- !query output true --- !query 170 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as smallint)) FROM t --- !query 170 schema +-- !query schema 
struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS INT) AS INT), CAST(CAST(1 AS SMALLINT) AS INT))):boolean> --- !query 170 output +-- !query output true --- !query 171 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as int)) FROM t --- !query 171 schema +-- !query schema struct<(CAST(1 AS INT) IN (CAST(1 AS INT), CAST(1 AS INT))):boolean> --- !query 171 output +-- !query output true --- !query 172 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as bigint)) FROM t --- !query 172 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS BIGINT) IN (CAST(CAST(1 AS INT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> --- !query 172 output +-- !query output true --- !query 173 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as float)) FROM t --- !query 173 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS FLOAT) IN (CAST(CAST(1 AS INT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 173 output +-- !query output true --- !query 174 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as double)) FROM t --- !query 174 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) IN (CAST(CAST(1 AS INT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 174 output +-- !query output true --- !query 175 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as decimal(10, 0))) FROM t --- !query 175 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS INT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> --- !query 175 output +-- !query output true --- !query 176 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as string)) FROM t --- !query 176 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS STRING) IN (CAST(CAST(1 AS INT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 176 output +-- !query output true --- !query 177 +-- !query SELECT cast(1 as int) in (cast(1 as int), 
cast('1' as binary)) FROM t --- !query 177 schema +-- !query schema struct<> --- !query 177 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: int != binary; line 1 pos 22 --- !query 178 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast(1 as boolean)) FROM t --- !query 178 schema +-- !query schema struct<> --- !query 178 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: int != boolean; line 1 pos 22 --- !query 179 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 179 schema +-- !query schema struct<> --- !query 179 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: int != timestamp; line 1 pos 22 --- !query 180 +-- !query SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 180 schema +-- !query schema struct<> --- !query 180 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: int != date; line 1 pos 22 --- !query 181 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as tinyint)) FROM t --- !query 181 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS TINYINT) AS BIGINT))):boolean> --- !query 181 output +-- !query output true --- !query 182 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as smallint)) FROM t 
--- !query 182 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS SMALLINT) AS BIGINT))):boolean> --- !query 182 output +-- !query output true --- !query 183 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as int)) FROM t --- !query 183 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS INT) AS BIGINT))):boolean> --- !query 183 output +-- !query output true --- !query 184 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as bigint)) FROM t --- !query 184 schema +-- !query schema struct<(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BIGINT))):boolean> --- !query 184 output +-- !query output true --- !query 185 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as float)) FROM t --- !query 185 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS FLOAT) IN (CAST(CAST(1 AS BIGINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> --- !query 185 output +-- !query output true --- !query 186 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as double)) FROM t --- !query 186 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) IN (CAST(CAST(1 AS BIGINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 186 output +-- !query output true --- !query 187 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t --- !query 187 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> --- !query 187 output +-- !query output true --- !query 188 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as string)) FROM t --- !query 188 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS STRING) IN (CAST(CAST(1 AS BIGINT) AS STRING), CAST(CAST(1 AS STRING) AS 
STRING))):boolean> --- !query 188 output +-- !query output true --- !query 189 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast('1' as binary)) FROM t --- !query 189 schema +-- !query schema struct<> --- !query 189 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: bigint != binary; line 1 pos 25 --- !query 190 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as boolean)) FROM t --- !query 190 schema +-- !query schema struct<> --- !query 190 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: bigint != boolean; line 1 pos 25 --- !query 191 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 191 schema +-- !query schema struct<> --- !query 191 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: bigint != timestamp; line 1 pos 25 --- !query 192 +-- !query SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 192 schema +-- !query schema struct<> --- !query 192 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: bigint != date; line 1 pos 25 --- !query 193 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as tinyint)) FROM t --- !query 193 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), 
CAST(CAST(1 AS TINYINT) AS FLOAT))):boolean> --- !query 193 output +-- !query output true --- !query 194 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as smallint)) FROM t --- !query 194 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS SMALLINT) AS FLOAT))):boolean> --- !query 194 output +-- !query output true --- !query 195 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as int)) FROM t --- !query 195 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS INT) AS FLOAT))):boolean> --- !query 195 output +-- !query output true --- !query 196 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as bigint)) FROM t --- !query 196 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS BIGINT) AS FLOAT))):boolean> --- !query 196 output +-- !query output true --- !query 197 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as float)) FROM t --- !query 197 schema +-- !query schema struct<(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS FLOAT))):boolean> --- !query 197 output +-- !query output true --- !query 198 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as double)) FROM t --- !query 198 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 198 output +-- !query output true --- !query 199 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as decimal(10, 0))) FROM t --- !query 199 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 199 output +-- !query output true --- !query 200 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as string)) FROM t --- !query 200 
schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS STRING) IN (CAST(CAST(1 AS FLOAT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 200 output +-- !query output true --- !query 201 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast('1' as binary)) FROM t --- !query 201 schema +-- !query schema struct<> --- !query 201 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: float != binary; line 1 pos 24 --- !query 202 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast(1 as boolean)) FROM t --- !query 202 schema +-- !query schema struct<> --- !query 202 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: float != boolean; line 1 pos 24 --- !query 203 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 203 schema +-- !query schema struct<> --- !query 203 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: float != timestamp; line 1 pos 24 --- !query 204 +-- !query SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 204 schema +-- !query schema struct<> --- !query 204 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: float != date; line 1 pos 24 --- !query 205 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as tinyint)) FROM t --- !query 205 
schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS TINYINT) AS DOUBLE))):boolean> --- !query 205 output +-- !query output true --- !query 206 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as smallint)) FROM t --- !query 206 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS SMALLINT) AS DOUBLE))):boolean> --- !query 206 output +-- !query output true --- !query 207 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as int)) FROM t --- !query 207 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS INT) AS DOUBLE))):boolean> --- !query 207 output +-- !query output true --- !query 208 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as bigint)) FROM t --- !query 208 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS BIGINT) AS DOUBLE))):boolean> --- !query 208 output +-- !query output true --- !query 209 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as float)) FROM t --- !query 209 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 209 output +-- !query output true --- !query 210 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as double)) FROM t --- !query 210 schema +-- !query schema struct<(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS DOUBLE))):boolean> --- !query 210 output +-- !query output true --- !query 211 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as decimal(10, 0))) FROM t --- !query 211 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 211 
output +-- !query output true --- !query 212 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as string)) FROM t --- !query 212 schema +-- !query schema struct<(CAST(CAST(1 AS DOUBLE) AS STRING) IN (CAST(CAST(1 AS DOUBLE) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 212 output +-- !query output true --- !query 213 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast('1' as binary)) FROM t --- !query 213 schema +-- !query schema struct<> --- !query 213 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: double != binary; line 1 pos 25 --- !query 214 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast(1 as boolean)) FROM t --- !query 214 schema +-- !query schema struct<> --- !query 214 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: double != boolean; line 1 pos 25 --- !query 215 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 215 schema +-- !query schema struct<> --- !query 215 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: double != timestamp; line 1 pos 25 --- !query 216 +-- !query SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 216 schema +-- !query schema struct<> --- !query 216 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must 
be same type but were: double != date; line 1 pos 25 --- !query 217 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t --- !query 217 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)))):boolean> --- !query 217 output +-- !query output true --- !query 218 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t --- !query 218 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)))):boolean> --- !query 218 output +-- !query output true --- !query 219 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as int)) FROM t --- !query 219 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> --- !query 219 output +-- !query output true --- !query 220 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t --- !query 220 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)), CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> --- !query 220 output +-- !query output true --- !query 221 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as float)) FROM t --- !query 221 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> --- !query 221 output +-- !query output true --- !query 222 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as double)) FROM t --- !query 222 schema +-- !query schema 
struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> --- !query 222 output +-- !query output true --- !query 223 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t --- !query 223 schema +-- !query schema struct<(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS DECIMAL(10,0)))):boolean> --- !query 223 output +-- !query output true --- !query 224 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as string)) FROM t --- !query 224 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS STRING) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 224 output +-- !query output true --- !query 225 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t --- !query 225 schema +-- !query schema struct<> --- !query 225 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != binary; line 1 pos 33 --- !query 226 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t --- !query 226 schema +-- !query schema struct<> --- !query 226 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != boolean; line 1 pos 33 --- !query 227 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 227 schema +-- !query schema struct<> --- !query 227 output +-- !query output org.apache.spark.sql.AnalysisException cannot 
resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != timestamp; line 1 pos 33 --- !query 228 +-- !query SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 228 schema +-- !query schema struct<> --- !query 228 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != date; line 1 pos 33 --- !query 229 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as tinyint)) FROM t --- !query 229 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS TINYINT) AS STRING))):boolean> --- !query 229 output +-- !query output true --- !query 230 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as smallint)) FROM t --- !query 230 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS SMALLINT) AS STRING))):boolean> --- !query 230 output +-- !query output true --- !query 231 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as int)) FROM t --- !query 231 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS INT) AS STRING))):boolean> --- !query 231 output +-- !query output true --- !query 232 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as bigint)) FROM t --- !query 232 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS BIGINT) AS STRING))):boolean> --- !query 232 output +-- !query output true --- !query 233 +-- !query SELECT cast(1 as string) in (cast(1 
as string), cast(1 as float)) FROM t --- !query 233 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS FLOAT) AS STRING))):boolean> --- !query 233 output +-- !query output true --- !query 234 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as double)) FROM t --- !query 234 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS DOUBLE) AS STRING))):boolean> --- !query 234 output +-- !query output true --- !query 235 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as decimal(10, 0))) FROM t --- !query 235 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS DECIMAL(10,0)) AS STRING))):boolean> --- !query 235 output +-- !query output true --- !query 236 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as string)) FROM t --- !query 236 schema +-- !query schema struct<(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS STRING))):boolean> --- !query 236 output +-- !query output true --- !query 237 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast('1' as binary)) FROM t --- !query 237 schema +-- !query schema struct<> --- !query 237 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: string != binary; line 1 pos 25 --- !query 238 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast(1 as boolean)) FROM t --- !query 238 schema +-- !query schema struct<> --- !query 238 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: string != boolean; line 1 pos 25 --- !query 239 +-- !query SELECT 
cast(1 as string) in (cast(1 as string), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 239 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):boolean> --- !query 239 output +-- !query output true --- !query 240 +-- !query SELECT cast(1 as string) in (cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 240 schema +-- !query schema struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):boolean> --- !query 240 output +-- !query output true --- !query 241 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as tinyint)) FROM t --- !query 241 schema +-- !query schema struct<> --- !query 241 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: binary != tinyint; line 1 pos 27 --- !query 242 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as smallint)) FROM t --- !query 242 schema +-- !query schema struct<> --- !query 242 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: binary != smallint; line 1 pos 27 --- !query 243 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as int)) FROM t --- !query 243 schema +-- !query schema struct<> --- !query 243 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: binary != int; line 1 pos 27 --- !query 244 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as 
bigint)) FROM t --- !query 244 schema +-- !query schema struct<> --- !query 244 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: binary != bigint; line 1 pos 27 --- !query 245 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as float)) FROM t --- !query 245 schema +-- !query schema struct<> --- !query 245 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: binary != float; line 1 pos 27 --- !query 246 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as double)) FROM t --- !query 246 schema +-- !query schema struct<> --- !query 246 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: binary != double; line 1 pos 27 --- !query 247 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as decimal(10, 0))) FROM t --- !query 247 schema +-- !query schema struct<> --- !query 247 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: binary != decimal(10,0); line 1 pos 27 --- !query 248 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as string)) FROM t --- !query 248 schema +-- !query schema struct<> --- !query 248 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: binary != string; line 1 pos 27 --- 
!query 249 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast('1' as binary)) FROM t --- !query 249 schema +-- !query schema struct<(CAST(1 AS BINARY) IN (CAST(1 AS BINARY), CAST(1 AS BINARY))):boolean> --- !query 249 output +-- !query output true --- !query 250 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as boolean)) FROM t --- !query 250 schema +-- !query schema struct<> --- !query 250 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: binary != boolean; line 1 pos 27 --- !query 251 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 251 schema +-- !query schema struct<> --- !query 251 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: binary != timestamp; line 1 pos 27 --- !query 252 +-- !query SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 252 schema +-- !query schema struct<> --- !query 252 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: binary != date; line 1 pos 27 --- !query 253 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as tinyint)) FROM t --- !query 253 schema +-- !query schema struct<> --- !query 253 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: boolean != tinyint; 
line 1 pos 28 --- !query 254 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as smallint)) FROM t --- !query 254 schema +-- !query schema struct<> --- !query 254 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: boolean != smallint; line 1 pos 28 --- !query 255 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as int)) FROM t --- !query 255 schema +-- !query schema struct<> --- !query 255 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: boolean != int; line 1 pos 28 --- !query 256 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as bigint)) FROM t --- !query 256 schema +-- !query schema struct<> --- !query 256 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: boolean != bigint; line 1 pos 28 --- !query 257 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as float)) FROM t --- !query 257 schema +-- !query schema struct<> --- !query 257 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: boolean != float; line 1 pos 28 --- !query 258 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as double)) FROM t --- !query 258 schema +-- !query schema struct<> --- !query 258 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DOUBLE)))' due 
to data type mismatch: Arguments must be same type but were: boolean != double; line 1 pos 28 --- !query 259 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as decimal(10, 0))) FROM t --- !query 259 schema +-- !query schema struct<> --- !query 259 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: boolean != decimal(10,0); line 1 pos 28 --- !query 260 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as string)) FROM t --- !query 260 schema +-- !query schema struct<> --- !query 260 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: boolean != string; line 1 pos 28 --- !query 261 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast('1' as binary)) FROM t --- !query 261 schema +-- !query schema struct<> --- !query 261 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: boolean != binary; line 1 pos 28 --- !query 262 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as boolean)) FROM t --- !query 262 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) IN (CAST(1 AS BOOLEAN), CAST(1 AS BOOLEAN))):boolean> --- !query 262 output +-- !query output true --- !query 263 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 263 schema +-- !query schema struct<> --- !query 263 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS 
TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: boolean != timestamp; line 1 pos 28 --- !query 264 +-- !query SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 264 schema +-- !query schema struct<> --- !query 264 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: boolean != date; line 1 pos 28 --- !query 265 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as tinyint)) FROM t --- !query 265 schema +-- !query schema struct<> --- !query 265 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != tinyint; line 1 pos 50 --- !query 266 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as smallint)) FROM t --- !query 266 schema +-- !query schema struct<> --- !query 266 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != smallint; line 1 pos 50 --- !query 267 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as int)) FROM t --- !query 267 schema +-- !query schema struct<> --- !query 267 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS INT)))' due to 
data type mismatch: Arguments must be same type but were: timestamp != int; line 1 pos 50 --- !query 268 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as bigint)) FROM t --- !query 268 schema +-- !query schema struct<> --- !query 268 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != bigint; line 1 pos 50 --- !query 269 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as float)) FROM t --- !query 269 schema +-- !query schema struct<> --- !query 269 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: timestamp != float; line 1 pos 50 --- !query 270 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as double)) FROM t --- !query 270 schema +-- !query schema struct<> --- !query 270 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: timestamp != double; line 1 pos 50 --- !query 271 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t --- !query 271 schema +-- !query schema struct<> --- !query 271 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS 
TIMESTAMP), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: timestamp != decimal(10,0); line 1 pos 50 --- !query 272 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as string)) FROM t --- !query 272 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING) IN (CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 272 output +-- !query output true --- !query 273 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('1' as binary)) FROM t --- !query 273 schema +-- !query schema struct<> --- !query 273 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: timestamp != binary; line 1 pos 50 --- !query 274 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as boolean)) FROM t --- !query 274 schema +-- !query schema struct<> --- !query 274 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: timestamp != boolean; line 1 pos 50 --- !query 275 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 275 schema +-- !query schema struct<(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) IN (CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> --- !query 275 output +-- !query output true --- !query 276 
+-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 276 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP) IN (CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):boolean> --- !query 276 output +-- !query output true --- !query 277 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as tinyint)) FROM t --- !query 277 schema +-- !query schema struct<> --- !query 277 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: date != tinyint; line 1 pos 43 --- !query 278 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as smallint)) FROM t --- !query 278 schema +-- !query schema struct<> --- !query 278 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: date != smallint; line 1 pos 43 --- !query 279 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as int)) FROM t --- !query 279 schema +-- !query schema struct<> --- !query 279 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: date != int; line 1 pos 43 --- !query 280 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as bigint)) FROM t --- 
!query 280 schema +-- !query schema struct<> --- !query 280 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: date != bigint; line 1 pos 43 --- !query 281 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as float)) FROM t --- !query 281 schema +-- !query schema struct<> --- !query 281 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: date != float; line 1 pos 43 --- !query 282 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as double)) FROM t --- !query 282 schema +-- !query schema struct<> --- !query 282 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: date != double; line 1 pos 43 --- !query 283 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as decimal(10, 0))) FROM t --- !query 283 schema +-- !query schema struct<> --- !query 283 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: date != decimal(10,0); line 1 pos 43 --- !query 284 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as string)) FROM t --- !query 284 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) 
AS STRING) IN (CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> --- !query 284 output +-- !query output true --- !query 285 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('1' as binary)) FROM t --- !query 285 schema +-- !query schema struct<> --- !query 285 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: date != binary; line 1 pos 43 --- !query 286 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as boolean)) FROM t --- !query 286 schema +-- !query schema struct<> --- !query 286 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: date != boolean; line 1 pos 43 --- !query 287 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 287 schema +-- !query schema struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP) IN (CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP))):boolean> --- !query 287 output +-- !query output true --- !query 288 +-- !query SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t --- !query 288 schema +-- !query schema struct<(CAST(2017-12-12 09:30:00 AS DATE) IN (CAST(2017-12-12 09:30:00 AS DATE), CAST(2017-12-11 09:30:00 AS DATE))):boolean> --- !query 288 output +-- !query output true diff --git 
a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out index 86a578ca013df..ed7ab5a342c12 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapZipWith.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 16 --- !query 0 +-- !query CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( map(true, false), map(2Y, 1Y), @@ -36,144 +36,144 @@ CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( array_map1, array_map2, struct_map1, struct_map2 ) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT map_zip_with(tinyint_map, smallint_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 1 schema +-- !query schema struct>> --- !query 1 output +-- !query output {2:{"k":2,"v1":1,"v2":1}} --- !query 2 +-- !query SELECT map_zip_with(smallint_map, int_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 2 schema +-- !query schema struct>> --- !query 2 output +-- !query output {2:{"k":2,"v1":1,"v2":1}} --- !query 3 +-- !query SELECT map_zip_with(int_map, bigint_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 3 schema +-- !query schema struct>> --- !query 3 output +-- !query output {2:{"k":2,"v1":1,"v2":1}} --- !query 4 +-- !query SELECT map_zip_with(double_map, float_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 4 schema +-- !query schema struct>> --- !query 4 output +-- !query output {2.0:{"k":2.0,"v1":1.0,"v2":1.0}} --- !query 5 +-- !query SELECT map_zip_with(decimal_map1, decimal_map2, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 
'map_zip_with(various_maps.`decimal_map1`, various_maps.`decimal_map2`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,0), decimal(36,35)].; line 1 pos 7 --- !query 6 +-- !query SELECT map_zip_with(decimal_map1, int_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 6 schema +-- !query schema struct>> --- !query 6 output +-- !query output {2:{"k":2,"v1":null,"v2":1},922337203685477897945456575809789456:{"k":922337203685477897945456575809789456,"v1":922337203685477897945456575809789456,"v2":null}} --- !query 7 +-- !query SELECT map_zip_with(decimal_map1, double_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 7 schema +-- !query schema struct>> --- !query 7 output +-- !query output {2.0:{"k":2.0,"v1":null,"v2":1.0},9.223372036854779E35:{"k":9.223372036854779E35,"v1":922337203685477897945456575809789456,"v2":null}} --- !query 8 +-- !query SELECT map_zip_with(decimal_map2, int_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'map_zip_with(various_maps.`decimal_map2`, various_maps.`int_map`, lambdafunction(named_struct(NamePlaceholder(), k, NamePlaceholder(), v1, NamePlaceholder(), v2), k, v1, v2))' due to argument data type mismatch: The input to function map_zip_with should have been two maps with compatible key types, but the key types are [decimal(36,35), int].; line 1 pos 7 --- !query 9 +-- !query SELECT map_zip_with(decimal_map2, double_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 9 schema +-- !query schema struct>> --- !query 9 output +-- !query output 
{2.0:{"k":2.0,"v1":null,"v2":1.0},9.223372036854778:{"k":9.223372036854778,"v1":9.22337203685477897945456575809789456,"v2":null}} --- !query 10 +-- !query SELECT map_zip_with(string_map1, int_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 10 schema +-- !query schema struct>> --- !query 10 output +-- !query output {"2":{"k":"2","v1":"1","v2":1},"true":{"k":"true","v1":"false","v2":null}} --- !query 11 +-- !query SELECT map_zip_with(string_map2, date_map, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 11 schema +-- !query schema struct>> --- !query 11 output +-- !query output {"2016-03-14":{"k":"2016-03-14","v1":"2016-03-13","v2":2016-03-13}} --- !query 12 +-- !query SELECT map_zip_with(timestamp_map, string_map3, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 12 schema +-- !query schema struct>> --- !query 12 output -{"2016-11-15 20:54:00":{"k":"2016-11-15 20:54:00","v1":2016-11-12 20:54:00.0,"v2":null},"2016-11-15 20:54:00.000":{"k":"2016-11-15 20:54:00.000","v1":null,"v2":"2016-11-12 20:54:00.000"}} +-- !query output +{"2016-11-15 20:54:00":{"k":"2016-11-15 20:54:00","v1":2016-11-12 20:54:00,"v2":null},"2016-11-15 20:54:00.000":{"k":"2016-11-15 20:54:00.000","v1":null,"v2":"2016-11-12 20:54:00.000"}} --- !query 13 +-- !query SELECT map_zip_with(decimal_map1, string_map4, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 13 schema +-- !query schema struct>> --- !query 13 output +-- !query output {"922337203685477897945456575809789456":{"k":"922337203685477897945456575809789456","v1":922337203685477897945456575809789456,"v2":"text"}} --- !query 14 +-- !query SELECT map_zip_with(array_map1, array_map2, (k, v1, v2) -> struct(k, v1, v2)) m FROM various_maps --- !query 14 schema +-- !query schema struct,struct,v1:array,v2:array>>> --- !query 14 output +-- !query output {[1,2]:{"k":[1,2],"v1":[1,2],"v2":[1,2]}} --- !query 15 +-- !query SELECT map_zip_with(struct_map1, struct_map2, (k, v1, v2) -> 
struct(k, v1, v2)) m FROM various_maps --- !query 15 schema +-- !query schema struct,struct,v1:struct,v2:struct>>> --- !query 15 output +-- !query output {{"col1":1,"col2":2}:{"k":{"col1":1,"col2":2},"v1":{"col1":1,"col2":2},"v2":{"col1":1,"col2":2}}} diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out index 79e00860e4c05..fcf1afc72efe9 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/mapconcat.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( map(true, false), map(false, true), map(1Y, 2Y), map(3Y, 4Y), @@ -38,13 +38,13 @@ CREATE TEMPORARY VIEW various_maps AS SELECT * FROM VALUES ( string_int_map1, string_int_map2, int_string_map1, int_string_map2 ) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT map_concat(boolean_map1, boolean_map2) boolean_map, map_concat(tinyint_map1, tinyint_map2) tinyint_map, @@ -62,13 +62,13 @@ SELECT map_concat(string_int_map1, string_int_map2) string_int_map, map_concat(int_string_map1, int_string_map2) int_string_map FROM various_maps --- !query 1 schema +-- !query schema struct,tinyint_map:map,smallint_map:map,int_map:map,bigint_map:map,decimal_map:map,float_map:map,double_map:map,date_map:map,timestamp_map:map,string_map:map,array_map:map,array>,struct_map:map,struct>,string_int_map:map,int_string_map:map> --- !query 1 output -{false:true,true:false} {1:2,3:4} {1:2,3:4} {4:6,7:8} {6:7,8:9} {9223372036854775808:9223372036854775809,9223372036854775809:9223372036854775808} {1.0:2.0,3.0:4.0} {1.0:2.0,3.0:4.0} {2016-03-12:2016-03-11,2016-03-14:2016-03-13} {2016-11-11 20:54:00.0:2016-11-09 20:54:00.0,2016-11-15 20:54:00.0:2016-11-12 20:54:00.0} 
{"a":"b","c":"d"} {["a","b"]:["c","d"],["e"]:["f"]} {{"col1":"a","col2":1}:{"col1":"b","col2":2},{"col1":"c","col2":3}:{"col1":"d","col2":4}} {"a":1,"c":2} {1:"a",2:"c"} +-- !query output +{false:true,true:false} {1:2,3:4} {1:2,3:4} {4:6,7:8} {6:7,8:9} {9223372036854775808:9223372036854775809,9223372036854775809:9223372036854775808} {1.0:2.0,3.0:4.0} {1.0:2.0,3.0:4.0} {2016-03-12:2016-03-11,2016-03-14:2016-03-13} {2016-11-11 20:54:00:2016-11-09 20:54:00,2016-11-15 20:54:00:2016-11-12 20:54:00} {"a":"b","c":"d"} {["a","b"]:["c","d"],["e"]:["f"]} {{"col1":"a","col2":1}:{"col1":"b","col2":2},{"col1":"c","col2":3}:{"col1":"d","col2":4}} {"a":1,"c":2} {1:"a",2:"c"} --- !query 2 +-- !query SELECT map_concat(tinyint_map1, smallint_map2) ts_map, map_concat(smallint_map1, int_map2) si_map, @@ -80,62 +80,62 @@ SELECT map_concat(string_map1, int_map2) sti_map, map_concat(int_string_map1, tinyint_map2) istt_map FROM various_maps --- !query 2 schema +-- !query schema struct,si_map:map,ib_map:map,bd_map:map,df_map:map,std_map:map,tst_map:map,sti_map:map,istt_map:map> --- !query 2 output +-- !query output {1:2,3:4} {1:2,7:8} {4:6,8:9} {6:7,9223372036854775808:9223372036854775809} {3.0:4.0,9.223372036854776E18:9.223372036854776E18} {"2016-03-12":"2016-03-11","a":"b"} {"2016-11-15 20:54:00":"2016-11-12 20:54:00","c":"d"} {"7":"8","a":"b"} {1:"a",3:"4"} --- !query 3 +-- !query SELECT map_concat(tinyint_map1, array_map1) tm_map FROM various_maps --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'map_concat(various_maps.`tinyint_map1`, various_maps.`array_map1`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map,array>]; line 2 pos 4 --- !query 4 +-- !query SELECT map_concat(boolean_map1, int_map2) bi_map FROM various_maps --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException 
cannot resolve 'map_concat(various_maps.`boolean_map1`, various_maps.`int_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map]; line 2 pos 4 --- !query 5 +-- !query SELECT map_concat(int_map1, struct_map2) is_map FROM various_maps --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'map_concat(various_maps.`int_map1`, various_maps.`struct_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map,struct>]; line 2 pos 4 --- !query 6 +-- !query SELECT map_concat(struct_map1, array_map2) ma_map FROM various_maps --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'map_concat(various_maps.`struct_map1`, various_maps.`array_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map,struct>, map,array>]; line 2 pos 4 --- !query 7 +-- !query SELECT map_concat(int_map1, array_map2) ms_map FROM various_maps --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'map_concat(various_maps.`int_map1`, various_maps.`array_map2`)' due to data type mismatch: input to function map_concat should all be the same type, but it's [map, map,array>]; line 2 pos 4 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out index c54ceba85ce79..31353bdedc69f 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out @@ -2,2577 +2,2576 @@ -- Number of queries: 316 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS 
SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT '1' + cast(1 as tinyint) FROM t --- !query 1 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 1 output +-- !query output 2.0 --- !query 2 +-- !query SELECT '1' + cast(1 as smallint) FROM t --- !query 2 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 2 output +-- !query output 2.0 --- !query 3 +-- !query SELECT '1' + cast(1 as int) FROM t --- !query 3 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 3 output +-- !query output 2.0 --- !query 4 +-- !query SELECT '1' + cast(1 as bigint) FROM t --- !query 4 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 4 output +-- !query output 2.0 --- !query 5 +-- !query SELECT '1' + cast(1 as float) FROM t --- !query 5 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 5 output +-- !query output 2.0 --- !query 6 +-- !query SELECT '1' + cast(1 as double) FROM t --- !query 6 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 6 output +-- !query output 2.0 --- !query 7 +-- !query SELECT '1' + cast(1 as decimal(10, 0)) FROM t --- !query 7 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 7 output +-- !query output 2.0 --- !query 8 +-- !query SELECT '1' + '1' FROM t --- !query 8 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 8 output +-- !query output 2.0 --- !query 9 +-- !query SELECT '1' + cast('1' as binary) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) + 
CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 10 +-- !query SELECT '1' + cast(1 as boolean) FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 11 +-- !query SELECT '1' + cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 12 +-- !query SELECT '1' + cast('2017-12-11 09:30:00' as date) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 --- !query 13 +-- !query SELECT '1' - cast(1 as tinyint) FROM t --- !query 13 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 13 output +-- !query output 0.0 --- !query 14 +-- !query SELECT '1' - cast(1 as smallint) FROM t --- !query 14 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> 
--- !query 14 output +-- !query output 0.0 --- !query 15 +-- !query SELECT '1' - cast(1 as int) FROM t --- !query 15 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 15 output +-- !query output 0.0 --- !query 16 +-- !query SELECT '1' - cast(1 as bigint) FROM t --- !query 16 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 16 output +-- !query output 0.0 --- !query 17 +-- !query SELECT '1' - cast(1 as float) FROM t --- !query 17 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 17 output +-- !query output 0.0 --- !query 18 +-- !query SELECT '1' - cast(1 as double) FROM t --- !query 18 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 18 output +-- !query output 0.0 --- !query 19 +-- !query SELECT '1' - cast(1 as decimal(10, 0)) FROM t --- !query 19 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 19 output +-- !query output 0.0 --- !query 20 +-- !query SELECT '1' - '1' FROM t --- !query 20 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 20 output +-- !query output 0.0 --- !query 21 +-- !query SELECT '1' - cast('1' as binary) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 22 +-- !query SELECT '1' - cast(1 as boolean) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - 
CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 23 +-- !query SELECT '1' - cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 +cannot resolve 'subtracttimestamps('1', CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: argument 1 requires timestamp type, however, ''1'' is of string type.; line 1 pos 7 --- !query 24 +-- !query SELECT '1' - cast('2017-12-11 09:30:00' as date) FROM t --- !query 24 schema -struct<> --- !query 24 output -org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 +-- !query schema +struct +-- !query output +NULL --- !query 25 +-- !query SELECT '1' * cast(1 as tinyint) FROM t --- !query 25 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 25 output +-- !query output 1.0 --- !query 26 +-- !query SELECT '1' * cast(1 as smallint) FROM t --- !query 26 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 26 output +-- !query output 1.0 --- !query 27 +-- !query SELECT '1' * cast(1 as int) FROM t --- !query 27 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 27 output +-- !query output 1.0 --- !query 28 +-- !query SELECT '1' * cast(1 as bigint) FROM t --- !query 28 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 28 output +-- 
!query output 1.0 --- !query 29 +-- !query SELECT '1' * cast(1 as float) FROM t --- !query 29 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 29 output +-- !query output 1.0 --- !query 30 +-- !query SELECT '1' * cast(1 as double) FROM t --- !query 30 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 30 output +-- !query output 1.0 --- !query 31 +-- !query SELECT '1' * cast(1 as decimal(10, 0)) FROM t --- !query 31 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 31 output +-- !query output 1.0 --- !query 32 +-- !query SELECT '1' * '1' FROM t --- !query 32 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 32 output +-- !query output 1.0 --- !query 33 +-- !query SELECT '1' * cast('1' as binary) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 34 +-- !query SELECT '1' * cast(1 as boolean) FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 35 +-- !query SELECT '1' * cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('2017-12-11 
09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 36 +-- !query SELECT '1' * cast('2017-12-11 09:30:00' as date) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 --- !query 37 +-- !query SELECT '1' / cast(1 as tinyint) FROM t --- !query 37 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- !query 37 output +-- !query output 1.0 --- !query 38 +-- !query SELECT '1' / cast(1 as smallint) FROM t --- !query 38 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 38 output +-- !query output 1.0 --- !query 39 +-- !query SELECT '1' / cast(1 as int) FROM t --- !query 39 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 39 output +-- !query output 1.0 --- !query 40 +-- !query SELECT '1' / cast(1 as bigint) FROM t --- !query 40 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 40 output +-- !query output 1.0 --- !query 41 +-- !query SELECT '1' / cast(1 as float) FROM t --- !query 41 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 41 output +-- !query output 1.0 --- !query 42 +-- !query SELECT '1' / cast(1 as double) FROM t --- !query 42 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 42 output +-- !query output 1.0 --- !query 43 +-- !query SELECT '1' / cast(1 as decimal(10, 0)) FROM t --- !query 43 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 43 output +-- !query output 1.0 
--- !query 44 +-- !query SELECT '1' / '1' FROM t --- !query 44 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 44 output +-- !query output 1.0 --- !query 45 +-- !query SELECT '1' / cast('1' as binary) FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 46 +-- !query SELECT '1' / cast(1 as boolean) FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 47 +-- !query SELECT '1' / cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 48 +-- !query SELECT '1' / cast('2017-12-11 09:30:00' as date) FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 --- !query 49 +-- !query SELECT '1' % cast(1 as tinyint) FROM t --- !query 49 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> --- 
!query 49 output +-- !query output 0.0 --- !query 50 +-- !query SELECT '1' % cast(1 as smallint) FROM t --- !query 50 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> --- !query 50 output +-- !query output 0.0 --- !query 51 +-- !query SELECT '1' % cast(1 as int) FROM t --- !query 51 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS INT) AS DOUBLE)):double> --- !query 51 output +-- !query output 0.0 --- !query 52 +-- !query SELECT '1' % cast(1 as bigint) FROM t --- !query 52 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> --- !query 52 output +-- !query output 0.0 --- !query 53 +-- !query SELECT '1' % cast(1 as float) FROM t --- !query 53 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> --- !query 53 output +-- !query output 0.0 --- !query 54 +-- !query SELECT '1' % cast(1 as double) FROM t --- !query 54 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 54 output +-- !query output 0.0 --- !query 55 +-- !query SELECT '1' % cast(1 as decimal(10, 0)) FROM t --- !query 55 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> --- !query 55 output +-- !query output 0.0 --- !query 56 +-- !query SELECT '1' % '1' FROM t --- !query 56 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 56 output +-- !query output 0.0 --- !query 57 +-- !query SELECT '1' % cast('1' as binary) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 58 +-- !query SELECT '1' % cast(1 as boolean) FROM t --- !query 58 schema +-- !query schema 
struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 59 +-- !query SELECT '1' % cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 60 +-- !query SELECT '1' % cast('2017-12-11 09:30:00' as date) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 --- !query 61 +-- !query SELECT pmod('1', cast(1 as tinyint)) FROM t --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output 0.0 --- !query 62 +-- !query SELECT pmod('1', cast(1 as smallint)) FROM t --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output 0.0 --- !query 63 +-- !query SELECT pmod('1', cast(1 as int)) FROM t --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output 0.0 --- !query 64 +-- !query SELECT pmod('1', cast(1 as bigint)) FROM t --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output 0.0 --- !query 65 +-- !query SELECT pmod('1', cast(1 as float)) FROM t --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output 0.0 --- !query 66 +-- !query SELECT pmod('1', cast(1 as double)) FROM 
t --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output 0.0 --- !query 67 +-- !query SELECT pmod('1', cast(1 as decimal(10, 0))) FROM t --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output 0.0 --- !query 68 +-- !query SELECT pmod('1', '1') FROM t --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output 0.0 --- !query 69 +-- !query SELECT pmod('1', cast('1' as binary)) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 --- !query 70 +-- !query SELECT pmod('1', cast(1 as boolean)) FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 --- !query 71 +-- !query SELECT pmod('1', cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 --- !query 72 +-- !query SELECT pmod('1', cast('2017-12-11 09:30:00' as date)) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST('1' AS 
DOUBLE), CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 --- !query 73 +-- !query SELECT cast(1 as tinyint) + '1' FROM t --- !query 73 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 73 output +-- !query output 2.0 --- !query 74 +-- !query SELECT cast(1 as smallint) + '1' FROM t --- !query 74 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 74 output +-- !query output 2.0 --- !query 75 +-- !query SELECT cast(1 as int) + '1' FROM t --- !query 75 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 75 output +-- !query output 2.0 --- !query 76 +-- !query SELECT cast(1 as bigint) + '1' FROM t --- !query 76 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 76 output +-- !query output 2.0 --- !query 77 +-- !query SELECT cast(1 as float) + '1' FROM t --- !query 77 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 77 output +-- !query output 2.0 --- !query 78 +-- !query SELECT cast(1 as double) + '1' FROM t --- !query 78 schema +-- !query schema struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 78 output +-- !query output 2.0 --- !query 79 +-- !query SELECT cast(1 as decimal(10, 0)) + '1' FROM t --- !query 79 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> --- !query 79 output +-- !query output 2.0 --- !query 80 +-- !query SELECT cast('1' as binary) + '1' FROM t --- !query 80 schema +-- !query schema struct<> --- !query 80 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 81 +-- !query SELECT 
cast(1 as boolean) + '1' FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) + CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 82 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) + '1' FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 83 +-- !query SELECT cast('2017-12-11 09:30:00' as date) + '1' FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 +cannot resolve 'date_add(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 --- !query 84 +-- !query SELECT cast(1 as tinyint) - '1' FROM t --- !query 84 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 84 output +-- !query output 0.0 --- !query 85 +-- !query SELECT cast(1 as smallint) - '1' FROM t --- !query 85 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 85 output +-- !query output 0.0 --- !query 86 +-- !query SELECT cast(1 as int) - '1' FROM t --- !query 86 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) - 
CAST(1 AS DOUBLE)):double> --- !query 86 output +-- !query output 0.0 --- !query 87 +-- !query SELECT cast(1 as bigint) - '1' FROM t --- !query 87 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 87 output +-- !query output 0.0 --- !query 88 +-- !query SELECT cast(1 as float) - '1' FROM t --- !query 88 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 88 output +-- !query output 0.0 --- !query 89 +-- !query SELECT cast(1 as double) - '1' FROM t --- !query 89 schema +-- !query schema struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 89 output +-- !query output 0.0 --- !query 90 +-- !query SELECT cast(1 as decimal(10, 0)) - '1' FROM t --- !query 90 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> --- !query 90 output +-- !query output 0.0 --- !query 91 +-- !query SELECT cast('1' as binary) - '1' FROM t --- !query 91 schema +-- !query schema struct<> --- !query 91 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 92 +-- !query SELECT cast(1 as boolean) - '1' FROM t --- !query 92 schema +-- !query schema struct<> --- !query 92 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) - CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 93 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) - '1' FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST('1' AS DOUBLE))' due to data 
type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 +cannot resolve 'subtracttimestamps(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), '1')' due to data type mismatch: argument 2 requires timestamp type, however, ''1'' is of string type.; line 1 pos 7 --- !query 94 +-- !query SELECT cast('2017-12-11 09:30:00' as date) - '1' FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 +cannot resolve 'date_sub(CAST('2017-12-11 09:30:00' AS DATE), '1')' due to data type mismatch: argument 2 requires (int or smallint or tinyint) type, however, ''1'' is of string type.; line 1 pos 7 --- !query 95 +-- !query SELECT cast(1 as tinyint) * '1' FROM t --- !query 95 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 95 output +-- !query output 1.0 --- !query 96 +-- !query SELECT cast(1 as smallint) * '1' FROM t --- !query 96 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 96 output +-- !query output 1.0 --- !query 97 +-- !query SELECT cast(1 as int) * '1' FROM t --- !query 97 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 97 output +-- !query output 1.0 --- !query 98 +-- !query SELECT cast(1 as bigint) * '1' FROM t --- !query 98 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 98 output +-- !query output 1.0 --- !query 99 +-- !query SELECT cast(1 as float) * '1' FROM t --- !query 99 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> 
--- !query 99 output +-- !query output 1.0 --- !query 100 +-- !query SELECT cast(1 as double) * '1' FROM t --- !query 100 schema +-- !query schema struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 100 output +-- !query output 1.0 --- !query 101 +-- !query SELECT cast(1 as decimal(10, 0)) * '1' FROM t --- !query 101 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> --- !query 101 output +-- !query output 1.0 --- !query 102 +-- !query SELECT cast('1' as binary) * '1' FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 103 +-- !query SELECT cast(1 as boolean) * '1' FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) * CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 104 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) * '1' FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) * CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 105 +-- !query SELECT cast('2017-12-11 09:30:00' as date) * '1' FROM t --- !query 105 schema +-- !query schema struct<> --- !query 105 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) * CAST('1' AS DOUBLE))' due to data type 
mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) * CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 106 +-- !query SELECT cast(1 as tinyint) / '1' FROM t --- !query 106 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 106 output +-- !query output 1.0 --- !query 107 +-- !query SELECT cast(1 as smallint) / '1' FROM t --- !query 107 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 107 output +-- !query output 1.0 --- !query 108 +-- !query SELECT cast(1 as int) / '1' FROM t --- !query 108 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 108 output +-- !query output 1.0 --- !query 109 +-- !query SELECT cast(1 as bigint) / '1' FROM t --- !query 109 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 109 output +-- !query output 1.0 --- !query 110 +-- !query SELECT cast(1 as float) / '1' FROM t --- !query 110 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> --- !query 110 output +-- !query output 1.0 --- !query 111 +-- !query SELECT cast(1 as double) / '1' FROM t --- !query 111 schema +-- !query schema struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 111 output +-- !query output 1.0 --- !query 112 +-- !query SELECT cast(1 as decimal(10, 0)) / '1' FROM t --- !query 112 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> --- !query 112 output +-- !query output 1.0 --- !query 113 +-- !query SELECT cast('1' as binary) / '1' FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) / CAST('1' AS DOUBLE))' due to 
data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 114 +-- !query SELECT cast(1 as boolean) / '1' FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 115 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) / '1' FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 116 +-- !query SELECT cast('2017-12-11 09:30:00' as date) / '1' FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 117 +-- !query SELECT cast(1 as tinyint) % '1' FROM t --- !query 117 schema +-- !query schema struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 117 output +-- !query output 0.0 --- !query 118 +-- !query SELECT cast(1 as smallint) % '1' FROM t --- !query 118 schema +-- !query schema struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 118 output +-- !query output 0.0 --- !query 119 +-- !query SELECT cast(1 as int) % '1' FROM t --- !query 119 schema +-- !query schema struct<(CAST(CAST(1 AS INT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- 
!query 119 output +-- !query output 0.0 --- !query 120 +-- !query SELECT cast(1 as bigint) % '1' FROM t --- !query 120 schema +-- !query schema struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 120 output +-- !query output 0.0 --- !query 121 +-- !query SELECT cast(1 as float) % '1' FROM t --- !query 121 schema +-- !query schema struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 121 output +-- !query output 0.0 --- !query 122 +-- !query SELECT cast(1 as double) % '1' FROM t --- !query 122 schema +-- !query schema struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 122 output +-- !query output 0.0 --- !query 123 +-- !query SELECT cast(1 as decimal(10, 0)) % '1' FROM t --- !query 123 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> --- !query 123 output +-- !query output 0.0 --- !query 124 +-- !query SELECT cast('1' as binary) % '1' FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('1' AS BINARY) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 125 +-- !query SELECT cast(1 as boolean) % '1' FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST(1 AS BOOLEAN) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) % CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 126 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) % '1' FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST('1' AS DOUBLE))' due to data type 
mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 127 +-- !query SELECT cast('2017-12-11 09:30:00' as date) % '1' FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 128 +-- !query SELECT pmod(cast(1 as tinyint), '1') FROM t --- !query 128 schema +-- !query schema struct --- !query 128 output +-- !query output 0.0 --- !query 129 +-- !query SELECT pmod(cast(1 as smallint), '1') FROM t --- !query 129 schema +-- !query schema struct --- !query 129 output +-- !query output 0.0 --- !query 130 +-- !query SELECT pmod(cast(1 as int), '1') FROM t --- !query 130 schema +-- !query schema struct --- !query 130 output +-- !query output 0.0 --- !query 131 +-- !query SELECT pmod(cast(1 as bigint), '1') FROM t --- !query 131 schema +-- !query schema struct --- !query 131 output +-- !query output 0.0 --- !query 132 +-- !query SELECT pmod(cast(1 as float), '1') FROM t --- !query 132 schema +-- !query schema struct --- !query 132 output +-- !query output 0.0 --- !query 133 +-- !query SELECT pmod(cast(1 as double), '1') FROM t --- !query 133 schema +-- !query schema struct --- !query 133 output +-- !query output 0.0 --- !query 134 +-- !query SELECT pmod(cast(1 as decimal(10, 0)), '1') FROM t --- !query 134 schema +-- !query schema struct --- !query 134 output +-- !query output 0.0 --- !query 135 +-- !query SELECT pmod(cast('1' as binary), '1') FROM t --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('1' AS BINARY), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 
'pmod(CAST('1' AS BINARY), CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 --- !query 136 +-- !query SELECT pmod(cast(1 as boolean), '1') FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST(1 AS BOOLEAN), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS BOOLEAN), CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 --- !query 137 +-- !query SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), '1') FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 --- !query 138 +-- !query SELECT pmod(cast('2017-12-11 09:30:00' as date), '1') FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 --- !query 139 +-- !query SELECT '1' = cast(1 as tinyint) FROM t --- !query 139 schema +-- !query schema struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 139 output +-- !query output true --- !query 140 +-- !query SELECT '1' = cast(1 as smallint) FROM t --- !query 140 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 140 output +-- !query output true --- !query 141 +-- !query SELECT '1' = cast(1 as int) FROM t --- !query 141 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> --- !query 141 output +-- !query output true --- 
!query 142 +-- !query SELECT '1' = cast(1 as bigint) FROM t --- !query 142 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 142 output +-- !query output true --- !query 143 +-- !query SELECT '1' = cast(1 as float) FROM t --- !query 143 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 143 output +-- !query output true --- !query 144 +-- !query SELECT '1' = cast(1 as double) FROM t --- !query 144 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 144 output +-- !query output true --- !query 145 +-- !query SELECT '1' = cast(1 as decimal(10, 0)) FROM t --- !query 145 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 145 output +-- !query output true --- !query 146 +-- !query SELECT '1' = '1' FROM t --- !query 146 schema +-- !query schema struct<(1 = 1):boolean> --- !query 146 output +-- !query output true --- !query 147 +-- !query SELECT '1' = cast('1' as binary) FROM t --- !query 147 schema +-- !query schema struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> --- !query 147 output +-- !query output true --- !query 148 +-- !query SELECT '1' = cast(1 as boolean) FROM t --- !query 148 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN)):boolean> --- !query 148 output +-- !query output true --- !query 149 +-- !query SELECT '1' = cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 149 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) = CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 149 output +-- !query output NULL --- !query 150 +-- !query SELECT '1' = cast('2017-12-11 09:30:00' as date) FROM t --- !query 150 schema +-- !query schema struct<(CAST(1 AS DATE) = CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 150 output +-- !query output NULL --- !query 151 +-- !query SELECT cast(1 as tinyint) = '1' FROM t --- !query 151 schema 
+-- !query schema struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> --- !query 151 output +-- !query output true --- !query 152 +-- !query SELECT cast(1 as smallint) = '1' FROM t --- !query 152 schema +-- !query schema struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> --- !query 152 output +-- !query output true --- !query 153 +-- !query SELECT cast(1 as int) = '1' FROM t --- !query 153 schema +-- !query schema struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> --- !query 153 output +-- !query output true --- !query 154 +-- !query SELECT cast(1 as bigint) = '1' FROM t --- !query 154 schema +-- !query schema struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> --- !query 154 output +-- !query output true --- !query 155 +-- !query SELECT cast(1 as float) = '1' FROM t --- !query 155 schema +-- !query schema struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> --- !query 155 output +-- !query output true --- !query 156 +-- !query SELECT cast(1 as double) = '1' FROM t --- !query 156 schema +-- !query schema struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 156 output +-- !query output true --- !query 157 +-- !query SELECT cast(1 as decimal(10, 0)) = '1' FROM t --- !query 157 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> --- !query 157 output +-- !query output true --- !query 158 +-- !query SELECT cast('1' as binary) = '1' FROM t --- !query 158 schema +-- !query schema struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> --- !query 158 output +-- !query output true --- !query 159 +-- !query SELECT cast(1 as boolean) = '1' FROM t --- !query 159 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN)):boolean> --- !query 159 output +-- !query output true --- !query 160 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) = '1' FROM t --- !query 160 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) = CAST(1 AS 
TIMESTAMP)):boolean> --- !query 160 output +-- !query output NULL --- !query 161 +-- !query SELECT cast('2017-12-11 09:30:00' as date) = '1' FROM t --- !query 161 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) = CAST(1 AS DATE)):boolean> --- !query 161 output +-- !query output NULL --- !query 162 +-- !query SELECT '1' <=> cast(1 as tinyint) FROM t --- !query 162 schema +-- !query schema struct<(CAST(1 AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> --- !query 162 output +-- !query output true --- !query 163 +-- !query SELECT '1' <=> cast(1 as smallint) FROM t --- !query 163 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> --- !query 163 output +-- !query output true --- !query 164 +-- !query SELECT '1' <=> cast(1 as int) FROM t --- !query 164 schema +-- !query schema struct<(CAST(1 AS INT) <=> CAST(1 AS INT)):boolean> --- !query 164 output +-- !query output true --- !query 165 +-- !query SELECT '1' <=> cast(1 as bigint) FROM t --- !query 165 schema +-- !query schema struct<(CAST(1 AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> --- !query 165 output +-- !query output true --- !query 166 +-- !query SELECT '1' <=> cast(1 as float) FROM t --- !query 166 schema +-- !query schema struct<(CAST(1 AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> --- !query 166 output +-- !query output true --- !query 167 +-- !query SELECT '1' <=> cast(1 as double) FROM t --- !query 167 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 167 output +-- !query output true --- !query 168 +-- !query SELECT '1' <=> cast(1 as decimal(10, 0)) FROM t --- !query 168 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 168 output +-- !query output true --- !query 169 +-- !query SELECT '1' <=> '1' FROM t --- !query 169 schema +-- !query schema struct<(1 <=> 1):boolean> --- !query 169 output +-- !query output true --- !query 170 +-- !query SELECT '1' <=> 
cast('1' as binary) FROM t --- !query 170 schema +-- !query schema struct<(CAST(1 AS BINARY) <=> CAST(1 AS BINARY)):boolean> --- !query 170 output +-- !query output true --- !query 171 +-- !query SELECT '1' <=> cast(1 as boolean) FROM t --- !query 171 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) <=> CAST(1 AS BOOLEAN)):boolean> --- !query 171 output +-- !query output true --- !query 172 +-- !query SELECT '1' <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 172 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) <=> CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 172 output +-- !query output false --- !query 173 +-- !query SELECT '1' <=> cast('2017-12-11 09:30:00' as date) FROM t --- !query 173 schema +-- !query schema struct<(CAST(1 AS DATE) <=> CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 173 output +-- !query output false --- !query 174 +-- !query SELECT cast(1 as tinyint) <=> '1' FROM t --- !query 174 schema +-- !query schema struct<(CAST(1 AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> --- !query 174 output +-- !query output true --- !query 175 +-- !query SELECT cast(1 as smallint) <=> '1' FROM t --- !query 175 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> --- !query 175 output +-- !query output true --- !query 176 +-- !query SELECT cast(1 as int) <=> '1' FROM t --- !query 176 schema +-- !query schema struct<(CAST(1 AS INT) <=> CAST(1 AS INT)):boolean> --- !query 176 output +-- !query output true --- !query 177 +-- !query SELECT cast(1 as bigint) <=> '1' FROM t --- !query 177 schema +-- !query schema struct<(CAST(1 AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> --- !query 177 output +-- !query output true --- !query 178 +-- !query SELECT cast(1 as float) <=> '1' FROM t --- !query 178 schema +-- !query schema struct<(CAST(1 AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> --- !query 178 output +-- !query output true --- !query 179 +-- !query SELECT cast(1 as double) <=> '1' FROM t --- 
!query 179 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 179 output +-- !query output true --- !query 180 +-- !query SELECT cast(1 as decimal(10, 0)) <=> '1' FROM t --- !query 180 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> --- !query 180 output +-- !query output true --- !query 181 +-- !query SELECT cast('1' as binary) <=> '1' FROM t --- !query 181 schema +-- !query schema struct<(CAST(1 AS BINARY) <=> CAST(1 AS BINARY)):boolean> --- !query 181 output +-- !query output true --- !query 182 +-- !query SELECT cast(1 as boolean) <=> '1' FROM t --- !query 182 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) <=> CAST(1 AS BOOLEAN)):boolean> --- !query 182 output +-- !query output true --- !query 183 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> '1' FROM t --- !query 183 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) <=> CAST(1 AS TIMESTAMP)):boolean> --- !query 183 output +-- !query output false --- !query 184 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <=> '1' FROM t --- !query 184 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) <=> CAST(1 AS DATE)):boolean> --- !query 184 output +-- !query output false --- !query 185 +-- !query SELECT '1' < cast(1 as tinyint) FROM t --- !query 185 schema +-- !query schema struct<(CAST(1 AS TINYINT) < CAST(1 AS TINYINT)):boolean> --- !query 185 output +-- !query output false --- !query 186 +-- !query SELECT '1' < cast(1 as smallint) FROM t --- !query 186 schema +-- !query schema struct<(CAST(1 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> --- !query 186 output +-- !query output false --- !query 187 +-- !query SELECT '1' < cast(1 as int) FROM t --- !query 187 schema +-- !query schema struct<(CAST(1 AS INT) < CAST(1 AS INT)):boolean> --- !query 187 output +-- !query output false --- !query 188 +-- !query SELECT '1' < cast(1 as bigint) FROM t --- !query 
188 schema +-- !query schema struct<(CAST(1 AS BIGINT) < CAST(1 AS BIGINT)):boolean> --- !query 188 output +-- !query output false --- !query 189 +-- !query SELECT '1' < cast(1 as float) FROM t --- !query 189 schema +-- !query schema struct<(CAST(1 AS FLOAT) < CAST(1 AS FLOAT)):boolean> --- !query 189 output +-- !query output false --- !query 190 +-- !query SELECT '1' < cast(1 as double) FROM t --- !query 190 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 190 output +-- !query output false --- !query 191 +-- !query SELECT '1' < cast(1 as decimal(10, 0)) FROM t --- !query 191 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 191 output +-- !query output false --- !query 192 +-- !query SELECT '1' < '1' FROM t --- !query 192 schema +-- !query schema struct<(1 < 1):boolean> --- !query 192 output +-- !query output false --- !query 193 +-- !query SELECT '1' < cast('1' as binary) FROM t --- !query 193 schema +-- !query schema struct<(CAST(1 AS BINARY) < CAST(1 AS BINARY)):boolean> --- !query 193 output +-- !query output false --- !query 194 +-- !query SELECT '1' < cast(1 as boolean) FROM t --- !query 194 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) < CAST(1 AS BOOLEAN)):boolean> --- !query 194 output +-- !query output false --- !query 195 +-- !query SELECT '1' < cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 195 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) < CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 195 output +-- !query output NULL --- !query 196 +-- !query SELECT '1' < cast('2017-12-11 09:30:00' as date) FROM t --- !query 196 schema +-- !query schema struct<(CAST(1 AS DATE) < CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 196 output +-- !query output NULL --- !query 197 +-- !query SELECT '1' <= cast(1 as tinyint) FROM t --- !query 197 schema +-- !query schema struct<(CAST(1 AS TINYINT) <= CAST(1 AS 
TINYINT)):boolean> --- !query 197 output +-- !query output true --- !query 198 +-- !query SELECT '1' <= cast(1 as smallint) FROM t --- !query 198 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> --- !query 198 output +-- !query output true --- !query 199 +-- !query SELECT '1' <= cast(1 as int) FROM t --- !query 199 schema +-- !query schema struct<(CAST(1 AS INT) <= CAST(1 AS INT)):boolean> --- !query 199 output +-- !query output true --- !query 200 +-- !query SELECT '1' <= cast(1 as bigint) FROM t --- !query 200 schema +-- !query schema struct<(CAST(1 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> --- !query 200 output +-- !query output true --- !query 201 +-- !query SELECT '1' <= cast(1 as float) FROM t --- !query 201 schema +-- !query schema struct<(CAST(1 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> --- !query 201 output +-- !query output true --- !query 202 +-- !query SELECT '1' <= cast(1 as double) FROM t --- !query 202 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 202 output +-- !query output true --- !query 203 +-- !query SELECT '1' <= cast(1 as decimal(10, 0)) FROM t --- !query 203 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 203 output +-- !query output true --- !query 204 +-- !query SELECT '1' <= '1' FROM t --- !query 204 schema +-- !query schema struct<(1 <= 1):boolean> --- !query 204 output +-- !query output true --- !query 205 +-- !query SELECT '1' <= cast('1' as binary) FROM t --- !query 205 schema +-- !query schema struct<(CAST(1 AS BINARY) <= CAST(1 AS BINARY)):boolean> --- !query 205 output +-- !query output true --- !query 206 +-- !query SELECT '1' <= cast(1 as boolean) FROM t --- !query 206 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) <= CAST(1 AS BOOLEAN)):boolean> --- !query 206 output +-- !query output true --- !query 207 +-- !query SELECT '1' <= cast('2017-12-11 09:30:00.0' as timestamp) FROM 
t --- !query 207 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) <= CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 207 output +-- !query output NULL --- !query 208 +-- !query SELECT '1' <= cast('2017-12-11 09:30:00' as date) FROM t --- !query 208 schema +-- !query schema struct<(CAST(1 AS DATE) <= CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 208 output +-- !query output NULL --- !query 209 +-- !query SELECT '1' > cast(1 as tinyint) FROM t --- !query 209 schema +-- !query schema struct<(CAST(1 AS TINYINT) > CAST(1 AS TINYINT)):boolean> --- !query 209 output +-- !query output false --- !query 210 +-- !query SELECT '1' > cast(1 as smallint) FROM t --- !query 210 schema +-- !query schema struct<(CAST(1 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> --- !query 210 output +-- !query output false --- !query 211 +-- !query SELECT '1' > cast(1 as int) FROM t --- !query 211 schema +-- !query schema struct<(CAST(1 AS INT) > CAST(1 AS INT)):boolean> --- !query 211 output +-- !query output false --- !query 212 +-- !query SELECT '1' > cast(1 as bigint) FROM t --- !query 212 schema +-- !query schema struct<(CAST(1 AS BIGINT) > CAST(1 AS BIGINT)):boolean> --- !query 212 output +-- !query output false --- !query 213 +-- !query SELECT '1' > cast(1 as float) FROM t --- !query 213 schema +-- !query schema struct<(CAST(1 AS FLOAT) > CAST(1 AS FLOAT)):boolean> --- !query 213 output +-- !query output false --- !query 214 +-- !query SELECT '1' > cast(1 as double) FROM t --- !query 214 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 214 output +-- !query output false --- !query 215 +-- !query SELECT '1' > cast(1 as decimal(10, 0)) FROM t --- !query 215 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 215 output +-- !query output false --- !query 216 +-- !query SELECT '1' > '1' FROM t --- !query 216 schema +-- !query schema struct<(1 > 1):boolean> --- 
!query 216 output +-- !query output false --- !query 217 +-- !query SELECT '1' > cast('1' as binary) FROM t --- !query 217 schema +-- !query schema struct<(CAST(1 AS BINARY) > CAST(1 AS BINARY)):boolean> --- !query 217 output +-- !query output false --- !query 218 +-- !query SELECT '1' > cast(1 as boolean) FROM t --- !query 218 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) > CAST(1 AS BOOLEAN)):boolean> --- !query 218 output +-- !query output false --- !query 219 +-- !query SELECT '1' > cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 219 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) > CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 219 output +-- !query output NULL --- !query 220 +-- !query SELECT '1' > cast('2017-12-11 09:30:00' as date) FROM t --- !query 220 schema +-- !query schema struct<(CAST(1 AS DATE) > CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 220 output +-- !query output NULL --- !query 221 +-- !query SELECT '1' >= cast(1 as tinyint) FROM t --- !query 221 schema +-- !query schema struct<(CAST(1 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> --- !query 221 output +-- !query output true --- !query 222 +-- !query SELECT '1' >= cast(1 as smallint) FROM t --- !query 222 schema +-- !query schema struct<(CAST(1 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> --- !query 222 output +-- !query output true --- !query 223 +-- !query SELECT '1' >= cast(1 as int) FROM t --- !query 223 schema +-- !query schema struct<(CAST(1 AS INT) >= CAST(1 AS INT)):boolean> --- !query 223 output +-- !query output true --- !query 224 +-- !query SELECT '1' >= cast(1 as bigint) FROM t --- !query 224 schema +-- !query schema struct<(CAST(1 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> --- !query 224 output +-- !query output true --- !query 225 +-- !query SELECT '1' >= cast(1 as float) FROM t --- !query 225 schema +-- !query schema struct<(CAST(1 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> --- !query 225 output +-- !query output true --- !query 226 
+-- !query SELECT '1' >= cast(1 as double) FROM t --- !query 226 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 226 output +-- !query output true --- !query 227 +-- !query SELECT '1' >= cast(1 as decimal(10, 0)) FROM t --- !query 227 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> --- !query 227 output +-- !query output true --- !query 228 +-- !query SELECT '1' >= '1' FROM t --- !query 228 schema +-- !query schema struct<(1 >= 1):boolean> --- !query 228 output +-- !query output true --- !query 229 +-- !query SELECT '1' >= cast('1' as binary) FROM t --- !query 229 schema +-- !query schema struct<(CAST(1 AS BINARY) >= CAST(1 AS BINARY)):boolean> --- !query 229 output +-- !query output true --- !query 230 +-- !query SELECT '1' >= cast(1 as boolean) FROM t --- !query 230 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) >= CAST(1 AS BOOLEAN)):boolean> --- !query 230 output +-- !query output true --- !query 231 +-- !query SELECT '1' >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 231 schema +-- !query schema struct<(CAST(1 AS TIMESTAMP) >= CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> --- !query 231 output +-- !query output NULL --- !query 232 +-- !query SELECT '1' >= cast('2017-12-11 09:30:00' as date) FROM t --- !query 232 schema +-- !query schema struct<(CAST(1 AS DATE) >= CAST(2017-12-11 09:30:00 AS DATE)):boolean> --- !query 232 output +-- !query output NULL --- !query 233 +-- !query SELECT '1' <> cast(1 as tinyint) FROM t --- !query 233 schema +-- !query schema struct<(NOT (CAST(1 AS TINYINT) = CAST(1 AS TINYINT))):boolean> --- !query 233 output +-- !query output false --- !query 234 +-- !query SELECT '1' <> cast(1 as smallint) FROM t --- !query 234 schema +-- !query schema struct<(NOT (CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> --- !query 234 output +-- !query output false --- !query 235 +-- !query SELECT '1' <> cast(1 as int) 
FROM t --- !query 235 schema +-- !query schema struct<(NOT (CAST(1 AS INT) = CAST(1 AS INT))):boolean> --- !query 235 output +-- !query output false --- !query 236 +-- !query SELECT '1' <> cast(1 as bigint) FROM t --- !query 236 schema +-- !query schema struct<(NOT (CAST(1 AS BIGINT) = CAST(1 AS BIGINT))):boolean> --- !query 236 output +-- !query output false --- !query 237 +-- !query SELECT '1' <> cast(1 as float) FROM t --- !query 237 schema +-- !query schema struct<(NOT (CAST(1 AS FLOAT) = CAST(1 AS FLOAT))):boolean> --- !query 237 output +-- !query output false --- !query 238 +-- !query SELECT '1' <> cast(1 as double) FROM t --- !query 238 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 238 output +-- !query output false --- !query 239 +-- !query SELECT '1' <> cast(1 as decimal(10, 0)) FROM t --- !query 239 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> --- !query 239 output +-- !query output false --- !query 240 +-- !query SELECT '1' <> '1' FROM t --- !query 240 schema +-- !query schema struct<(NOT (1 = 1)):boolean> --- !query 240 output +-- !query output false --- !query 241 +-- !query SELECT '1' <> cast('1' as binary) FROM t --- !query 241 schema +-- !query schema struct<(NOT (CAST(1 AS BINARY) = CAST(1 AS BINARY))):boolean> --- !query 241 output +-- !query output false --- !query 242 +-- !query SELECT '1' <> cast(1 as boolean) FROM t --- !query 242 schema +-- !query schema struct<(NOT (CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN))):boolean> --- !query 242 output +-- !query output false --- !query 243 +-- !query SELECT '1' <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 243 schema +-- !query schema struct<(NOT (CAST(1 AS TIMESTAMP) = CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> --- !query 243 output +-- !query output NULL --- !query 244 +-- !query SELECT '1' <> cast('2017-12-11 09:30:00' as date) FROM t --- !query 244 schema +-- 
!query schema struct<(NOT (CAST(1 AS DATE) = CAST(2017-12-11 09:30:00 AS DATE))):boolean> --- !query 244 output +-- !query output NULL --- !query 245 +-- !query SELECT cast(1 as tinyint) < '1' FROM t --- !query 245 schema +-- !query schema struct<(CAST(1 AS TINYINT) < CAST(1 AS TINYINT)):boolean> --- !query 245 output +-- !query output false --- !query 246 +-- !query SELECT cast(1 as smallint) < '1' FROM t --- !query 246 schema +-- !query schema struct<(CAST(1 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> --- !query 246 output +-- !query output false --- !query 247 +-- !query SELECT cast(1 as int) < '1' FROM t --- !query 247 schema +-- !query schema struct<(CAST(1 AS INT) < CAST(1 AS INT)):boolean> --- !query 247 output +-- !query output false --- !query 248 +-- !query SELECT cast(1 as bigint) < '1' FROM t --- !query 248 schema +-- !query schema struct<(CAST(1 AS BIGINT) < CAST(1 AS BIGINT)):boolean> --- !query 248 output +-- !query output false --- !query 249 +-- !query SELECT cast(1 as float) < '1' FROM t --- !query 249 schema +-- !query schema struct<(CAST(1 AS FLOAT) < CAST(1 AS FLOAT)):boolean> --- !query 249 output +-- !query output false --- !query 250 +-- !query SELECT cast(1 as double) < '1' FROM t --- !query 250 schema +-- !query schema struct<(CAST(1 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 250 output +-- !query output false --- !query 251 +-- !query SELECT cast(1 as decimal(10, 0)) < '1' FROM t --- !query 251 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> --- !query 251 output +-- !query output false --- !query 252 +-- !query SELECT '1' < '1' FROM t --- !query 252 schema +-- !query schema struct<(1 < 1):boolean> --- !query 252 output +-- !query output false --- !query 253 +-- !query SELECT cast('1' as binary) < '1' FROM t --- !query 253 schema +-- !query schema struct<(CAST(1 AS BINARY) < CAST(1 AS BINARY)):boolean> --- !query 253 output +-- !query output false --- !query 254 +-- !query 
SELECT cast(1 as boolean) < '1' FROM t --- !query 254 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) < CAST(1 AS BOOLEAN)):boolean> --- !query 254 output +-- !query output false --- !query 255 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) < '1' FROM t --- !query 255 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) < CAST(1 AS TIMESTAMP)):boolean> --- !query 255 output +-- !query output NULL --- !query 256 +-- !query SELECT cast('2017-12-11 09:30:00' as date) < '1' FROM t --- !query 256 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) < CAST(1 AS DATE)):boolean> --- !query 256 output +-- !query output NULL --- !query 257 +-- !query SELECT cast(1 as tinyint) <= '1' FROM t --- !query 257 schema +-- !query schema struct<(CAST(1 AS TINYINT) <= CAST(1 AS TINYINT)):boolean> --- !query 257 output +-- !query output true --- !query 258 +-- !query SELECT cast(1 as smallint) <= '1' FROM t --- !query 258 schema +-- !query schema struct<(CAST(1 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> --- !query 258 output +-- !query output true --- !query 259 +-- !query SELECT cast(1 as int) <= '1' FROM t --- !query 259 schema +-- !query schema struct<(CAST(1 AS INT) <= CAST(1 AS INT)):boolean> --- !query 259 output +-- !query output true --- !query 260 +-- !query SELECT cast(1 as bigint) <= '1' FROM t --- !query 260 schema +-- !query schema struct<(CAST(1 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> --- !query 260 output +-- !query output true --- !query 261 +-- !query SELECT cast(1 as float) <= '1' FROM t --- !query 261 schema +-- !query schema struct<(CAST(1 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> --- !query 261 output +-- !query output true --- !query 262 +-- !query SELECT cast(1 as double) <= '1' FROM t --- !query 262 schema +-- !query schema struct<(CAST(1 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 262 output +-- !query output true --- !query 263 +-- !query SELECT cast(1 as decimal(10, 0)) <= '1' FROM t --- !query 
263 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> --- !query 263 output +-- !query output true --- !query 264 +-- !query SELECT '1' <= '1' FROM t --- !query 264 schema +-- !query schema struct<(1 <= 1):boolean> --- !query 264 output +-- !query output true --- !query 265 +-- !query SELECT cast('1' as binary) <= '1' FROM t --- !query 265 schema +-- !query schema struct<(CAST(1 AS BINARY) <= CAST(1 AS BINARY)):boolean> --- !query 265 output +-- !query output true --- !query 266 +-- !query SELECT cast(1 as boolean) <= '1' FROM t --- !query 266 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) <= CAST(1 AS BOOLEAN)):boolean> --- !query 266 output +-- !query output true --- !query 267 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= '1' FROM t --- !query 267 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) <= CAST(1 AS TIMESTAMP)):boolean> --- !query 267 output +-- !query output NULL --- !query 268 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <= '1' FROM t --- !query 268 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) <= CAST(1 AS DATE)):boolean> --- !query 268 output +-- !query output NULL --- !query 269 +-- !query SELECT cast(1 as tinyint) > '1' FROM t --- !query 269 schema +-- !query schema struct<(CAST(1 AS TINYINT) > CAST(1 AS TINYINT)):boolean> --- !query 269 output +-- !query output false --- !query 270 +-- !query SELECT cast(1 as smallint) > '1' FROM t --- !query 270 schema +-- !query schema struct<(CAST(1 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> --- !query 270 output +-- !query output false --- !query 271 +-- !query SELECT cast(1 as int) > '1' FROM t --- !query 271 schema +-- !query schema struct<(CAST(1 AS INT) > CAST(1 AS INT)):boolean> --- !query 271 output +-- !query output false --- !query 272 +-- !query SELECT cast(1 as bigint) > '1' FROM t --- !query 272 schema +-- !query schema struct<(CAST(1 AS BIGINT) > CAST(1 AS 
BIGINT)):boolean> --- !query 272 output +-- !query output false --- !query 273 +-- !query SELECT cast(1 as float) > '1' FROM t --- !query 273 schema +-- !query schema struct<(CAST(1 AS FLOAT) > CAST(1 AS FLOAT)):boolean> --- !query 273 output +-- !query output false --- !query 274 +-- !query SELECT cast(1 as double) > '1' FROM t --- !query 274 schema +-- !query schema struct<(CAST(1 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 274 output +-- !query output false --- !query 275 +-- !query SELECT cast(1 as decimal(10, 0)) > '1' FROM t --- !query 275 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> --- !query 275 output +-- !query output false --- !query 276 +-- !query SELECT '1' > '1' FROM t --- !query 276 schema +-- !query schema struct<(1 > 1):boolean> --- !query 276 output +-- !query output false --- !query 277 +-- !query SELECT cast('1' as binary) > '1' FROM t --- !query 277 schema +-- !query schema struct<(CAST(1 AS BINARY) > CAST(1 AS BINARY)):boolean> --- !query 277 output +-- !query output false --- !query 278 +-- !query SELECT cast(1 as boolean) > '1' FROM t --- !query 278 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) > CAST(1 AS BOOLEAN)):boolean> --- !query 278 output +-- !query output false --- !query 279 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) > '1' FROM t --- !query 279 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) > CAST(1 AS TIMESTAMP)):boolean> --- !query 279 output +-- !query output NULL --- !query 280 +-- !query SELECT cast('2017-12-11 09:30:00' as date) > '1' FROM t --- !query 280 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) > CAST(1 AS DATE)):boolean> --- !query 280 output +-- !query output NULL --- !query 281 +-- !query SELECT cast(1 as tinyint) >= '1' FROM t --- !query 281 schema +-- !query schema struct<(CAST(1 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> --- !query 281 output +-- !query output true --- 
!query 282 +-- !query SELECT cast(1 as smallint) >= '1' FROM t --- !query 282 schema +-- !query schema struct<(CAST(1 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> --- !query 282 output +-- !query output true --- !query 283 +-- !query SELECT cast(1 as int) >= '1' FROM t --- !query 283 schema +-- !query schema struct<(CAST(1 AS INT) >= CAST(1 AS INT)):boolean> --- !query 283 output +-- !query output true --- !query 284 +-- !query SELECT cast(1 as bigint) >= '1' FROM t --- !query 284 schema +-- !query schema struct<(CAST(1 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> --- !query 284 output +-- !query output true --- !query 285 +-- !query SELECT cast(1 as float) >= '1' FROM t --- !query 285 schema +-- !query schema struct<(CAST(1 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> --- !query 285 output +-- !query output true --- !query 286 +-- !query SELECT cast(1 as double) >= '1' FROM t --- !query 286 schema +-- !query schema struct<(CAST(1 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 286 output +-- !query output true --- !query 287 +-- !query SELECT cast(1 as decimal(10, 0)) >= '1' FROM t --- !query 287 schema +-- !query schema struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> --- !query 287 output +-- !query output true --- !query 288 +-- !query SELECT '1' >= '1' FROM t --- !query 288 schema +-- !query schema struct<(1 >= 1):boolean> --- !query 288 output +-- !query output true --- !query 289 +-- !query SELECT cast('1' as binary) >= '1' FROM t --- !query 289 schema +-- !query schema struct<(CAST(1 AS BINARY) >= CAST(1 AS BINARY)):boolean> --- !query 289 output +-- !query output true --- !query 290 +-- !query SELECT cast(1 as boolean) >= '1' FROM t --- !query 290 schema +-- !query schema struct<(CAST(1 AS BOOLEAN) >= CAST(1 AS BOOLEAN)):boolean> --- !query 290 output +-- !query output true --- !query 291 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= '1' FROM t --- !query 291 schema +-- !query schema struct<(CAST(2017-12-11 
09:30:00.0 AS TIMESTAMP) >= CAST(1 AS TIMESTAMP)):boolean> --- !query 291 output +-- !query output NULL --- !query 292 +-- !query SELECT cast('2017-12-11 09:30:00' as date) >= '1' FROM t --- !query 292 schema +-- !query schema struct<(CAST(2017-12-11 09:30:00 AS DATE) >= CAST(1 AS DATE)):boolean> --- !query 292 output +-- !query output NULL --- !query 293 +-- !query SELECT cast(1 as tinyint) <> '1' FROM t --- !query 293 schema +-- !query schema struct<(NOT (CAST(1 AS TINYINT) = CAST(1 AS TINYINT))):boolean> --- !query 293 output +-- !query output false --- !query 294 +-- !query SELECT cast(1 as smallint) <> '1' FROM t --- !query 294 schema +-- !query schema struct<(NOT (CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> --- !query 294 output +-- !query output false --- !query 295 +-- !query SELECT cast(1 as int) <> '1' FROM t --- !query 295 schema +-- !query schema struct<(NOT (CAST(1 AS INT) = CAST(1 AS INT))):boolean> --- !query 295 output +-- !query output false --- !query 296 +-- !query SELECT cast(1 as bigint) <> '1' FROM t --- !query 296 schema +-- !query schema struct<(NOT (CAST(1 AS BIGINT) = CAST(1 AS BIGINT))):boolean> --- !query 296 output +-- !query output false --- !query 297 +-- !query SELECT cast(1 as float) <> '1' FROM t --- !query 297 schema +-- !query schema struct<(NOT (CAST(1 AS FLOAT) = CAST(1 AS FLOAT))):boolean> --- !query 297 output +-- !query output false --- !query 298 +-- !query SELECT cast(1 as double) <> '1' FROM t --- !query 298 schema +-- !query schema struct<(NOT (CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 298 output +-- !query output false --- !query 299 +-- !query SELECT cast(1 as decimal(10, 0)) <> '1' FROM t --- !query 299 schema +-- !query schema struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> --- !query 299 output +-- !query output false --- !query 300 +-- !query SELECT '1' <> '1' FROM t --- !query 300 schema +-- !query schema struct<(NOT (1 = 1)):boolean> --- !query 300 
output +-- !query output false --- !query 301 +-- !query SELECT cast('1' as binary) <> '1' FROM t --- !query 301 schema +-- !query schema struct<(NOT (CAST(1 AS BINARY) = CAST(1 AS BINARY))):boolean> --- !query 301 output +-- !query output false --- !query 302 +-- !query SELECT cast(1 as boolean) <> '1' FROM t --- !query 302 schema +-- !query schema struct<(NOT (CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN))):boolean> --- !query 302 output +-- !query output false --- !query 303 +-- !query SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> '1' FROM t --- !query 303 schema +-- !query schema struct<(NOT (CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) = CAST(1 AS TIMESTAMP))):boolean> --- !query 303 output +-- !query output NULL --- !query 304 +-- !query SELECT cast('2017-12-11 09:30:00' as date) <> '1' FROM t --- !query 304 schema +-- !query schema struct<(NOT (CAST(2017-12-11 09:30:00 AS DATE) = CAST(1 AS DATE))):boolean> --- !query 304 output +-- !query output NULL --- !query 305 +-- !query SELECT abs('1') FROM t --- !query 305 schema +-- !query schema struct --- !query 305 output +-- !query output 1.0 --- !query 306 +-- !query SELECT sum('1') FROM t --- !query 306 schema +-- !query schema struct --- !query 306 output +-- !query output 1.0 --- !query 307 +-- !query SELECT avg('1') FROM t --- !query 307 schema +-- !query schema struct --- !query 307 output +-- !query output 1.0 --- !query 308 +-- !query SELECT stddev_pop('1') FROM t --- !query 308 schema +-- !query schema struct --- !query 308 output +-- !query output 0.0 --- !query 309 +-- !query SELECT stddev_samp('1') FROM t --- !query 309 schema +-- !query schema struct --- !query 309 output +-- !query output NaN --- !query 310 +-- !query SELECT - '1' FROM t --- !query 310 schema +-- !query schema struct<(- CAST(1 AS DOUBLE)):double> --- !query 310 output +-- !query output -1.0 --- !query 311 +-- !query SELECT + '1' FROM t --- !query 311 schema -struct<1:string> --- !query 311 output -1 +-- !query schema +struct<(+ CAST(1 
AS DOUBLE)):double> +-- !query output +1.0 --- !query 312 +-- !query SELECT var_pop('1') FROM t --- !query 312 schema +-- !query schema struct --- !query 312 output +-- !query output 0.0 --- !query 313 +-- !query SELECT var_samp('1') FROM t --- !query 313 schema +-- !query schema struct --- !query 313 output +-- !query output NaN --- !query 314 +-- !query SELECT skewness('1') FROM t --- !query 314 schema +-- !query schema struct --- !query 314 output +-- !query output NaN --- !query 315 +-- !query SELECT kurtosis('1') FROM t --- !query 315 schema +-- !query schema struct --- !query 315 output +-- !query output NaN diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out index 850cf9171a2fd..7b419c6702586 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -2,260 +2,260 @@ -- Number of queries: 32 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 'aa' as a --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query select cast(a as byte) from t --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL --- !query 2 +-- !query select cast(a as short) from t --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output NULL --- !query 3 +-- !query select cast(a as int) from t --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output NULL --- !query 4 +-- !query select cast(a as long) from t --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output NULL --- !query 5 +-- !query select cast(a as float) from t --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output NULL --- 
!query 6 +-- !query select cast(a as double) from t --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL --- !query 7 +-- !query select cast(a as decimal) from t --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL --- !query 8 +-- !query select cast(a as boolean) from t --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL --- !query 9 +-- !query select cast(a as timestamp) from t --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output NULL --- !query 10 +-- !query select cast(a as date) from t --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL --- !query 11 +-- !query select cast(a as binary) from t --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output aa --- !query 12 +-- !query select cast(a as array) from t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 't.`a`' due to data type mismatch: cannot cast string to array; line 1 pos 7 --- !query 13 +-- !query select cast(a as struct) from t --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 't.`a`' due to data type mismatch: cannot cast string to struct; line 1 pos 7 --- !query 14 +-- !query select cast(a as map) from t --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 't.`a`' due to data type mismatch: cannot cast string to map; line 1 pos 7 --- !query 15 +-- !query select to_timestamp(a) from t --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output NULL --- !query 16 +-- !query select to_timestamp('2018-01-01', a) from t --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output NULL --- 
!query 17 +-- !query select to_unix_timestamp(a) from t --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output NULL --- !query 18 +-- !query select to_unix_timestamp('2018-01-01', a) from t --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output NULL --- !query 19 +-- !query select unix_timestamp(a) from t --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output NULL --- !query 20 +-- !query select unix_timestamp('2018-01-01', a) from t --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL --- !query 21 +-- !query select from_unixtime(a) from t --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL --- !query 22 +-- !query select from_unixtime('2018-01-01', a) from t --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL --- !query 23 +-- !query select next_day(a, 'MO') from t --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output NULL --- !query 24 +-- !query select next_day('2018-01-01', a) from t --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output NULL --- !query 25 +-- !query select trunc(a, 'MM') from t --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output NULL --- !query 26 +-- !query select trunc('2018-01-01', a) from t --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output NULL --- !query 27 +-- !query select unhex('-123') --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output NULL --- !query 28 +-- !query select sha2(a, a) from t --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output NULL --- !query 29 +-- !query select get_json_object(a, a) from t --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output NULL --- !query 30 +-- !query select json_tuple(a, a) 
from t --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output NULL --- !query 31 +-- !query select from_json(a, 'a INT') from t --- !query 31 schema +-- !query schema struct> --- !query 31 output +-- !query output {"a":null} diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out index 20a9e47217238..89b1cdb3e353d 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out @@ -2,1304 +2,1304 @@ -- Number of queries: 145 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 2 --- !query 2 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 2 --- !query 3 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as int) FROM t --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 --- !query 4 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 2 --- !query 5 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as float) FROM t --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1.0 2.0 --- !query 6 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as double) FROM t --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1.0 
2.0 --- !query 7 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 2 --- !query 8 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as string) FROM t --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 2 --- !query 9 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. binary <> tinyint at the first column of the second table; --- !query 10 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> tinyint at the first column of the second table; --- !query 11 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> tinyint at the first column of the second table; --- !query 12 +-- !query SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> tinyint at the first column of the second table; --- !query 13 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 2 --- !query 14 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 2 --- !query 15 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as int) FROM t --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 2 --- !query 16 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1 2 --- !query 17 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as float) FROM t --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1.0 2.0 --- !query 18 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as double) FROM t --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1.0 2.0 --- !query 19 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 1 2 --- !query 20 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as string) FROM t --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1 2 --- !query 21 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> smallint at the first column of the second table; --- !query 22 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> smallint at the first column of the second table; --- !query 23 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> smallint at the first column of the second table; --- !query 24 +-- !query SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> smallint at the first column of the second table; --- !query 25 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 2 --- !query 26 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1 2 --- !query 27 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as int) FROM t --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 1 2 --- !query 28 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 1 2 --- !query 29 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as float) FROM t --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output 1.0 2.0 --- !query 30 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as double) FROM t --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1.0 2.0 --- !query 31 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 1 2 --- !query 32 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as string) FROM t --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 1 2 --- !query 33 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> int at the first column of the second table; --- !query 34 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> int at the first column of the second table; --- !query 35 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 35 schema +-- !query schema struct<> --- !query 35 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> int at the first column of the second table; --- !query 36 +-- !query SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> int at the first column of the second table; --- !query 37 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 1 2 --- !query 38 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 1 2 --- !query 39 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as int) FROM t --- !query 39 schema +-- !query schema struct --- !query 39 output +-- !query output 1 2 --- !query 40 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 1 2 --- !query 41 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as float) FROM t --- !query 41 schema +-- !query schema struct --- !query 41 output +-- !query output 1.0 2.0 --- !query 42 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as double) FROM t --- !query 42 schema +-- !query schema struct --- !query 42 output +-- !query output 1.0 2.0 --- !query 43 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 1 2 --- !query 44 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as string) FROM t --- !query 44 schema +-- !query schema struct --- !query 44 output +-- !query output 1 2 --- !query 45 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 45 schema +-- !query schema struct<> --- !query 45 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> bigint at the first column of the second table; --- !query 46 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 46 schema +-- !query schema struct<> --- !query 46 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> bigint at the first column of the second table; --- !query 47 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 47 schema +-- !query schema struct<> --- !query 47 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> bigint at the first column of the second table; --- !query 48 +-- !query SELECT cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 48 schema +-- !query schema struct<> --- !query 48 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> bigint at the first column of the second table; --- !query 49 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 49 schema +-- !query schema struct --- !query 49 output +-- !query output 1.0 2.0 --- !query 50 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 50 schema +-- !query schema struct --- !query 50 output +-- !query output 1.0 2.0 --- !query 51 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as int) FROM t --- !query 51 schema +-- !query schema struct --- !query 51 output +-- !query output 1.0 2.0 --- !query 52 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 52 schema +-- !query schema struct --- !query 52 output +-- !query output 1.0 2.0 --- !query 53 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as float) FROM t --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 1.0 2.0 --- !query 54 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as double) FROM t --- !query 54 schema +-- !query schema struct --- !query 54 output +-- !query output 1.0 2.0 --- !query 55 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 55 schema +-- !query schema struct --- !query 55 output +-- !query output 1.0 2.0 --- !query 56 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as string) FROM t --- !query 56 schema +-- !query schema struct --- !query 56 output +-- !query output 1.0 2 --- !query 57 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> float at the first column of the second table; --- !query 58 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> float at the first column of the second table; --- !query 59 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> float at the first column of the second table; --- !query 60 +-- !query SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> float at the first column of the second table; --- !query 61 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 61 schema +-- !query schema struct --- !query 61 output +-- !query output 1.0 2.0 --- !query 62 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 62 schema +-- !query schema struct --- !query 62 output +-- !query output 1.0 2.0 --- !query 63 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as int) FROM t --- !query 63 schema +-- !query schema struct --- !query 63 output +-- !query output 1.0 2.0 --- !query 64 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output 1.0 2.0 --- !query 65 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as float) FROM t --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output 1.0 2.0 --- !query 66 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as double) FROM t --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output 1.0 2.0 --- !query 67 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output 1.0 2.0 --- !query 68 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as string) FROM t --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output 1.0 2 --- !query 69 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 69 schema +-- !query schema struct<> --- !query 69 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> double at the first column of the second table; --- !query 70 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 70 schema +-- !query schema struct<> --- !query 70 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> double at the first column of the second table; --- !query 71 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 71 schema +-- !query schema struct<> --- !query 71 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> double at the first column of the second table; --- !query 72 +-- !query SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 72 schema +-- !query schema struct<> --- !query 72 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> double at the first column of the second table; --- !query 73 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output 1 2 --- !query 74 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 74 schema +-- !query schema struct --- !query 74 output +-- !query output 1 2 --- !query 75 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as int) FROM t --- !query 75 schema +-- !query schema struct --- !query 75 output +-- !query output 1 2 --- !query 76 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 76 schema +-- !query schema struct --- !query 76 output +-- !query output 1 2 --- !query 77 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as float) FROM t --- !query 77 schema +-- !query schema struct --- !query 77 output +-- !query output 1.0 2.0 --- !query 78 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as double) FROM t --- !query 78 schema +-- !query schema struct --- !query 78 output +-- !query output 1.0 2.0 --- !query 79 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 79 schema +-- !query schema struct --- !query 79 output +-- !query output 1 2 --- !query 80 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as string) FROM t --- !query 80 schema +-- !query schema struct --- !query 80 output +-- !query output 1 2 --- !query 81 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 81 schema +-- !query schema struct<> --- !query 81 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> decimal(10,0) at the first column of the second table; --- !query 82 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 82 schema +-- !query schema struct<> --- !query 82 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> decimal(10,0) at the first column of the second table; --- !query 83 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 83 schema +-- !query schema struct<> --- !query 83 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> decimal(10,0) at the first column of the second table; --- !query 84 +-- !query SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 84 schema +-- !query schema struct<> --- !query 84 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
date <> decimal(10,0) at the first column of the second table; --- !query 85 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 85 schema +-- !query schema struct --- !query 85 output +-- !query output 1 2 --- !query 86 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 86 schema +-- !query schema struct --- !query 86 output +-- !query output 1 2 --- !query 87 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as int) FROM t --- !query 87 schema +-- !query schema struct --- !query 87 output +-- !query output 1 2 --- !query 88 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 88 schema +-- !query schema struct --- !query 88 output +-- !query output 1 2 --- !query 89 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as float) FROM t --- !query 89 schema +-- !query schema struct --- !query 89 output +-- !query output 1 2.0 --- !query 90 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as double) FROM t --- !query 90 schema +-- !query schema struct --- !query 90 output +-- !query output 1 2.0 --- !query 91 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 91 schema +-- !query schema struct --- !query 91 output +-- !query output 1 2 --- !query 92 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as string) FROM t --- !query 92 schema +-- !query schema struct --- !query 92 output +-- !query output 1 2 --- !query 93 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 93 schema +-- !query schema struct<> --- !query 93 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> string at the first column of the second table; --- !query 94 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 94 schema +-- !query schema struct<> --- !query 94 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> string at the first column of the second table; --- !query 95 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 95 schema +-- !query schema struct --- !query 95 output +-- !query output 1 2017-12-11 09:30:00 --- !query 96 +-- !query SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 96 schema +-- !query schema struct --- !query 96 output +-- !query output 1 2017-12-11 --- !query 97 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. tinyint <> binary at the first column of the second table; --- !query 98 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. smallint <> binary at the first column of the second table; --- !query 99 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as int) FROM t --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
int <> binary at the first column of the second table; --- !query 100 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. bigint <> binary at the first column of the second table; --- !query 101 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as float) FROM t --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. float <> binary at the first column of the second table; --- !query 102 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as double) FROM t --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. double <> binary at the first column of the second table; --- !query 103 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. decimal(10,0) <> binary at the first column of the second table; --- !query 104 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as string) FROM t --- !query 104 schema +-- !query schema struct<> --- !query 104 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
string <> binary at the first column of the second table; --- !query 105 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 105 schema +-- !query schema struct --- !query 105 output +-- !query output 1 2 --- !query 106 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 106 schema +-- !query schema struct<> --- !query 106 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> binary at the first column of the second table; --- !query 107 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 107 schema +-- !query schema struct<> --- !query 107 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. timestamp <> binary at the first column of the second table; --- !query 108 +-- !query SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. date <> binary at the first column of the second table; --- !query 109 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
tinyint <> boolean at the first column of the second table; --- !query 110 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. smallint <> boolean at the first column of the second table; --- !query 111 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as int) FROM t --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. int <> boolean at the first column of the second table; --- !query 112 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. bigint <> boolean at the first column of the second table; --- !query 113 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as float) FROM t --- !query 113 schema +-- !query schema struct<> --- !query 113 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. float <> boolean at the first column of the second table; --- !query 114 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as double) FROM t --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
double <> boolean at the first column of the second table; --- !query 115 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. decimal(10,0) <> boolean at the first column of the second table; --- !query 116 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as string) FROM t --- !query 116 schema +-- !query schema struct<> --- !query 116 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. string <> boolean at the first column of the second table; --- !query 117 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. binary <> boolean at the first column of the second table; --- !query 118 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 118 schema +-- !query schema struct --- !query 118 output +-- !query output true --- !query 119 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 119 schema +-- !query schema struct<> --- !query 119 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
timestamp <> boolean at the first column of the second table; --- !query 120 +-- !query SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. date <> boolean at the first column of the second table; --- !query 121 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. tinyint <> timestamp at the first column of the second table; --- !query 122 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. smallint <> timestamp at the first column of the second table; --- !query 123 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as int) FROM t --- !query 123 schema +-- !query schema struct<> --- !query 123 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. int <> timestamp at the first column of the second table; --- !query 124 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
bigint <> timestamp at the first column of the second table; --- !query 125 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as float) FROM t --- !query 125 schema +-- !query schema struct<> --- !query 125 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. float <> timestamp at the first column of the second table; --- !query 126 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as double) FROM t --- !query 126 schema +-- !query schema struct<> --- !query 126 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. double <> timestamp at the first column of the second table; --- !query 127 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. decimal(10,0) <> timestamp at the first column of the second table; --- !query 128 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as string) FROM t --- !query 128 schema +-- !query schema struct --- !query 128 output +-- !query output 2 2017-12-12 09:30:00 --- !query 129 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 129 schema +-- !query schema struct<> --- !query 129 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
binary <> timestamp at the first column of the second table; --- !query 130 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 130 schema +-- !query schema struct<> --- !query 130 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> timestamp at the first column of the second table; --- !query 131 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 131 schema +-- !query schema struct --- !query 131 output +-- !query output 2017-12-11 09:30:00 2017-12-12 09:30:00 --- !query 132 +-- !query SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 132 schema +-- !query schema struct --- !query 132 output +-- !query output 2017-12-11 00:00:00 2017-12-12 09:30:00 --- !query 133 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as tinyint) FROM t --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. tinyint <> date at the first column of the second table; --- !query 134 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as smallint) FROM t --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
smallint <> date at the first column of the second table; --- !query 135 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as int) FROM t --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. int <> date at the first column of the second table; --- !query 136 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as bigint) FROM t --- !query 136 schema +-- !query schema struct<> --- !query 136 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. bigint <> date at the first column of the second table; --- !query 137 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as float) FROM t --- !query 137 schema +-- !query schema struct<> --- !query 137 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. float <> date at the first column of the second table; --- !query 138 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as double) FROM t --- !query 138 schema +-- !query schema struct<> --- !query 138 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. double <> date at the first column of the second table; --- !query 139 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t --- !query 139 schema +-- !query schema struct<> --- !query 139 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. 
decimal(10,0) <> date at the first column of the second table; --- !query 140 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as string) FROM t --- !query 140 schema +-- !query schema struct --- !query 140 output +-- !query output 2 2017-12-12 --- !query 141 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2' as binary) FROM t --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. binary <> date at the first column of the second table; --- !query 142 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as boolean) FROM t --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output org.apache.spark.sql.AnalysisException Union can only be performed on tables with the compatible column types. boolean <> date at the first column of the second table; --- !query 143 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t --- !query 143 schema +-- !query schema struct --- !query 143 output +-- !query output 2017-12-11 09:30:00 2017-12-12 00:00:00 --- !query 144 +-- !query SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t --- !query 144 schema +-- !query schema struct --- !query 144 output +-- !query output 2017-12-11 2017-12-12 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out index 5b77bf9f35f25..12af1b7d034da 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out @@ -2,205 +2,205 @@ -- 
Number of queries: 25 --- !query 0 +-- !query CREATE TEMPORARY VIEW t AS SELECT 1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint)) FROM t --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 --- !query 2 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint)) FROM t --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 --- !query 3 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int)) FROM t --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint)) FROM t --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 --- !query 5 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float)) FROM t --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 --- !query 6 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double)) FROM t --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 --- !query 7 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0))) FROM t --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 --- !query 8 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string)) FROM t --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 --- !query 9 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary)) FROM t --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 --- !query 10 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean)) FROM t --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 --- 
!query 11 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp)) FROM t --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 --- !query 12 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date)) FROM t --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 1 --- !query 13 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 --- !query 14 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 --- !query 15 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 --- !query 16 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 1 --- !query 17 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 --- !query 18 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 --- !query 19 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0)) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 19 schema +-- !query schema struct --- 
!query 19 output +-- !query output 1 --- !query 20 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'string' does not match the expected data type '(numeric or interval)'.; line 1 pos 21 --- !query 21 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BINARY) FOLLOWING' due to data type mismatch: The data type of the upper bound 'binary' does not match the expected data type '(numeric or interval)'.; line 1 pos 21 --- !query 22 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'boolean' does not match the expected data type '(numeric or interval)'.; line 1 pos 21 --- !query 23 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 
'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 21 --- !query 24 +-- !query SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1 diff --git a/sql/core/src/test/resources/sql-tests/results/udaf.sql.out b/sql/core/src/test/resources/sql-tests/results/udaf.sql.out index f4455bb717578..9f4229a11b65d 100644 --- a/sql/core/src/test/resources/sql-tests/results/udaf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udaf.sql.out @@ -2,69 +2,69 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1), (2), (3), (4) as t1(int_col1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE FUNCTION myDoubleAvg AS 'test.org.apache.spark.sql.MyDoubleAvg' --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT default.myDoubleAvg(int_col1) as my_avg from t1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 102.5 --- !query 3 +-- !query SELECT default.myDoubleAvg(int_col1, 3) as my_avg from t1 --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException Invalid number of arguments for function default.myDoubleAvg. 
Expected: 1; Found: 2; line 1 pos 7 --- !query 4 +-- !query CREATE FUNCTION udaf1 AS 'test.non.existent.udaf' --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT default.udaf1(int_col1) as udaf1 from t1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Can not load class 'test.non.existent.udaf' when registering the function 'default.udaf1', please make sure it is on the classpath; line 1 pos 7 --- !query 6 +-- !query DROP FUNCTION myDoubleAvg --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query DROP FUNCTION udaf1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out deleted file mode 100644 index ad2f1bdf77d7a..0000000000000 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part2.sql.out +++ /dev/null @@ -1,156 +0,0 @@ --- Automatically generated by SQLQueryTestSuite --- Number of queries: 16 - - --- !query 0 -create temporary view int4_tbl as select * from values - (0), - (123456), - (-123456), - (2147483647), - (-2147483647) - as int4_tbl(f1) --- !query 0 schema -struct<> --- !query 0 output - - - --- !query 1 -SELECT - (NULL AND NULL) IS NULL AS `t`, - (TRUE AND NULL) IS NULL AS `t`, - (FALSE AND NULL) IS NULL AS `t`, - (NULL AND TRUE) IS NULL AS `t`, - (NULL AND FALSE) IS NULL AS `t`, - (TRUE AND TRUE) AS `t`, - NOT (TRUE AND FALSE) AS `t`, - NOT (FALSE AND TRUE) AS `t`, - NOT (FALSE AND FALSE) AS `t` --- !query 1 schema -struct --- !query 1 output -true true false true false true true true true - - --- !query 2 -SELECT - (NULL OR NULL) IS NULL AS `t`, - (TRUE OR NULL) IS NULL AS `t`, - (FALSE OR NULL) IS NULL AS `t`, - (NULL OR 
TRUE) IS NULL AS `t`, - (NULL OR FALSE) IS NULL AS `t`, - (TRUE OR TRUE) AS `t`, - (TRUE OR FALSE) AS `t`, - (FALSE OR TRUE) AS `t`, - NOT (FALSE OR FALSE) AS `t` --- !query 2 schema -struct --- !query 2 output -true false true false true true true true true - - --- !query 3 -select min(udf(unique1)) from tenk1 --- !query 3 schema -struct --- !query 3 output -0 - - --- !query 4 -select udf(max(unique1)) from tenk1 --- !query 4 schema -struct --- !query 4 output -9999 - - --- !query 5 -select max(unique1) from tenk1 where udf(unique1) < 42 --- !query 5 schema -struct --- !query 5 output -41 - - --- !query 6 -select max(unique1) from tenk1 where unique1 > udf(42) --- !query 6 schema -struct --- !query 6 output -9999 - - --- !query 7 -select max(unique1) from tenk1 where udf(unique1) > 42000 --- !query 7 schema -struct --- !query 7 output -NULL - - --- !query 8 -select max(tenthous) from tenk1 where udf(thousand) = 33 --- !query 8 schema -struct --- !query 8 output -9033 - - --- !query 9 -select min(tenthous) from tenk1 where udf(thousand) = 33 --- !query 9 schema -struct --- !query 9 output -33 - - --- !query 10 -select distinct max(udf(unique2)) from tenk1 --- !query 10 schema -struct --- !query 10 output -9999 - - --- !query 11 -select max(unique2) from tenk1 order by udf(1) --- !query 11 schema -struct --- !query 11 output -9999 - - --- !query 12 -select max(unique2) from tenk1 order by max(udf(unique2)) --- !query 12 schema -struct --- !query 12 output -9999 - - --- !query 13 -select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1 --- !query 13 schema -struct --- !query 13 output -9999 - - --- !query 14 -select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc --- !query 14 schema -struct --- !query 14 output -9999 3 -9999 2 -9999 1 - - --- !query 15 -select udf(max(100)) from tenk1 --- !query 15 schema -struct --- !query 15 output -100 diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out similarity index 77% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out index a2f64717d73a1..d65c56774eafd 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out @@ -2,321 +2,321 @@ -- Number of queries: 43 --- !query 0 +-- !query SELECT avg(udf(four)) AS avg_1 FROM onek --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1.5 --- !query 1 +-- !query SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 32.666666666666664 --- !query 2 +-- !query select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 107.943 --- !query 3 +-- !query SELECT sum(udf(four)) AS sum_1500 FROM onek --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1500 --- !query 4 +-- !query SELECT udf(sum(a)) AS sum_198 FROM aggtest --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 198 --- !query 5 +-- !query SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 431.77260909229517 --- !query 6 +-- !query SELECT udf(max(four)) AS max_3 FROM onek --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 3 --- !query 7 +-- !query SELECT max(udf(a)) AS max_100 FROM aggtest --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 100 --- !query 8 +-- 
!query SELECT udf(udf(max(aggtest.b))) AS max_324_78 FROM aggtest --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 324.78 --- !query 9 +-- !query SELECT stddev_pop(udf(b)) FROM aggtest --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 131.10703231895047 --- !query 10 +-- !query SELECT udf(stddev_samp(b)) FROM aggtest --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 151.38936080399804 --- !query 11 +-- !query SELECT var_pop(udf(b)) FROM aggtest --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 17189.053923482323 --- !query 12 +-- !query SELECT udf(var_samp(b)) FROM aggtest --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 22918.738564643096 --- !query 13 +-- !query SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 131.18117242958306 --- !query 14 +-- !query SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 151.47497042966097 --- !query 15 +-- !query SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 17208.5 --- !query 16 +-- !query SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 22944.666666666668 --- !query 17 +-- !query SELECT udf(var_pop(1.0)), var_samp(udf(2.0)) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 0.0 NaN --- !query 18 +-- !query SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0))) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 0.0 NaN --- !query 19 +-- !query select sum(udf(CAST(null 
AS int))) from range(1,4) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output NULL --- !query 20 +-- !query select sum(udf(CAST(null AS long))) from range(1,4) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL --- !query 21 +-- !query select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL --- !query 22 +-- !query select sum(udf(CAST(null AS DOUBLE))) from range(1,4) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL --- !query 23 +-- !query select avg(udf(CAST(null AS int))) from range(1,4) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output NULL --- !query 24 +-- !query select avg(udf(CAST(null AS long))) from range(1,4) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output NULL --- !query 25 +-- !query select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output NULL --- !query 26 +-- !query select avg(udf(CAST(null AS DOUBLE))) from range(1,4) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output NULL --- !query 27 +-- !query select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output NaN --- !query 28 +-- !query select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output NaN --- !query 29 +-- !query SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), ('1')) v(x) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output Infinity NaN --- !query 30 +-- !query SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('Infinity'), 
('Infinity')) v(x) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output Infinity NaN --- !query 31 +-- !query SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE)) FROM (VALUES ('-Infinity'), ('Infinity')) v(x) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output NaN NaN --- !query 32 +-- !query SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 1.00000005E8 2.5 --- !query 33 +-- !query SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE))) FROM (VALUES (7000000000005), (7000000000007)) v(x) --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output 7.000000000006E12 1.0 --- !query 34 +-- !query SELECT udf(covar_pop(b, udf(a))), covar_samp(udf(b), a) FROM aggtest --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 653.6289553875104 871.5052738500139 --- !query 35 +-- !query SELECT corr(b, udf(a)) FROM aggtest --- !query 35 schema +-- !query schema struct --- !query 35 output +-- !query output 0.1396345165178734 --- !query 36 +-- !query SELECT count(udf(four)) AS cnt_1000 FROM onek --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 1000 --- !query 37 +-- !query SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 4 --- !query 38 +-- !query select ten, udf(count(*)), sum(udf(four)) from onek group by ten order by ten --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output 0 100 100 1 100 200 2 100 100 @@ -329,12 +329,12 @@ struct --- !query 39 output +-- !query output 0 100 2 1 100 4 2 100 2 @@ -347,13 +347,13 @@ struct --- !query 40 output +-- !query output 0 2 2 2 4 2 @@ -361,14 +361,14 @@ struct --- 
!query 41 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -376,12 +376,12 @@ Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))]; --- !query 42 +-- !query select (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))) from tenk1 o --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 67 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out new file mode 100644 index 0000000000000..c10fe9b51dd72 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part2.sql.out @@ -0,0 +1,264 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 25 + + +-- !query +create temporary view int4_tbl as select * from values + (0), + (123456), + (-123456), + (2147483647), + (-2147483647) + as int4_tbl(f1) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW bitwise_test AS SELECT * FROM VALUES + (1, 1, 1, 1L), + (3, 3, 3, null), + (7, 7, 7, 3L) AS bitwise_test(b1, b2, b3, b4) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT BIT_AND(b1) AS n1, BIT_OR(b2) AS n2 FROM bitwise_test where 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query +SELECT BIT_AND(b4) AS n1, BIT_OR(b4) AS n2 FROM bitwise_test where b4 is null +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query 
+SELECT + BIT_AND(cast(b1 as tinyint)) AS a1, + BIT_AND(cast(b2 as smallint)) AS b1, + BIT_AND(b3) AS c1, + BIT_AND(b4) AS d1, + BIT_OR(cast(b1 as tinyint)) AS e7, + BIT_OR(cast(b2 as smallint)) AS f7, + BIT_OR(b3) AS g7, + BIT_OR(b4) AS h3 +FROM bitwise_test +-- !query schema +struct +-- !query output +1 1 1 1 7 7 7 3 + + +-- !query +SELECT + (NULL AND NULL) IS NULL AS `t`, + (TRUE AND NULL) IS NULL AS `t`, + (FALSE AND NULL) IS NULL AS `t`, + (NULL AND TRUE) IS NULL AS `t`, + (NULL AND FALSE) IS NULL AS `t`, + (TRUE AND TRUE) AS `t`, + NOT (TRUE AND FALSE) AS `t`, + NOT (FALSE AND TRUE) AS `t`, + NOT (FALSE AND FALSE) AS `t` +-- !query schema +struct +-- !query output +true true false true false true true true true + + +-- !query +SELECT + (NULL OR NULL) IS NULL AS `t`, + (TRUE OR NULL) IS NULL AS `t`, + (FALSE OR NULL) IS NULL AS `t`, + (NULL OR TRUE) IS NULL AS `t`, + (NULL OR FALSE) IS NULL AS `t`, + (TRUE OR TRUE) AS `t`, + (TRUE OR FALSE) AS `t`, + (FALSE OR TRUE) AS `t`, + NOT (FALSE OR FALSE) AS `t` +-- !query schema +struct +-- !query output +true false true false true true true true true + + +-- !query +CREATE OR REPLACE TEMPORARY VIEW bool_test AS SELECT * FROM VALUES + (TRUE, null, FALSE, null), + (FALSE, TRUE, null, null), + (null, TRUE, FALSE, null) AS bool_test(b1, b2, b3, b4) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT BOOL_AND(b1) AS n1, BOOL_OR(b3) AS n2 FROM bool_test WHERE 1 = 0 +-- !query schema +struct +-- !query output +NULL NULL + + +-- !query +SELECT + BOOL_AND(b1) AS f1, + BOOL_AND(b2) AS t2, + BOOL_AND(b3) AS f3, + BOOL_AND(b4) AS n4, + BOOL_AND(NOT b2) AS f5, + BOOL_AND(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +false true false NULL false true + + +-- !query +SELECT + EVERY(b1) AS f1, + EVERY(b2) AS t2, + EVERY(b3) AS f3, + EVERY(b4) AS n4, + EVERY(NOT b2) AS f5, + EVERY(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +false true false NULL false true 
+ + +-- !query +SELECT + BOOL_OR(b1) AS t1, + BOOL_OR(b2) AS t2, + BOOL_OR(b3) AS f3, + BOOL_OR(b4) AS n4, + BOOL_OR(NOT b2) AS f5, + BOOL_OR(NOT b3) AS t6 +FROM bool_test +-- !query schema +struct +-- !query output +true true false NULL false true + + +-- !query +select min(udf(unique1)) from tenk1 +-- !query schema +struct +-- !query output +0 + + +-- !query +select udf(max(unique1)) from tenk1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique1) from tenk1 where udf(unique1) < 42 +-- !query schema +struct +-- !query output +41 + + +-- !query +select max(unique1) from tenk1 where unique1 > udf(42) +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique1) from tenk1 where udf(unique1) > 42000 +-- !query schema +struct +-- !query output +NULL + + +-- !query +select max(tenthous) from tenk1 where udf(thousand) = 33 +-- !query schema +struct +-- !query output +9033 + + +-- !query +select min(tenthous) from tenk1 where udf(thousand) = 33 +-- !query schema +struct +-- !query output +33 + + +-- !query +select distinct max(udf(unique2)) from tenk1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique2) from tenk1 order by udf(1) +-- !query schema +struct +-- !query output +9999 + + +-- !query +select max(unique2) from tenk1 order by max(udf(unique2)) +-- !query schema +struct +-- !query output +9999 + + +-- !query +select udf(max(udf(unique2))) from tenk1 order by udf(max(unique2))+1 +-- !query schema +struct +-- !query output +9999 + + +-- !query +select t1.max_unique2, udf(g) from (select max(udf(unique2)) as max_unique2 FROM tenk1) t1 LATERAL VIEW explode(array(1,2,3)) t2 AS g order by g desc +-- !query schema +struct +-- !query output +9999 3 +9999 2 +9999 1 + + +-- !query +select udf(max(100)) from tenk1 +-- !query schema +struct +-- !query output +100 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part3.sql.out 
b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part3.sql.out similarity index 81% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part3.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part3.sql.out index eff33f280cff4..f491d9b9ba3a8 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part3.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part3.sql.out @@ -2,21 +2,21 @@ -- Number of queries: 2 --- !query 0 +-- !query select udf(max(min(unique1))) from tenk1 --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output org.apache.spark.sql.AnalysisException It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; --- !query 1 +-- !query select udf((select udf(count(*)) from (values (1)) t0(inner_c))) as col from (values (2),(3)) t1(outer_c) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 1 1 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part4.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out similarity index 100% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part4.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part4.sql.out diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out old mode 100644 new mode 100755 similarity index 67% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out index 44a764ce4e6dd..04c4f54b02a3e --- 
a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-case.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-case.sql.out @@ -2,243 +2,243 @@ -- Number of queries: 35 --- !query 0 +-- !query CREATE TABLE CASE_TBL ( i integer, f double ) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TABLE CASE2_TBL ( i integer, j integer ) USING parquet --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO CASE_TBL VALUES (1, 10.1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO CASE_TBL VALUES (2, 20.2) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO CASE_TBL VALUES (3, -30.3) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO CASE_TBL VALUES (4, NULL) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO CASE2_TBL VALUES (1, -1) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO CASE2_TBL VALUES (2, -2) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO CASE2_TBL VALUES (3, -3) --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO CASE2_TBL VALUES (2, -4) --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO CASE2_TBL VALUES (1, NULL) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query INSERT INTO CASE2_TBL VALUES (NULL, -6) --- !query 11 schema +-- !query schema struct<> --- !query 11 
output +-- !query output --- !query 12 +-- !query SELECT '3' AS `One`, CASE WHEN udf(1 < 2) THEN 3 END AS `Simple WHEN` --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 3 3 --- !query 13 +-- !query SELECT '' AS `One`, CASE WHEN 1 > 2 THEN udf(3) END AS `Simple default` --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output NULL --- !query 14 +-- !query SELECT '3' AS `One`, CASE WHEN udf(1) < 2 THEN udf(3) ELSE udf(4) END AS `Simple ELSE` --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 3 3 --- !query 15 +-- !query SELECT udf('4') AS `One`, CASE WHEN 1 > 2 THEN 3 ELSE 4 END AS `ELSE default` --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 4 4 --- !query 16 +-- !query SELECT udf('6') AS `One`, CASE WHEN udf(1 > 2) THEN 3 WHEN udf(4) < 5 THEN 6 ELSE 7 END AS `Two WHEN with default` --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 6 6 --- !query 17 +-- !query SELECT '7' AS `None`, CASE WHEN rand() < udf(0) THEN 1 END AS `NULL on no matches` --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 7 NULL --- !query 18 +-- !query SELECT CASE WHEN udf(1=0) THEN 1/0 WHEN 1=1 THEN 1 ELSE 2/0 END --- !query 18 schema -struct --- !query 18 output -1 +-- !query schema +struct +-- !query output +1.0 --- !query 19 +-- !query SELECT CASE 1 WHEN 0 THEN 1/udf(0) WHEN 1 THEN 1 ELSE 2/0 END --- !query 19 schema -struct --- !query 19 output -1 +-- !query schema +struct +-- !query output +1.0 --- !query 20 +-- !query SELECT CASE WHEN i > 100 THEN udf(1/0) ELSE udf(0) END FROM case_tbl --- !query 20 schema -struct 100) THEN CAST(udf(cast((1 div 0) as string)) AS INT) ELSE CAST(udf(cast(0 as string)) AS INT) END:int> --- !query 20 output -0 -0 -0 -0 +-- !query schema +struct 100) THEN CAST(udf(cast((cast(1 as double) / cast(0 as double)) as string)) AS DOUBLE) ELSE CAST(CAST(udf(cast(0 as 
string)) AS INT) AS DOUBLE) END:double> +-- !query output +0.0 +0.0 +0.0 +0.0 --- !query 21 +-- !query SELECT CASE 'a' WHEN 'a' THEN udf(1) ELSE udf(2) END --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 --- !query 22 +-- !query SELECT '' AS `Five`, CASE WHEN i >= 3 THEN i END AS `>= 3 or Null` FROM CASE_TBL --- !query 22 schema +-- !query schema struct= 3 or Null:int> --- !query 22 output +-- !query output 3 4 NULL NULL --- !query 23 +-- !query SELECT '' AS `Five`, CASE WHEN i >= 3 THEN (i + i) ELSE i END AS `Simplest Math` FROM CASE_TBL --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 1 2 6 8 --- !query 24 +-- !query SELECT '' AS `Five`, i AS `Value`, CASE WHEN (i < 0) THEN 'small' WHEN (i = 0) THEN 'zero' @@ -247,16 +247,16 @@ SELECT '' AS `Five`, i AS `Value`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 1 one 2 two 3 big 4 big --- !query 25 +-- !query SELECT '' AS `Five`, CASE WHEN ((i < 0) or (i < 0)) THEN 'small' WHEN ((i = 0) or (i = 0)) THEN 'zero' @@ -265,37 +265,37 @@ SELECT '' AS `Five`, ELSE 'big' END AS `Category` FROM CASE_TBL --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output big big one two --- !query 26 +-- !query SELECT * FROM CASE_TBL WHERE udf(COALESCE(f,i)) = 4 --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 4 NULL --- !query 27 +-- !query SELECT * FROM CASE_TBL WHERE udf(NULLIF(f,i)) = 2 --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output --- !query 28 +-- !query SELECT udf(COALESCE(a.f, b.i, b.j)) FROM CASE_TBL a, CASE2_TBL b --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output -30.3 -30.3 -30.3 @@ -322,24 +322,24 @@ struct --- !query 29 output +-- !query output 4 NULL 2 -2 4 NULL 2 -4 --- !query 30 +-- !query SELECT udf('') AS Five, NULLIF(a.i,b.i) AS 
`NULLIF(a.i,b.i)`, NULLIF(b.i, 4) AS `NULLIF(b.i,4)` FROM CASE_TBL a, CASE2_TBL b --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1 2 1 2 1 3 @@ -366,18 +366,18 @@ struct NULL 3 --- !query 31 +-- !query SELECT '' AS `Two`, * FROM CASE_TBL a, CASE2_TBL b WHERE udf(COALESCE(f,b.i) = 2) --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output 4 NULL 2 -2 4 NULL 2 -4 --- !query 32 +-- !query SELECT CASE (CASE vol('bar') WHEN udf('foo') THEN 'it was foo!' @@ -387,23 +387,23 @@ SELECT CASE WHEN udf('it was foo!') THEN 'foo recognized' WHEN 'it was bar!' THEN udf('bar recognized') ELSE 'unrecognized' END AS col --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output bar recognized --- !query 33 +-- !query DROP TABLE CASE_TBL --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output --- !query 34 +-- !query DROP TABLE CASE2_TBL --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out similarity index 86% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-join.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out index 6fcff129d7568..f113aee6d3b51 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-join.sql.out @@ -2,17 +2,17 @@ -- Number of queries: 185 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT4_TBL AS SELECT * FROM (VALUES (0), (123456), (-123456), (2147483647), (-2147483647)) AS v(f1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (VALUES (123, 456), @@ 
-21,230 +21,230 @@ CREATE OR REPLACE TEMPORARY VIEW INT8_TBL AS SELECT * FROM (4567890123456789, 4567890123456789), (4567890123456789, -4567890123456789)) AS v(q1, q2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE OR REPLACE TEMPORARY VIEW FLOAT8_TBL AS SELECT * FROM (VALUES (0.0), (1004.30), (-34.84), (cast('1.2345678901234e+200' as double)), (cast('1.2345678901234e-200' as double))) AS v(f1) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE OR REPLACE TEMPORARY VIEW TEXT_TBL AS SELECT * FROM (VALUES ('doh!'), ('hi de ho neighbor')) AS v(f1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE OR REPLACE TEMPORARY VIEW tenk2 AS SELECT * FROM tenk1 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TABLE J1_TBL ( i integer, j integer, t string ) USING parquet --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE TABLE J2_TBL ( i integer, k integer ) USING parquet --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO J1_TBL VALUES (1, 4, 'one') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO J1_TBL VALUES (2, 3, 'two') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO J1_TBL VALUES (3, 2, 'three') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO J1_TBL VALUES (4, 1, 'four') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query INSERT INTO J1_TBL VALUES (5, 0, 'five') --- !query 11 schema +-- !query schema 
struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query INSERT INTO J1_TBL VALUES (6, 6, 'six') --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query INSERT INTO J1_TBL VALUES (7, 7, 'seven') --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query INSERT INTO J1_TBL VALUES (8, 8, 'eight') --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query INSERT INTO J1_TBL VALUES (0, NULL, 'zero') --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output --- !query 16 +-- !query INSERT INTO J1_TBL VALUES (NULL, NULL, 'null') --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output --- !query 17 +-- !query INSERT INTO J1_TBL VALUES (NULL, 0, 'zero') --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output --- !query 18 +-- !query INSERT INTO J2_TBL VALUES (1, -1) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query INSERT INTO J2_TBL VALUES (2, 2) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output --- !query 20 +-- !query INSERT INTO J2_TBL VALUES (3, -3) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query INSERT INTO J2_TBL VALUES (2, 4) --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output --- !query 22 +-- !query INSERT INTO J2_TBL VALUES (5, -5) --- !query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output --- !query 23 +-- !query INSERT INTO J2_TBL VALUES (5, -5) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query INSERT INTO J2_TBL VALUES (0, NULL) --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query 
output --- !query 25 +-- !query INSERT INTO J2_TBL VALUES (NULL, NULL) --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query INSERT INTO J2_TBL VALUES (NULL, 0) --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output --- !query 27 +-- !query SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t) FROM J1_TBL AS tx --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -258,12 +258,12 @@ struct --- !query 28 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -277,12 +277,12 @@ struct --- !query 29 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -296,12 +296,12 @@ struct --- !query 30 output +-- !query output 0 NULL zero 1 4 one 2 3 two @@ -315,12 +315,12 @@ struct --- !query 31 output +-- !query output 0 NULL zero 0 NULL 0 NULL zero 1 -1 0 NULL zero 2 2 @@ -422,12 +422,12 @@ struct --- !query 32 output +-- !query output 0 NULL zero 0 NULL 0 NULL zero 1 -1 0 NULL zero 2 2 @@ -529,22 +529,22 @@ struct NULL NULL null NULL NULL --- !query 33 +-- !query SELECT udf('') AS `xxx`, udf(i) AS i, udf(k), udf(t) AS t FROM J1_TBL CROSS JOIN J2_TBL --- !query 33 schema +-- !query schema struct<> --- !query 33 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'i' is ambiguous, could be: default.j1_tbl.i, default.j2_tbl.i.; line 1 pos 29 --- !query 34 +-- !query SELECT udf('') AS `xxx`, udf(t1.i) AS i, udf(k), udf(t) FROM J1_TBL t1 CROSS JOIN J2_TBL t2 --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 0 -1 zero 0 -3 zero 0 -5 zero @@ -646,13 +646,13 @@ struct --- !query 35 output +-- !query output 0 zero -1 0 zero -3 0 zero -5 @@ -754,12 +754,12 @@ struct --- !query 36 output +-- !query output 0 NULL zero 0 NULL 0 NULL 0 NULL zero 0 NULL 1 -1 0 NULL zero 0 NULL 2 2 @@ -1653,12 +1653,12 @@ struct --- !query 37 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 
@@ -1668,12 +1668,12 @@ struct --- !query 38 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1683,13 +1683,13 @@ struct --- !query 39 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1699,12 +1699,12 @@ struct 5 0 five -5 --- !query 40 +-- !query SELECT udf(udf('')) AS `xxx`, udf(i), udf(j), udf(t), udf(k) FROM J1_TBL NATURAL JOIN J2_TBL --- !query 40 schema +-- !query schema struct --- !query 40 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1714,12 +1714,12 @@ struct --- !query 41 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1729,23 +1729,23 @@ struct --- !query 42 output +-- !query output 0 NULL zero NULL 2 3 two 2 4 1 four 2 --- !query 43 +-- !query SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(udf(J1_TBL.j)), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k) FROM J1_TBL JOIN J2_TBL ON (udf(J1_TBL.i) = J2_TBL.i) --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 0 NULL zero 0 NULL 1 4 one 1 -1 2 3 two 2 2 @@ -1755,23 +1755,23 @@ struct --- !query 44 output +-- !query output 0 NULL zero NULL 0 2 3 two 2 2 4 1 four 2 4 --- !query 45 +-- !query SELECT udf('') AS `xxx`, udf(J1_TBL.i), udf(J1_TBL.j), udf(J1_TBL.t), udf(J2_TBL.i), udf(J2_TBL.k) FROM J1_TBL JOIN J2_TBL ON (udf(J1_TBL.i) <= udf(udf(J2_TBL.k))) --- !query 45 schema +-- !query schema struct --- !query 45 output +-- !query output 0 NULL zero 2 2 0 NULL zero 2 4 0 NULL zero NULL 0 @@ -1783,13 +1783,13 @@ struct --- !query 46 output +-- !query output NULL NULL null NULL NULL 0 zero NULL 0 NULL zero NULL @@ -1805,13 +1805,13 @@ struct --- !query 47 output +-- !query output NULL NULL null NULL NULL 0 zero NULL 0 NULL zero NULL @@ -1827,12 +1827,12 @@ struct --- !query 48 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1844,12 +1844,12 @@ struct --- !query 49 output +-- !query output 0 NULL zero NULL 1 4 one -1 2 3 two 2 @@ -1861,13 +1861,13 @@ struct --- !query 50 output +-- !query 
output NULL NULL NULL NULL NULL NULL null NULL NULL 0 zero NULL @@ -1885,13 +1885,13 @@ struct --- !query 51 output +-- !query output NULL NULL NULL NULL NULL NULL null NULL NULL 0 zero NULL @@ -1909,226 +1909,226 @@ struct --- !query 52 output +-- !query output --- !query 53 +-- !query SELECT udf('') AS `xxx`, udf(i), udf(j), udf(t), udf(k) FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (udf(udf(i)) = udf(1)) --- !query 53 schema +-- !query schema struct --- !query 53 output +-- !query output 1 4 one -1 --- !query 54 +-- !query CREATE TABLE t1 (name STRING, n INTEGER) USING parquet --- !query 54 schema +-- !query schema struct<> --- !query 54 output +-- !query output --- !query 55 +-- !query CREATE TABLE t2 (name STRING, n INTEGER) USING parquet --- !query 55 schema +-- !query schema struct<> --- !query 55 output +-- !query output --- !query 56 +-- !query CREATE TABLE t3 (name STRING, n INTEGER) USING parquet --- !query 56 schema +-- !query schema struct<> --- !query 56 output +-- !query output --- !query 57 +-- !query INSERT INTO t1 VALUES ( 'bb', 11 ) --- !query 57 schema +-- !query schema struct<> --- !query 57 output +-- !query output --- !query 58 +-- !query INSERT INTO t2 VALUES ( 'bb', 12 ) --- !query 58 schema +-- !query schema struct<> --- !query 58 output +-- !query output --- !query 59 +-- !query INSERT INTO t2 VALUES ( 'cc', 22 ) --- !query 59 schema +-- !query schema struct<> --- !query 59 output +-- !query output --- !query 60 +-- !query INSERT INTO t2 VALUES ( 'ee', 42 ) --- !query 60 schema +-- !query schema struct<> --- !query 60 output +-- !query output --- !query 61 +-- !query INSERT INTO t3 VALUES ( 'bb', 13 ) --- !query 61 schema +-- !query schema struct<> --- !query 61 output +-- !query output --- !query 62 +-- !query INSERT INTO t3 VALUES ( 'cc', 23 ) --- !query 62 schema +-- !query schema struct<> --- !query 62 output +-- !query output --- !query 63 +-- !query INSERT INTO t3 VALUES ( 'dd', 33 ) --- !query 63 schema +-- !query schema struct<> 
--- !query 63 output +-- !query output --- !query 64 +-- !query SELECT * FROM t1 FULL JOIN t2 USING (name) FULL JOIN t3 USING (name) --- !query 64 schema +-- !query schema struct --- !query 64 output +-- !query output bb 11 12 13 cc NULL 22 23 dd NULL NULL 33 ee NULL 42 NULL --- !query 65 +-- !query SELECT * FROM (SELECT udf(name) as name, t2.n FROM t2) as s2 INNER JOIN (SELECT udf(udf(name)) as name, t3.n FROM t3) s3 USING (name) --- !query 65 schema +-- !query schema struct --- !query 65 output +-- !query output bb 12 13 cc 22 23 --- !query 66 +-- !query SELECT * FROM (SELECT udf(udf(name)) as name, t2.n FROM t2) as s2 LEFT JOIN (SELECT udf(name) as name, t3.n FROM t3) s3 USING (name) --- !query 66 schema +-- !query schema struct --- !query 66 output +-- !query output bb 12 13 cc 22 23 ee 42 NULL --- !query 67 +-- !query SELECT udf(name), udf(udf(s2.n)), udf(s3.n) FROM (SELECT * FROM t2) as s2 FULL JOIN (SELECT * FROM t3) s3 USING (name) --- !query 67 schema +-- !query schema struct --- !query 67 output +-- !query output bb 12 13 cc 22 23 dd NULL 33 ee 42 NULL --- !query 68 +-- !query SELECT * FROM (SELECT udf(udf(name)) as name, udf(n) as s2_n, udf(2) as s2_2 FROM t2) as s2 NATURAL INNER JOIN (SELECT udf(name) as name, udf(udf(n)) as s3_n, udf(3) as s3_2 FROM t3) s3 --- !query 68 schema +-- !query schema struct --- !query 68 output +-- !query output bb 12 2 13 3 cc 22 2 23 3 --- !query 69 +-- !query SELECT * FROM (SELECT udf(name) as name, udf(udf(n)) as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL LEFT JOIN (SELECT udf(udf(name)) as name, udf(n) as s3_n, 3 as s3_2 FROM t3) s3 --- !query 69 schema +-- !query schema struct --- !query 69 output +-- !query output bb 12 2 13 3 cc 22 2 23 3 ee 42 2 NULL NULL --- !query 70 +-- !query SELECT * FROM (SELECT udf(name) as name, udf(n) as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL FULL JOIN (SELECT udf(udf(name)) as name, udf(udf(n)) as s3_n, 3 as s3_2 FROM t3) s3 --- !query 70 schema +-- !query schema struct --- !query 70 output 
+-- !query output bb 12 2 13 3 cc 22 2 23 3 dd NULL NULL 33 3 ee 42 2 NULL NULL --- !query 71 +-- !query SELECT * FROM (SELECT udf(udf(name)) as name, udf(n) as s1_n, 1 as s1_1 FROM t1) as s1 NATURAL INNER JOIN (SELECT udf(name) as name, udf(n) as s2_n, 2 as s2_2 FROM t2) as s2 NATURAL INNER JOIN (SELECT udf(udf(udf(name))) as name, udf(n) as s3_n, 3 as s3_2 FROM t3) s3 --- !query 71 schema +-- !query schema struct --- !query 71 output +-- !query output bb 11 1 12 2 13 3 --- !query 72 +-- !query SELECT * FROM (SELECT udf(name) as name, udf(n) as s1_n, udf(udf(1)) as s1_1 FROM t1) as s1 NATURAL FULL JOIN (SELECT udf(name) as name, udf(udf(n)) as s2_n, udf(2) as s2_2 FROM t2) as s2 NATURAL FULL JOIN (SELECT udf(udf(name)) as name, udf(n) as s3_n, udf(3) as s3_2 FROM t3) s3 --- !query 72 schema +-- !query schema struct --- !query 72 output +-- !query output bb 11 1 12 2 13 3 cc NULL NULL 22 2 23 3 dd NULL NULL NULL NULL 33 3 ee NULL NULL 42 2 NULL NULL --- !query 73 +-- !query SELECT name, udf(udf(s1_n)), udf(s2_n), udf(s3_n) FROM (SELECT name, udf(udf(n)) as s1_n FROM t1) as s1 NATURAL FULL JOIN @@ -2137,16 +2137,16 @@ NATURAL FULL JOIN NATURAL FULL JOIN (SELECT name, udf(udf(n)) as s3_n FROM t3) as s3 ) ss2 --- !query 73 schema +-- !query schema struct --- !query 73 output +-- !query output bb 11 12 13 cc NULL 22 23 dd NULL NULL 33 ee NULL 42 NULL --- !query 74 +-- !query SELECT * FROM (SELECT name, n as s1_n FROM t1) as s1 NATURAL FULL JOIN @@ -2155,55 +2155,55 @@ NATURAL FULL JOIN NATURAL FULL JOIN (SELECT name, udf(n) as s3_n FROM t3) as s3 ) ss2 --- !query 74 schema +-- !query schema struct --- !query 74 output +-- !query output bb 11 12 2 13 cc NULL 22 2 23 dd NULL NULL NULL 33 ee NULL 42 2 NULL --- !query 75 +-- !query SELECT s1.name, udf(s1_n), s2.name, udf(udf(s2_n)) FROM (SELECT name, udf(n) as s1_n FROM t1) as s1 FULL JOIN (SELECT name, 2 as s2_n FROM t2) as s2 ON (udf(udf(s1_n)) = udf(s2_n)) --- !query 75 schema +-- !query schema struct --- !query 75 
output +-- !query output NULL NULL bb 2 NULL NULL cc 2 NULL NULL ee 2 bb 11 NULL NULL --- !query 76 +-- !query create or replace temporary view x as select * from (values (1,11), (2,22), (3,null), (4,44), (5,null)) as v(x1, x2) --- !query 76 schema +-- !query schema struct<> --- !query 76 output +-- !query output --- !query 77 +-- !query create or replace temporary view y as select * from (values (1,111), (2,222), (3,333), (4,null)) as v(y1, y2) --- !query 77 schema +-- !query schema struct<> --- !query 77 output +-- !query output --- !query 78 +-- !query select udf(udf(x1)), udf(x2) from x --- !query 78 schema +-- !query schema struct --- !query 78 output +-- !query output 1 11 2 22 3 NULL @@ -2211,22 +2211,22 @@ struct --- !query 79 output +-- !query output 1 111 2 222 3 333 4 NULL --- !query 80 +-- !query select * from x left join y on (udf(x1) = udf(udf(y1)) and udf(x2) is not null) --- !query 80 schema +-- !query schema struct --- !query 80 output +-- !query output 1 11 1 111 2 22 2 222 3 NULL NULL NULL @@ -2234,11 +2234,11 @@ struct 5 NULL NULL NULL --- !query 81 +-- !query select * from x left join y on (udf(udf(x1)) = udf(y1) and udf(y2) is not null) --- !query 81 schema +-- !query schema struct --- !query 81 output +-- !query output 1 11 1 111 2 22 2 222 3 NULL 3 333 @@ -2246,12 +2246,12 @@ struct 5 NULL NULL NULL --- !query 82 +-- !query select * from (x left join y on (udf(x1) = udf(udf(y1)))) left join x xx(xx1,xx2) on (udf(udf(x1)) = udf(xx1)) --- !query 82 schema +-- !query schema struct --- !query 82 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL @@ -2259,12 +2259,12 @@ struct 5 NULL NULL NULL 5 NULL --- !query 83 +-- !query select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2) on (udf(x1) = xx1 and udf(x2) is not null) --- !query 83 schema +-- !query schema struct --- !query 83 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 NULL NULL @@ -2272,12 +2272,12 @@ struct 5 NULL 
NULL NULL NULL NULL --- !query 84 +-- !query select * from (x left join y on (x1 = udf(y1))) left join x xx(xx1,xx2) on (udf(x1) = udf(udf(xx1)) and udf(y2) is not null) --- !query 84 schema +-- !query schema struct --- !query 84 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL @@ -2285,12 +2285,12 @@ struct 5 NULL NULL NULL NULL NULL --- !query 85 +-- !query select * from (x left join y on (udf(x1) = y1)) left join x xx(xx1,xx2) on (udf(udf(x1)) = udf(xx1) and udf(udf(xx2)) is not null) --- !query 85 schema +-- !query schema struct --- !query 85 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 NULL NULL @@ -2298,78 +2298,78 @@ struct 5 NULL NULL NULL NULL NULL --- !query 86 +-- !query select * from (x left join y on (udf(udf(x1)) = udf(udf(y1)))) left join x xx(xx1,xx2) on (udf(x1) = udf(xx1)) where (udf(x2) is not null) --- !query 86 schema +-- !query schema struct --- !query 86 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 4 44 4 NULL 4 44 --- !query 87 +-- !query select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2) on (udf(x1) = xx1) where (udf(y2) is not null) --- !query 87 schema +-- !query schema struct --- !query 87 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 3 NULL 3 333 3 NULL --- !query 88 +-- !query select * from (x left join y on (udf(x1) = udf(y1))) left join x xx(xx1,xx2) on (x1 = udf(xx1)) where (xx2 is not null) --- !query 88 schema +-- !query schema struct --- !query 88 output +-- !query output 1 11 1 111 1 11 2 22 2 222 2 22 4 44 4 NULL 4 44 --- !query 89 +-- !query select udf(udf(count(*))) from tenk1 a where udf(udf(unique1)) in (select udf(unique1) from tenk1 b join tenk1 c using (unique1) where udf(udf(b.unique2)) = udf(42)) --- !query 89 schema +-- !query schema struct --- !query 89 output +-- !query output 1 --- !query 90 +-- !query select udf(count(*)) from tenk1 x where udf(x.unique1) in (select udf(a.f1) from int4_tbl a,float8_tbl b where 
udf(udf(a.f1))=b.f1) and udf(x.unique1) = 0 and udf(x.unique1) in (select aa.f1 from int4_tbl aa,float8_tbl bb where aa.f1=udf(udf(bb.f1))) --- !query 90 schema +-- !query schema struct --- !query 90 output +-- !query output 1 --- !query 91 +-- !query select udf(udf(count(*))) from tenk1 x where udf(x.unique1) in (select udf(a.f1) from int4_tbl a,float8_tbl b where udf(udf(a.f1))=b.f1) and udf(x.unique1) = 0 and udf(udf(x.unique1)) in (select udf(aa.f1) from int4_tbl aa,float8_tbl bb where udf(aa.f1)=udf(udf(bb.f1))) --- !query 91 schema +-- !query schema struct --- !query 91 output +-- !query output 1 --- !query 92 +-- !query select * from int8_tbl i1 left join (int8_tbl i2 join (select udf(123) as x) ss on udf(udf(i2.q1)) = udf(x)) on udf(udf(i1.q2)) = udf(udf(i2.q2)) order by udf(udf(1)), 2 --- !query 92 schema +-- !query schema struct --- !query 92 output +-- !query output 4567890123456789 -4567890123456789 NULL NULL NULL 4567890123456789 123 NULL NULL NULL 123 456 123 456 123 @@ -2377,7 +2377,7 @@ struct 4567890123456789 4567890123456789 123 4567890123456789 123 --- !query 93 +-- !query select udf(count(*)) from (select udf(t3.tenthous) as x1, udf(coalesce(udf(t1.stringu1), udf(t2.stringu1))) as x2 @@ -2387,32 +2387,32 @@ from tenk1 t4, tenk1 t5 where udf(t4.thousand) = udf(t5.unique1) and udf(udf(ss.x1)) = t4.tenthous and udf(ss.x2) = udf(udf(t5.stringu1)) --- !query 93 schema +-- !query schema struct --- !query 93 output +-- !query output 1000 --- !query 94 +-- !query select udf(a.f1), udf(b.f1), udf(t.thousand), udf(t.tenthous) from tenk1 t, (select udf(udf(sum(udf(f1))+1)) as f1 from int4_tbl i4a) a, (select udf(sum(udf(f1))) as f1 from int4_tbl i4b) b where b.f1 = udf(t.thousand) and udf(a.f1) = udf(b.f1) and udf((udf(a.f1)+udf(b.f1)+999)) = udf(udf(t.tenthous)) --- !query 94 schema +-- !query schema struct --- !query 94 output +-- !query output --- !query 95 +-- !query select * from j1_tbl full join (select * from j2_tbl order by udf(udf(j2_tbl.i)) desc, 
udf(j2_tbl.k) asc) j2_tbl on udf(j1_tbl.i) = udf(j2_tbl.i) and udf(j1_tbl.i) = udf(j2_tbl.k) --- !query 95 schema +-- !query schema struct --- !query 95 output +-- !query output 0 NULL zero NULL NULL 1 4 one NULL NULL 2 3 two 2 2 @@ -2434,156 +2434,156 @@ NULL NULL NULL NULL NULL NULL NULL null NULL NULL --- !query 96 +-- !query select udf(count(*)) from (select * from tenk1 x order by udf(x.thousand), udf(udf(x.twothousand)), x.fivethous) x left join (select * from tenk1 y order by udf(y.unique2)) y on udf(x.thousand) = y.unique2 and x.twothousand = udf(y.hundred) and x.fivethous = y.unique2 --- !query 96 schema +-- !query schema struct --- !query 96 output +-- !query output 10000 --- !query 97 +-- !query DROP TABLE t1 --- !query 97 schema +-- !query schema struct<> --- !query 97 output +-- !query output --- !query 98 +-- !query DROP TABLE t2 --- !query 98 schema +-- !query schema struct<> --- !query 98 output +-- !query output --- !query 99 +-- !query DROP TABLE t3 --- !query 99 schema +-- !query schema struct<> --- !query 99 output +-- !query output --- !query 100 +-- !query DROP TABLE J1_TBL --- !query 100 schema +-- !query schema struct<> --- !query 100 output +-- !query output --- !query 101 +-- !query DROP TABLE J2_TBL --- !query 101 schema +-- !query schema struct<> --- !query 101 output +-- !query output --- !query 102 +-- !query create or replace temporary view tt1 as select * from (values (1, 11), (2, NULL)) as v(tt1_id, joincol) --- !query 102 schema +-- !query schema struct<> --- !query 102 output +-- !query output --- !query 103 +-- !query create or replace temporary view tt2 as select * from (values (21, 11), (22, 11)) as v(tt2_id, joincol) --- !query 103 schema +-- !query schema struct<> --- !query 103 output +-- !query output --- !query 104 +-- !query select tt1.*, tt2.* from tt1 left join tt2 on udf(udf(tt1.joincol)) = udf(tt2.joincol) --- !query 104 schema +-- !query schema struct --- !query 104 output +-- !query output 1 11 21 11 1 11 22 11 2 
NULL NULL NULL --- !query 105 +-- !query select tt1.*, tt2.* from tt2 right join tt1 on udf(udf(tt1.joincol)) = udf(udf(tt2.joincol)) --- !query 105 schema +-- !query schema struct --- !query 105 output +-- !query output 1 11 21 11 1 11 22 11 2 NULL NULL NULL --- !query 106 +-- !query select udf(count(*)) from tenk1 a, tenk1 b where udf(a.hundred) = b.thousand and udf(udf((b.fivethous % 10)) < 10) --- !query 106 schema +-- !query schema struct --- !query 106 output +-- !query output 100000 --- !query 107 +-- !query DROP TABLE IF EXISTS tt3 --- !query 107 schema +-- !query schema struct<> --- !query 107 output +-- !query output --- !query 108 +-- !query CREATE TABLE tt3(f1 int, f2 string) USING parquet --- !query 108 schema +-- !query schema struct<> --- !query 108 output +-- !query output --- !query 109 +-- !query INSERT INTO tt3 SELECT x.id, repeat('xyzzy', 100) FROM range(1,10001) x --- !query 109 schema +-- !query schema struct<> --- !query 109 output +-- !query output --- !query 110 +-- !query DROP TABLE IF EXISTS tt4 --- !query 110 schema +-- !query schema struct<> --- !query 110 output +-- !query output --- !query 111 +-- !query CREATE TABLE tt4(f1 int) USING parquet --- !query 111 schema +-- !query schema struct<> --- !query 111 output +-- !query output --- !query 112 +-- !query INSERT INTO tt4 VALUES (0),(1),(9999) --- !query 112 schema +-- !query schema struct<> --- !query 112 output +-- !query output --- !query 113 +-- !query SELECT udf(udf(a.f1)) as f1 FROM tt4 a LEFT JOIN ( @@ -2592,242 +2592,242 @@ LEFT JOIN ( WHERE udf(c.f1) IS NULL ) AS d ON udf(a.f1) = d.f1 WHERE udf(udf(d.f1)) IS NULL --- !query 113 schema +-- !query schema struct --- !query 113 output +-- !query output 0 1 9999 --- !query 114 +-- !query create or replace temporary view tt5 as select * from (values (1, 10), (1, 11)) as v(f1, f2) --- !query 114 schema +-- !query schema struct<> --- !query 114 output +-- !query output --- !query 115 +-- !query create or replace temporary view tt6 as 
select * from (values (1, 9), (1, 2), (2, 9)) as v(f1, f2) --- !query 115 schema +-- !query schema struct<> --- !query 115 output +-- !query output --- !query 116 +-- !query select * from tt5,tt6 where udf(tt5.f1) = udf(tt6.f1) and udf(tt5.f1) = udf(udf(tt5.f2) - udf(tt6.f2)) --- !query 116 schema +-- !query schema struct --- !query 116 output +-- !query output 1 10 1 9 --- !query 117 +-- !query create or replace temporary view xx as select * from (values (1), (2), (3)) as v(pkxx) --- !query 117 schema +-- !query schema struct<> --- !query 117 output +-- !query output --- !query 118 +-- !query create or replace temporary view yy as select * from (values (101, 1), (201, 2), (301, NULL)) as v(pkyy, pkxx) --- !query 118 schema +-- !query schema struct<> --- !query 118 output +-- !query output --- !query 119 +-- !query select udf(udf(yy.pkyy)) as yy_pkyy, udf(yy.pkxx) as yy_pkxx, udf(yya.pkyy) as yya_pkyy, udf(xxa.pkxx) as xxa_pkxx, udf(xxb.pkxx) as xxb_pkxx from yy left join (SELECT * FROM yy where pkyy = 101) as yya ON udf(yy.pkyy) = udf(yya.pkyy) left join xx xxa on udf(yya.pkxx) = udf(udf(xxa.pkxx)) left join xx xxb on udf(udf(coalesce (xxa.pkxx, 1))) = udf(xxb.pkxx) --- !query 119 schema +-- !query schema struct --- !query 119 output +-- !query output 101 1 101 1 1 201 2 NULL NULL 1 301 NULL NULL NULL 1 --- !query 120 +-- !query create or replace temporary view zt1 as select * from (values (53)) as v(f1) --- !query 120 schema +-- !query schema struct<> --- !query 120 output +-- !query output --- !query 121 +-- !query create or replace temporary view zt2 as select * from (values (53)) as v(f2) --- !query 121 schema +-- !query schema struct<> --- !query 121 output +-- !query output --- !query 122 +-- !query create or replace temporary view zt3(f3 int) using parquet --- !query 122 schema +-- !query schema struct<> --- !query 122 output +-- !query output --- !query 123 +-- !query select * from zt2 left join zt3 on (udf(f2) = udf(udf(f3))) left join zt1 on 
(udf(udf(f3)) = udf(f1)) where udf(f2) = 53 --- !query 123 schema +-- !query schema struct --- !query 123 output +-- !query output 53 NULL NULL --- !query 124 +-- !query create temp view zv1 as select *,'dummy' AS junk from zt1 --- !query 124 schema +-- !query schema struct<> --- !query 124 output +-- !query output --- !query 125 +-- !query select * from zt2 left join zt3 on (f2 = udf(f3)) left join zv1 on (udf(f3) = f1) where udf(udf(f2)) = 53 --- !query 125 schema +-- !query schema struct --- !query 125 output +-- !query output 53 NULL NULL NULL --- !query 126 +-- !query select udf(a.unique2), udf(a.ten), udf(b.tenthous), udf(b.unique2), udf(b.hundred) from tenk1 a left join tenk1 b on a.unique2 = udf(b.tenthous) where udf(a.unique1) = 42 and ((udf(b.unique2) is null and udf(a.ten) = 2) or udf(udf(b.hundred)) = udf(udf(3))) --- !query 126 schema +-- !query schema struct --- !query 126 output +-- !query output --- !query 127 +-- !query create or replace temporary view a (i integer) using parquet --- !query 127 schema +-- !query schema struct<> --- !query 127 output +-- !query output --- !query 128 +-- !query create or replace temporary view b (x integer, y integer) using parquet --- !query 128 schema +-- !query schema struct<> --- !query 128 output +-- !query output --- !query 129 +-- !query select * from a left join b on udf(i) = x and i = udf(y) and udf(x) = udf(i) --- !query 129 schema +-- !query schema struct --- !query 129 output +-- !query output --- !query 130 +-- !query select udf(t1.q2), udf(count(t2.*)) from int8_tbl t1 left join int8_tbl t2 on (udf(udf(t1.q2)) = t2.q1) group by udf(t1.q2) order by 1 --- !query 130 schema +-- !query schema struct --- !query 130 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 131 +-- !query select udf(udf(t1.q2)), udf(count(t2.*)) from int8_tbl t1 left join (select * from int8_tbl) t2 on (udf(udf(t1.q2)) = udf(t2.q1)) group by udf(udf(t1.q2)) order by 1 --- !query 131 schema +-- 
!query schema struct --- !query 131 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 132 +-- !query select udf(t1.q2) as q2, udf(udf(count(t2.*))) from int8_tbl t1 left join (select udf(q1) as q1, case when q2=1 then 1 else q2 end as q2 from int8_tbl) t2 on (udf(t1.q2) = udf(t2.q1)) group by t1.q2 order by 1 --- !query 132 schema +-- !query schema struct --- !query 132 output +-- !query output -4567890123456789 0 123 2 456 0 4567890123456789 6 --- !query 133 +-- !query create or replace temporary view a as select * from (values ('p'), ('q')) as v(code) --- !query 133 schema +-- !query schema struct<> --- !query 133 output +-- !query output --- !query 134 +-- !query create or replace temporary view b as select * from (values ('p', 1), ('p', 2)) as v(a, num) --- !query 134 schema +-- !query schema struct<> --- !query 134 output +-- !query output --- !query 135 +-- !query create or replace temporary view c as select * from (values ('A', 'p'), ('B', 'q'), ('C', null)) as v(name, a) --- !query 135 schema +-- !query schema struct<> --- !query 135 output +-- !query output --- !query 136 +-- !query select udf(c.name), udf(ss.code), udf(ss.b_cnt), udf(ss.const) from c left join (select a.code, coalesce(b_grp.cnt, 0) as b_cnt, -1 as const @@ -2837,15 +2837,15 @@ from c left join ) as ss on (udf(udf(c.a)) = udf(ss.code)) order by c.name --- !query 136 schema +-- !query schema struct --- !query 136 output +-- !query output A p 2 -1 B q 0 -1 C NULL NULL NULL --- !query 137 +-- !query SELECT * FROM ( SELECT 1 as key1 ) sub1 LEFT JOIN @@ -2861,13 +2861,13 @@ LEFT JOIN ON udf(sub4.key5) = sub3.key3 ) sub2 ON udf(udf(sub1.key1)) = udf(udf(sub2.key3)) --- !query 137 schema +-- !query schema struct --- !query 137 output +-- !query output 1 1 1 1 --- !query 138 +-- !query SELECT * FROM ( SELECT 1 as key1 ) sub1 LEFT JOIN @@ -2883,13 +2883,13 @@ LEFT JOIN ON sub4.key5 = sub3.key3 ) sub2 ON sub1.key1 = udf(udf(sub2.key3)) --- !query 138 schema +-- 
!query schema struct --- !query 138 output +-- !query output 1 1 1 1 --- !query 139 +-- !query SELECT udf(qq), udf(udf(unique1)) FROM ( SELECT udf(COALESCE(q1, 0)) AS qq FROM int8_tbl a ) AS ss1 @@ -2897,45 +2897,45 @@ SELECT udf(qq), udf(udf(unique1)) ( SELECT udf(udf(COALESCE(q2, -1))) AS qq FROM int8_tbl b ) AS ss2 USING (qq) INNER JOIN tenk1 c ON udf(qq) = udf(unique2) --- !query 139 schema +-- !query schema struct --- !query 139 output +-- !query output 123 4596 123 4596 456 7318 --- !query 140 +-- !query create or replace temporary view nt1 as select * from (values(1,true,true), (2,true,false), (3,false,false)) as v(id, a1, a2) --- !query 140 schema +-- !query schema struct<> --- !query 140 output +-- !query output --- !query 141 +-- !query create or replace temporary view nt2 as select * from (values(1,1,true,true), (2,2,true,false), (3,3,false,false)) as v(id, nt1_id, b1, b2) --- !query 141 schema +-- !query schema struct<> --- !query 141 output +-- !query output --- !query 142 +-- !query create or replace temporary view nt3 as select * from (values(1,1,true), (2,2,false), (3,3,true)) as v(id, nt2_id, c1) --- !query 142 schema +-- !query schema struct<> --- !query 142 output +-- !query output --- !query 143 +-- !query select udf(nt3.id) from nt3 as nt3 left join @@ -2947,17 +2947,17 @@ from nt3 as nt3 ) as ss2 on udf(ss2.id) = nt3.nt2_id where udf(nt3.id) = 1 and udf(ss2.b3) --- !query 143 schema +-- !query schema struct --- !query 143 output +-- !query output 1 --- !query 144 +-- !query select * from int4_tbl a full join int4_tbl b on true --- !query 144 schema +-- !query schema struct --- !query 144 output +-- !query output -123456 -123456 -123456 -2147483647 -123456 0 @@ -2985,11 +2985,11 @@ struct 2147483647 2147483647 --- !query 145 +-- !query select * from int4_tbl a full join int4_tbl b on false --- !query 145 schema +-- !query schema struct --- !query 145 output +-- !query output -123456 NULL -2147483647 NULL 0 NULL @@ -3002,27 +3002,27 @@ NULL 
123456 NULL 2147483647 --- !query 146 +-- !query select udf(count(*)) from tenk1 a join tenk1 b on udf(a.unique1) = udf(b.unique2) left join tenk1 c on udf(a.unique2) = udf(b.unique1) and udf(c.thousand) = udf(udf(a.thousand)) join int4_tbl on udf(b.thousand) = f1 --- !query 146 schema +-- !query schema struct --- !query 146 output +-- !query output 10 --- !query 147 +-- !query select udf(b.unique1) from tenk1 a join tenk1 b on udf(a.unique1) = udf(b.unique2) left join tenk1 c on udf(b.unique1) = 42 and c.thousand = udf(a.thousand) join int4_tbl i1 on udf(b.thousand) = udf(udf(f1)) right join int4_tbl i2 on udf(udf(i2.f1)) = udf(b.tenthous) order by udf(1) --- !query 147 schema +-- !query schema struct --- !query 147 output +-- !query output NULL NULL 0 @@ -3030,7 +3030,7 @@ NULL NULL --- !query 148 +-- !query select * from ( select udf(unique1), udf(q1), udf(udf(coalesce(unique1, -1)) + udf(q1)) as fault @@ -3038,43 +3038,43 @@ select * from ) ss where udf(fault) = udf(122) order by udf(fault) --- !query 148 schema +-- !query schema struct --- !query 148 output +-- !query output NULL 123 122 --- !query 149 +-- !query select udf(q1), udf(unique2), udf(thousand), udf(hundred) from int8_tbl a left join tenk1 b on udf(q1) = udf(unique2) where udf(coalesce(thousand,123)) = udf(q1) and udf(q1) = udf(udf(coalesce(hundred,123))) --- !query 149 schema +-- !query schema struct --- !query 149 output +-- !query output --- !query 150 +-- !query select udf(f1), udf(unique2), case when udf(udf(unique2)) is null then udf(f1) else 0 end from int4_tbl a left join tenk1 b on udf(f1) = udf(udf(unique2)) where (case when udf(unique2) is null then udf(f1) else 0 end) = 0 --- !query 150 schema +-- !query schema struct --- !query 150 output +-- !query output 0 0 0 --- !query 151 +-- !query select udf(a.unique1), udf(b.unique1), udf(c.unique1), udf(coalesce(b.twothousand, a.twothousand)) from tenk1 a left join tenk1 b on udf(b.thousand) = a.unique1 left join tenk1 c on udf(c.unique2) = 
udf(coalesce(b.twothousand, a.twothousand)) where a.unique2 < udf(10) and udf(udf(coalesce(b.twothousand, a.twothousand))) = udf(44) --- !query 151 schema +-- !query schema struct --- !query 151 output +-- !query output --- !query 152 +-- !query select * from text_tbl t1 inner join int8_tbl i8 @@ -3083,32 +3083,32 @@ select * from on udf(t1.f1) = udf(udf('doh!')) left join int4_tbl i4 on udf(udf(i8.q1)) = i4.f1 --- !query 152 schema +-- !query schema struct --- !query 152 output +-- !query output doh! 123 456 doh! NULL doh! 123 456 hi de ho neighbor NULL --- !query 153 +-- !query select * from (select udf(udf(1)) as id) as xx left join (tenk1 as a1 full join (select udf(1) as id) as yy on (udf(a1.unique1) = udf(yy.id))) on (xx.id = udf(udf(coalesce(yy.id)))) --- !query 153 schema +-- !query schema struct --- !query 153 output +-- !query output 1 1 2838 1 1 1 1 1 1 1 1 1 2 3 BAAAAA EFEAAA OOOOxx 1 --- !query 154 +-- !query select udf(a.q2), udf(b.q1) from int8_tbl a left join int8_tbl b on udf(a.q2) = coalesce(b.q1, 1) where udf(udf(coalesce(b.q1, 1)) > 0) --- !query 154 schema +-- !query schema struct --- !query 154 output +-- !query output -4567890123456789 NULL 123 123 123 123 @@ -3121,124 +3121,124 @@ struct --- !query 155 output +-- !query output --- !query 156 +-- !query create or replace temporary view child as select * from (values (1, 100), (4, 400)) as v(k, cd) --- !query 156 schema +-- !query schema struct<> --- !query 156 output +-- !query output --- !query 157 +-- !query select p.* from parent p left join child c on (udf(p.k) = udf(c.k)) --- !query 157 schema +-- !query schema struct --- !query 157 output +-- !query output 1 10 2 20 3 30 --- !query 158 +-- !query select p.*, linked from parent p left join (select c.*, udf(udf(true)) as linked from child c) as ss on (udf(p.k) = udf(udf(ss.k))) --- !query 158 schema +-- !query schema struct --- !query 158 output +-- !query output 1 10 true 2 20 NULL 3 30 NULL --- !query 159 +-- !query select p.* from 
parent p left join child c on (udf(p.k) = c.k) where p.k = udf(1) and udf(udf(p.k)) = udf(udf(2)) --- !query 159 schema +-- !query schema struct --- !query 159 output +-- !query output --- !query 160 +-- !query select p.* from (parent p left join child c on (udf(p.k) = c.k)) join parent x on p.k = udf(x.k) where udf(p.k) = udf(1) and udf(udf(p.k)) = udf(udf(2)) --- !query 160 schema +-- !query schema struct --- !query 160 output +-- !query output --- !query 161 +-- !query create or replace temporary view a as select * from (values (0), (1)) as v(id) --- !query 161 schema +-- !query schema struct<> --- !query 161 output +-- !query output --- !query 162 +-- !query create or replace temporary view b as select * from (values (0, 0), (1, NULL)) as v(id, a_id) --- !query 162 schema +-- !query schema struct<> --- !query 162 output +-- !query output --- !query 163 +-- !query SELECT * FROM b LEFT JOIN a ON (udf(b.a_id) = udf(a.id)) WHERE (udf(udf(a.id)) IS NULL OR udf(a.id) > 0) --- !query 163 schema +-- !query schema struct --- !query 163 output +-- !query output 1 NULL NULL --- !query 164 +-- !query SELECT b.* FROM b LEFT JOIN a ON (udf(b.a_id) = udf(a.id)) WHERE (udf(a.id) IS NULL OR udf(udf(a.id)) > 0) --- !query 164 schema +-- !query schema struct --- !query 164 output +-- !query output 1 NULL --- !query 165 +-- !query create or replace temporary view innertab as select * from (values (123L, 42L)) as v(id, dat1) --- !query 165 schema +-- !query schema struct<> --- !query 165 output +-- !query output --- !query 166 +-- !query SELECT * FROM (SELECT udf(1) AS x) ss1 LEFT JOIN (SELECT udf(q1), udf(q2), udf(COALESCE(dat1, q1)) AS y FROM int8_tbl LEFT JOIN innertab ON udf(udf(q2)) = id) ss2 ON true --- !query 166 schema +-- !query schema struct --- !query 166 output +-- !query output 1 123 456 123 1 123 4567890123456789 123 1 4567890123456789 -4567890123456789 4567890123456789 @@ -3246,163 +3246,163 @@ struct --- !query 167 output +-- !query output 
org.apache.spark.sql.AnalysisException Reference 'f1' is ambiguous, could be: j.f1, j.f1.; line 2 pos 72 --- !query 168 +-- !query select * from int8_tbl x join (int4_tbl x cross join int4_tbl y) j on udf(q1) = udf(y.f1) --- !query 168 schema +-- !query schema struct<> --- !query 168 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`y.f1`' given input columns: [j.f1, j.f1, x.q1, x.q2]; line 2 pos 72 --- !query 169 +-- !query select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on udf(q1) = udf(udf(f1)) --- !query 169 schema +-- !query schema struct --- !query 169 output +-- !query output --- !query 170 +-- !query select udf(t1.uunique1) from tenk1 t1 join tenk2 t2 on t1.two = udf(t2.two) --- !query 170 schema +-- !query schema struct<> --- !query 170 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`t1.uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 11 --- !query 171 +-- !query select udf(udf(t2.uunique1)) from tenk1 t1 join tenk2 t2 on udf(t1.two) = t2.two --- !query 171 schema +-- !query schema struct<> --- !query 171 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`t2.uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 15 --- !query 172 +-- 
!query select udf(uunique1) from tenk1 t1 join tenk2 t2 on udf(t1.two) = udf(t2.two) --- !query 172 schema +-- !query schema struct<> --- !query 172 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`uunique1`' given input columns: [t1.even, t2.even, t1.fivethous, t2.fivethous, t1.four, t2.four, t1.hundred, t2.hundred, t1.odd, t2.odd, t1.string4, t2.string4, t1.stringu1, t2.stringu1, t1.stringu2, t2.stringu2, t1.ten, t2.ten, t1.tenthous, t2.tenthous, t1.thousand, t2.thousand, t1.twenty, t2.twenty, t1.two, t2.two, t1.twothousand, t2.twothousand, t1.unique1, t2.unique1, t1.unique2, t2.unique2]; line 1 pos 11 --- !query 173 +-- !query select udf(udf(f1,g)) from int4_tbl a, (select udf(udf(f1)) as g) ss --- !query 173 schema +-- !query schema struct<> --- !query 173 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`f1`' given input columns: []; line 1 pos 55 --- !query 174 +-- !query select udf(f1,g) from int4_tbl a, (select a.f1 as g) ss --- !query 174 schema +-- !query schema struct<> --- !query 174 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a.f1`' given input columns: []; line 1 pos 42 --- !query 175 +-- !query select udf(udf(f1,g)) from int4_tbl a cross join (select udf(f1) as g) ss --- !query 175 schema +-- !query schema struct<> --- !query 175 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`f1`' given input columns: []; line 1 pos 61 --- !query 176 +-- !query select udf(f1,g) from int4_tbl a cross join (select udf(udf(a.f1)) as g) ss --- !query 176 schema +-- !query schema struct<> --- !query 176 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a.f1`' given input columns: []; line 1 pos 60 --- !query 177 +-- !query CREATE TABLE j1 (id1 int, id2 int) USING parquet --- !query 177 schema +-- !query schema struct<> --- !query 177 output +-- !query output --- !query 178 +-- !query CREATE TABLE j2 (id1 int, 
id2 int) USING parquet --- !query 178 schema +-- !query schema struct<> --- !query 178 output +-- !query output --- !query 179 +-- !query INSERT INTO j1 values(1,1),(1,2) --- !query 179 schema +-- !query schema struct<> --- !query 179 output +-- !query output --- !query 180 +-- !query INSERT INTO j2 values(1,1) --- !query 180 schema +-- !query schema struct<> --- !query 180 output +-- !query output --- !query 181 +-- !query INSERT INTO j2 values(1,2) --- !query 181 schema +-- !query schema struct<> --- !query 181 output +-- !query output --- !query 182 +-- !query select * from j1 inner join j2 on udf(j1.id1) = udf(j2.id1) and udf(udf(j1.id2)) = udf(j2.id2) where udf(j1.id1) % 1000 = 1 and udf(udf(j2.id1) % 1000) = 1 --- !query 182 schema +-- !query schema struct --- !query 182 output +-- !query output 1 1 1 1 1 2 1 2 --- !query 183 +-- !query drop table j1 --- !query 183 schema +-- !query schema struct<> --- !query 183 output +-- !query output --- !query 184 +-- !query drop table j2 --- !query 184 schema +-- !query schema struct<> --- !query 184 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_having.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out similarity index 72% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_having.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out index f731d11c6d3da..68113afdfae30 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_having.sql.out @@ -2,186 +2,186 @@ -- Number of queries: 22 --- !query 0 +-- !query CREATE TABLE test_having (a int, b int, c string, d string) USING parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO test_having VALUES (0, 1, 
'XXXX', 'A') --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO test_having VALUES (1, 2, 'AAAA', 'b') --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO test_having VALUES (2, 2, 'AAAA', 'c') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO test_having VALUES (3, 3, 'BBBB', 'D') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO test_having VALUES (4, 3, 'BBBB', 'e') --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO test_having VALUES (5, 3, 'bbbb', 'F') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query INSERT INTO test_having VALUES (6, 4, 'cccc', 'g') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO test_having VALUES (7, 4, 'cccc', 'h') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO test_having VALUES (8, 4, 'CCCC', 'I') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO test_having VALUES (9, 4, 'CCCC', 'j') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT udf(b), udf(c) FROM test_having GROUP BY b, c HAVING udf(count(*)) = 1 ORDER BY udf(b), udf(c) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 XXXX 3 bbbb --- !query 12 +-- !query SELECT udf(b), udf(c) FROM test_having GROUP BY b, c HAVING udf(b) = 3 ORDER BY udf(b), udf(c) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 3 BBBB 3 bbbb --- !query 13 
+-- !query SELECT udf(c), max(udf(a)) FROM test_having GROUP BY c HAVING udf(count(*)) > 2 OR udf(min(a)) = udf(max(a)) ORDER BY c --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output XXXX 0 bbbb 5 --- !query 14 +-- !query SELECT udf(udf(min(udf(a)))), udf(udf(max(udf(a)))) FROM test_having HAVING udf(udf(min(udf(a)))) = udf(udf(max(udf(a)))) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT udf(min(udf(a))), udf(udf(max(a))) FROM test_having HAVING udf(min(a)) < udf(max(udf(a))) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 0 9 --- !query 16 +-- !query SELECT udf(a) FROM test_having HAVING udf(min(a)) < udf(max(a)) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and 'default.test_having.`a`' is not an aggregate function. Wrap '(min(default.test_having.`a`) AS `min(a#x)`, max(default.test_having.`a`) AS `max(a#x)`)' in windowing function(s) or wrap 'default.test_having.`a`' in first() (or first_value) if you don't care which value you get.; --- !query 17 +-- !query SELECT 1 AS one FROM test_having HAVING udf(a) > 1 --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`a`' given input columns: [one]; line 1 pos 44 --- !query 18 +-- !query SELECT 1 AS one FROM test_having HAVING udf(udf(1) > udf(2)) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT 1 AS one FROM test_having HAVING udf(udf(1) < udf(2)) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 1 --- !query 20 +-- !query SELECT 1 AS one FROM test_having WHERE 1/udf(a) = 1 HAVING 1 < 2 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output 1 
--- !query 21 +-- !query DROP TABLE test_having --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out similarity index 71% rename from sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out rename to sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out index a60cbf33b9b24..11cb682ee1494 100755 --- a/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-select_implicit.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-select_implicit.sql.out @@ -2,101 +2,101 @@ -- Number of queries: 38 --- !query 0 +-- !query CREATE TABLE test_missing_target (a int, b int, c string, d string) using parquet --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query INSERT INTO test_missing_target VALUES (0, 1, 'XXXX', 'A') --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query INSERT INTO test_missing_target VALUES (1, 2, 'ABAB', 'b') --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query INSERT INTO test_missing_target VALUES (2, 2, 'ABAB', 'c') --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query INSERT INTO test_missing_target VALUES (3, 3, 'BBBB', 'D') --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query INSERT INTO test_missing_target VALUES (4, 3, 'BBBB', 'e') --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query INSERT INTO test_missing_target VALUES (5, 3, 'bbbb', 'F') --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- 
!query output --- !query 7 +-- !query INSERT INTO test_missing_target VALUES (6, 4, 'cccc', 'g') --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query INSERT INTO test_missing_target VALUES (7, 4, 'cccc', 'h') --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query INSERT INTO test_missing_target VALUES (8, 4, 'CCCC', 'I') --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query INSERT INTO test_missing_target VALUES (9, 4, 'CCCC', 'j') --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output ABAB 2 BBBB 2 CCCC 2 @@ -105,12 +105,12 @@ bbbb 1 cccc 2 --- !query 12 +-- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(test_missing_target.c) ORDER BY udf(c) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 2 2 2 @@ -119,43 +119,43 @@ struct 2 --- !query 13 +-- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`b`' given input columns: [CAST(udf(cast(count(1) as string)) AS BIGINT)]; line 1 pos 75 --- !query 14 +-- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 2 3 4 --- !query 15 +-- !query SELECT udf(test_missing_target.b), udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 1 2 2 3 3 4 4 --- !query 16 +-- !query 
SELECT udf(c) FROM test_missing_target ORDER BY udf(a) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output XXXX ABAB ABAB @@ -168,30 +168,30 @@ CCCC CCCC --- !query 17 +-- !query SELECT udf(count(*)) FROM test_missing_target GROUP BY udf(b) ORDER BY udf(b) desc --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 4 3 2 1 --- !query 18 +-- !query SELECT udf(count(*)) FROM test_missing_target ORDER BY udf(1) desc --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 10 --- !query 19 +-- !query SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 1 ORDER BY 1 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output ABAB 2 BBBB 2 CCCC 2 @@ -200,32 +200,32 @@ bbbb 1 cccc 2 --- !query 20 +-- !query SELECT udf(c), udf(count(*)) FROM test_missing_target GROUP BY 3 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException GROUP BY position 3 is not in select list (valid range is [1, 2]); line 1 pos 63 --- !query 21 +-- !query SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(b) ORDER BY udf(b) --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14 --- !query 22 +-- !query SELECT udf(a), udf(a) FROM test_missing_target ORDER BY udf(a) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output 0 0 1 1 2 2 @@ -238,123 +238,129 @@ struct --- !query 23 output -0 0 -0 0 -1 1 -1 1 -2 2 -2 2 -3 3 -3 3 -4 4 -4 4 - - --- !query 24 +-- !query schema +struct +-- !query output +0.0 0.0 +0.5 0.5 +1.0 1.0 +1.5 1.5 +2.0 2.0 +2.5 2.5 +3.0 3.0 +3.5 3.5 +4.0 4.0 +4.5 4.5 + + +-- !query SELECT udf(a/2), udf(a/2) FROM test_missing_target GROUP BY udf(a/2) ORDER BY udf(a/2) --- 
!query 24 schema -struct --- !query 24 output -0 0 -1 1 -2 2 -3 3 -4 4 - - --- !query 25 +-- !query schema +struct +-- !query output +0.0 0.0 +0.5 0.5 +1.0 1.0 +1.5 1.5 +2.0 2.0 +2.5 2.5 +3.0 3.0 +3.5 3.5 +4.0 4.0 +4.5 4.5 + + +-- !query SELECT udf(x.b), udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b) ORDER BY udf(x.b) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 1 1 2 2 3 3 4 4 --- !query 26 +-- !query SELECT udf(count(*)) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b) ORDER BY udf(x.b) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1 2 3 4 --- !query 27 +-- !query SELECT udf(a%2), udf(count(udf(b))) FROM test_missing_target GROUP BY udf(test_missing_target.a%2) ORDER BY udf(test_missing_target.a%2) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 0 5 1 5 --- !query 28 +-- !query SELECT udf(count(c)) FROM test_missing_target GROUP BY udf(lower(test_missing_target.c)) ORDER BY udf(lower(test_missing_target.c)) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 2 3 4 1 --- !query 29 +-- !query SELECT udf(count(udf(a))) FROM test_missing_target GROUP BY udf(a) ORDER BY udf(b) --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`b`' given input columns: [CAST(udf(cast(count(cast(udf(cast(a as string)) as int)) as string)) AS BIGINT)]; line 1 pos 80 --- !query 30 +-- !query SELECT udf(count(b)) FROM test_missing_target GROUP BY udf(b/2) ORDER BY udf(b/2) --- !query 30 schema +-- !query schema struct --- !query 30 output +-- !query output 1 -5 +2 +3 4 --- !query 31 +-- !query SELECT udf(lower(test_missing_target.c)), udf(count(udf(c))) FROM test_missing_target GROUP BY udf(lower(c)) ORDER BY udf(lower(c)) --- !query 31 schema +-- 
!query schema struct --- !query 31 output +-- !query output abab 2 bbbb 3 cccc 4 xxxx 1 --- !query 32 +-- !query SELECT udf(a) FROM test_missing_target ORDER BY udf(upper(udf(d))) --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output 0 1 2 @@ -367,54 +373,57 @@ struct 9 --- !query 33 +-- !query SELECT udf(count(b)) FROM test_missing_target GROUP BY udf((b + 1) / 2) ORDER BY udf((b + 1) / 2) desc --- !query 33 schema +-- !query schema struct --- !query 33 output -7 +-- !query output +4 3 +2 +1 --- !query 34 +-- !query SELECT udf(count(udf(x.a))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(b/2) ORDER BY udf(b/2) --- !query 34 schema +-- !query schema struct<> --- !query 34 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 3 pos 14 --- !query 35 +-- !query SELECT udf(x.b/2), udf(count(udf(x.b))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b/2) ORDER BY udf(x.b/2) --- !query 35 schema -struct --- !query 35 output -0 1 -1 5 -2 4 +-- !query schema +struct +-- !query output +0.5 1 +1.0 2 +1.5 3 +2.0 4 --- !query 36 +-- !query SELECT udf(count(udf(b))) FROM test_missing_target x, test_missing_target y WHERE udf(x.a) = udf(y.a) GROUP BY udf(x.b/2) --- !query 36 schema +-- !query schema struct<> --- !query 36 output +-- !query output org.apache.spark.sql.AnalysisException Reference 'b' is ambiguous, could be: x.b, y.b.; line 1 pos 21 --- !query 37 +-- !query DROP TABLE test_missing_target --- !query 37 schema +-- !query schema struct<> --- !query 37 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out index 3d7c64054a6ac..e66948dcdea34 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/udf/udf-count.sql.out @@ -2,27 +2,27 @@ -- Number of queries: 5 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (1, 1), (null, 2), (1, null), (null, null) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT udf(count(*)), udf(count(1)), udf(count(null)), udf(count(a)), udf(count(b)), udf(count(a + b)), udf(count((a, b))) FROM testData --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 7 7 0 5 5 4 7 --- !query 2 +-- !query SELECT udf(count(DISTINCT 1)), udf(count(DISTINCT null)), @@ -31,25 +31,25 @@ SELECT udf(count(DISTINCT (a + b))), udf(count(DISTINCT (a, b))) FROM testData --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 0 2 2 2 6 --- !query 3 +-- !query SELECT udf(count(a, b)), udf(count(b, a)), udf(count(testData.*)) FROM testData --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 4 4 4 --- !query 4 +-- !query SELECT udf(count(DISTINCT a, b)), udf(count(DISTINCT b, a)), udf(count(DISTINCT *)), udf(count(DISTINCT testData.*)) FROM testData --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 3 3 3 3 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-cross-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-cross-join.sql.out index 98d3ad37a8dfa..fdddfc55978b4 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-cross-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-cross-join.sql.out @@ -2,35 +2,35 @@ -- Number of queries: 13 --- !query 0 +-- !query create temporary view nt1 as select * from values ("one", 1), ("two", 2), ("three", 3) as nt1(k, v1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary 
view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) as nt2(k, v2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM nt1 cross join nt2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 one 1 one 1 one 5 one 1 two 22 @@ -42,82 +42,82 @@ two 2 one 5 two 2 two 22 --- !query 3 +-- !query SELECT * FROM nt1 cross join nt2 where udf(nt1.k) = udf(nt2.k) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 one 1 one 1 one 5 two 2 two 22 --- !query 4 +-- !query SELECT * FROM nt1 cross join nt2 on (udf(nt1.k) = udf(nt2.k)) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 one 1 one 1 one 5 two 2 two 22 --- !query 5 +-- !query SELECT * FROM nt1 cross join nt2 where udf(nt1.v1) = "1" and udf(nt2.v2) = "22" --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one 1 two 22 --- !query 6 +-- !query SELECT udf(a.key), udf(b.key) FROM (SELECT udf(k) key FROM nt1 WHERE v1 < 2) a CROSS JOIN (SELECT udf(k) key FROM nt2 WHERE v2 = 22) b --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output one two --- !query 7 +-- !query create temporary view A(a, va) as select * from nt1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query create temporary view B(b, vb) as select * from nt1 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output --- !query 9 +-- !query create temporary view C(c, vc) as select * from nt1 --- !query 9 schema +-- !query schema struct<> --- !query 9 output +-- !query output --- !query 10 +-- !query create temporary view D(d, vd) as select * from nt1 --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output --- !query 11 +-- !query select * from ((A join B on (udf(a) = udf(b))) cross join C) join D on 
(udf(a) = udf(d)) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output one 1 one 1 one 1 one 1 one 1 one 1 three 3 one 1 one 1 one 1 two 2 one 1 @@ -129,11 +129,11 @@ two 2 two 2 three 3 two 2 two 2 two 2 two 2 two 2 --- !query 12 +-- !query SELECT * FROM nt1 CROSS JOIN nt2 ON (udf(nt1.k) > udf(nt2.k)) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output three 3 one 1 three 3 one 5 two 2 one 1 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out index b7bfad0e538ac..2613120e004df 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-except-all.sql.out @@ -2,25 +2,25 @@ -- Number of queries: 27 --- !query 0 +-- !query CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (0), (1), (2), (2), (2), (2), (3), (null), (null) AS tab1(c1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1), (2), (2), (3), (5), (5), (null) AS tab2(c1) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW tab3 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -28,13 +28,13 @@ CREATE TEMPORARY VIEW tab3 AS SELECT * FROM VALUES (2, 3), (2, 2) AS tab3(k, v) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TEMPORARY VIEW tab4 AS SELECT * FROM VALUES (1, 2), (2, 3), @@ -42,45 +42,45 @@ CREATE TEMPORARY VIEW tab4 AS SELECT * FROM VALUES (2, 2), (2, 20) AS tab4(k, v) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT udf(c1) FROM tab2 --- !query 4 schema +-- !query schema struct --- !query 4 output 
+-- !query output 0 2 2 NULL --- !query 5 +-- !query SELECT udf(c1) FROM tab1 MINUS ALL SELECT udf(c1) FROM tab2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0 2 2 NULL --- !query 6 +-- !query SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT udf(c1) FROM tab2 WHERE udf(c1) IS NOT NULL --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 0 2 2 @@ -88,23 +88,23 @@ NULL NULL --- !query 7 +-- !query SELECT udf(c1) FROM tab1 WHERE udf(c1) > 5 EXCEPT ALL SELECT udf(c1) FROM tab2 --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT udf(c1) FROM tab2 WHERE udf(c1 > udf(6)) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 0 1 2 @@ -116,13 +116,13 @@ NULL NULL --- !query 9 +-- !query SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT CAST(udf(1) AS BIGINT) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 0 2 2 @@ -133,65 +133,65 @@ NULL NULL --- !query 10 +-- !query SELECT udf(c1) FROM tab1 EXCEPT ALL SELECT array(1) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException ExceptAll can only be performed on tables with the compatible column types. 
array <> int at the first column of the second table; --- !query 11 +-- !query SELECT udf(k), v FROM tab3 EXCEPT ALL SELECT k, udf(v) FROM tab4 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 2 1 3 --- !query 12 +-- !query SELECT k, udf(v) FROM tab4 EXCEPT ALL SELECT udf(k), v FROM tab3 --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 2 2 2 20 --- !query 13 +-- !query SELECT udf(k), udf(v) FROM tab4 EXCEPT ALL SELECT udf(k), udf(v) FROM tab3 INTERSECT DISTINCT SELECT udf(k), udf(v) FROM tab4 --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 2 2 2 20 --- !query 14 +-- !query SELECT udf(k), v FROM tab4 EXCEPT ALL SELECT k, udf(v) FROM tab3 EXCEPT DISTINCT SELECT udf(k), udf(v) FROM tab4 --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT k, udf(v) FROM tab3 EXCEPT ALL SELECT udf(k), udf(v) FROM tab4 @@ -199,24 +199,24 @@ UNION ALL SELECT udf(k), v FROM tab3 EXCEPT DISTINCT SELECT k, udf(v) FROM tab4 --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 3 --- !query 16 +-- !query SELECT k FROM tab3 EXCEPT ALL SELECT k, v FROM tab4 --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.AnalysisException ExceptAll can only be performed on tables with the same number of columns, but the first table has 1 columns and the second table has 2 columns; --- !query 17 +-- !query SELECT udf(k), udf(v) FROM tab3 EXCEPT ALL SELECT udf(k), udf(v) FROM tab4 @@ -224,13 +224,13 @@ UNION SELECT udf(k), udf(v) FROM tab3 EXCEPT DISTINCT SELECT udf(k), udf(v) FROM tab4 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 3 --- !query 18 +-- !query SELECT udf(k), udf(v) FROM tab3 MINUS ALL SELECT k, udf(v) FROM tab4 @@ -238,13 +238,13 @@ UNION SELECT udf(k), udf(v) FROM tab3 MINUS DISTINCT SELECT k, 
udf(v) FROM tab4 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 3 --- !query 19 +-- !query SELECT k, udf(v) FROM tab3 EXCEPT ALL SELECT udf(k), v FROM tab4 @@ -252,13 +252,13 @@ EXCEPT DISTINCT SELECT k, udf(v) FROM tab3 EXCEPT DISTINCT SELECT udf(k), v FROM tab4 --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT * FROM (SELECT tab3.k, udf(tab4.v) @@ -272,13 +272,13 @@ FROM (SELECT udf(tab3.k), FROM tab3 JOIN tab4 ON tab3.k = udf(tab4.k)) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output --- !query 21 +-- !query SELECT * FROM (SELECT udf(udf(tab3.k)), udf(tab4.v) @@ -292,9 +292,9 @@ FROM (SELECT udf(tab4.v) AS k, FROM tab3 JOIN tab4 ON udf(tab3.k) = udf(tab4.k)) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output 1 2 1 2 1 2 @@ -304,43 +304,43 @@ struct --- !query 22 output +-- !query output 3 --- !query 23 +-- !query DROP VIEW IF EXISTS tab1 --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output --- !query 24 +-- !query DROP VIEW IF EXISTS tab2 --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output --- !query 25 +-- !query DROP VIEW IF EXISTS tab3 --- !query 25 schema +-- !query schema struct<> --- !query 25 output +-- !query output --- !query 26 +-- !query DROP VIEW IF EXISTS tab4 --- !query 26 schema +-- !query schema struct<> --- !query 26 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-except.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-except.sql.out index 0badaf050e194..054ee00ecc2ae 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-except.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-except.sql.out @@ -2,20 +2,20 @@ -- Number of queries: 9 --- !query 0 +-- !query create temporary view t1 as select * from values ("one", 1), 
("two", 2), ("three", 3), ("one", NULL) as t1(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view t2 as select * from values ("one", 1), ("two", 22), @@ -23,71 +23,71 @@ create temporary view t2 as select * from values ("one", NULL), (NULL, 5) as t2(k, v) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT udf(k), udf(v) FROM t1 EXCEPT SELECT udf(k), udf(v) FROM t2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output three 3 two 2 --- !query 3 +-- !query SELECT * FROM t1 EXCEPT SELECT * FROM t1 where udf(v) <> 1 and v <> udf(2) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 one NULL two 2 --- !query 4 +-- !query SELECT * FROM t1 where udf(v) <> 1 and v <> udf(22) EXCEPT SELECT * FROM t1 where udf(v) <> 2 and v >= udf(3) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output two 2 --- !query 5 +-- !query SELECT t1.* FROM t1, t2 where t1.k = t2.k EXCEPT SELECT t1.* FROM t1, t2 where t1.k = t2.k and t1.k != udf('one') --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one 1 one NULL --- !query 6 +-- !query SELECT * FROM t2 where v >= udf(1) and udf(v) <> 22 EXCEPT SELECT * FROM t1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL 5 one 5 --- !query 7 +-- !query SELECT (SELECT min(udf(k)) FROM t2 WHERE t2.k = t1.k) min_t2 FROM t1 MINUS SELECT (SELECT udf(min(k)) FROM t2) abs_min_t2 FROM t1 WHERE t1.k = udf('one') --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL two --- !query 8 +-- !query SELECT t1.k FROM t1 WHERE t1.v <= (SELECT udf(max(udf(t2.v))) @@ -99,7 +99,7 @@ FROM t1 WHERE udf(t1.v) >= (SELECT min(udf(t2.v)) FROM t2 WHERE t2.k = t1.k) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query 
output two diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out index de297ab166965..dc291a7696ea7 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-analytics.sql.out @@ -2,21 +2,21 @@ -- Number of queries: 29 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT udf(a + b), b, udf(SUM(a - b)) FROM testData GROUP BY udf(a + b), b WITH CUBE --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -32,11 +32,11 @@ NULL 2 0 NULL NULL 3 --- !query 2 +-- !query SELECT udf(a), udf(b), SUM(b) FROM testData GROUP BY udf(a), b WITH CUBE --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 1 1 1 2 2 1 NULL 3 @@ -51,11 +51,11 @@ NULL 2 6 NULL NULL 9 --- !query 3 +-- !query SELECT udf(a + b), b, SUM(a - b) FROM testData GROUP BY a + b, b WITH ROLLUP --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -69,11 +69,11 @@ struct NULL NULL 3 --- !query 4 +-- !query SELECT udf(a), b, udf(SUM(b)) FROM testData GROUP BY udf(a), b WITH ROLLUP --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 1 1 1 1 2 2 1 NULL 3 @@ -86,21 +86,21 @@ struct --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY ROLLUP(course, year) ORDER BY udf(course), year --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL NULL 113000 Java NULL 50000 Java 2012 20000 @@ -110,11 +110,11 @@ dotNET 2012 15000 dotNET 2013 
48000 --- !query 7 +-- !query SELECT course, year, SUM(earnings) FROM courseSales GROUP BY CUBE(course, year) ORDER BY course, udf(year) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL NULL 113000 NULL 2012 35000 NULL 2013 78000 @@ -126,41 +126,41 @@ dotNET 2012 15000 dotNET 2013 48000 --- !query 8 +-- !query SELECT course, udf(year), SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(course, year) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output Java NULL 50000 NULL 2012 35000 NULL 2013 78000 dotNET NULL 63000 --- !query 9 +-- !query SELECT course, year, udf(SUM(earnings)) FROM courseSales GROUP BY course, year GROUPING SETS(course) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output Java NULL 50000 dotNET NULL 63000 --- !query 10 +-- !query SELECT udf(course), year, SUM(earnings) FROM courseSales GROUP BY course, year GROUPING SETS(year) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL 2012 35000 NULL 2013 78000 --- !query 11 +-- !query SELECT course, udf(SUM(earnings)) AS sum FROM courseSales GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY course, udf(sum) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output NULL 113000 Java 20000 Java 30000 @@ -171,12 +171,12 @@ dotNET 48000 dotNET 63000 --- !query 12 +-- !query SELECT course, SUM(earnings) AS sum, GROUPING_ID(course, earnings) FROM courseSales GROUP BY course, earnings GROUPING SETS((), (course), (course, earnings)) ORDER BY udf(course), sum --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output NULL 113000 3 Java 20000 0 Java 30000 0 @@ -187,12 +187,12 @@ dotNET 48000 0 dotNET 63000 1 --- !query 13 +-- !query SELECT udf(course), udf(year), GROUPING(course), GROUPING(year), GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) --- !query 
13 schema +-- !query schema struct --- !query 13 output +-- !query output Java 2012 0 0 0 Java 2013 0 0 0 Java NULL 0 1 1 @@ -204,29 +204,29 @@ dotNET 2013 0 0 0 dotNET NULL 0 1 1 --- !query 14 +-- !query SELECT course, udf(year), GROUPING(course) FROM courseSales GROUP BY course, udf(year) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException grouping() can only be used with GroupingSets/Cube/Rollup; --- !query 15 +-- !query SELECT course, udf(year), GROUPING_ID(course, year) FROM courseSales GROUP BY udf(course), year --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 16 +-- !query SELECT course, year, grouping__id FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, course, udf(year) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output Java 2012 0 Java 2013 0 dotNET 2012 0 @@ -238,40 +238,40 @@ NULL 2013 2 NULL NULL 3 --- !query 17 +-- !query SELECT course, year FROM courseSales GROUP BY CUBE(course, year) HAVING GROUPING(year) = 1 AND GROUPING_ID(course, year) > 0 ORDER BY course, udf(year) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output NULL NULL Java NULL dotNET NULL --- !query 18 +-- !query SELECT course, udf(year) FROM courseSales GROUP BY udf(course), year HAVING GROUPING(course) > 0 --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 19 +-- !query SELECT course, udf(udf(year)) FROM courseSales GROUP BY course, year HAVING GROUPING_ID(course) > 0 --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only 
be used with GroupingSets/Cube/Rollup; --- !query 20 +-- !query SELECT udf(course), year FROM courseSales GROUP BY CUBE(course, year) HAVING grouping__id > 0 --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output Java NULL NULL 2012 NULL 2013 @@ -279,12 +279,12 @@ NULL NULL dotNET NULL --- !query 21 +-- !query SELECT course, year, GROUPING(course), GROUPING(year) FROM courseSales GROUP BY CUBE(course, year) ORDER BY GROUPING(course), GROUPING(year), course, udf(year) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output Java 2012 0 0 Java 2013 0 0 dotNET 2012 0 0 @@ -296,12 +296,12 @@ NULL 2013 1 0 NULL NULL 1 1 --- !query 22 +-- !query SELECT course, year, GROUPING_ID(course, year) FROM courseSales GROUP BY CUBE(course, year) ORDER BY GROUPING(course), GROUPING(year), course, udf(year) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output Java 2012 0 Java 2013 0 dotNET 2012 0 @@ -313,29 +313,29 @@ NULL 2013 2 NULL NULL 3 --- !query 23 +-- !query SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING(course) --- !query 23 schema +-- !query schema struct<> --- !query 23 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 24 +-- !query SELECT course, udf(year) FROM courseSales GROUP BY course, udf(year) ORDER BY GROUPING_ID(course) --- !query 24 schema +-- !query schema struct<> --- !query 24 output +-- !query output org.apache.spark.sql.AnalysisException grouping()/grouping_id() can only be used with GroupingSets/Cube/Rollup; --- !query 25 +-- !query SELECT course, year FROM courseSales GROUP BY CUBE(course, year) ORDER BY grouping__id, udf(course), year --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output Java 2012 Java 2013 dotNET 2012 @@ -347,11 +347,11 @@ NULL 2013 NULL NULL --- !query 26 +-- !query SELECT udf(a + 
b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -367,11 +367,11 @@ NULL 2 0 NULL NULL 3 --- !query 27 +-- !query SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b) --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 2 1 0 2 NULL 0 3 1 1 @@ -385,10 +385,10 @@ struct NULL NULL 3 --- !query 28 +-- !query SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k) --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output NULL 1 3 NULL 2 0 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out index febe47b5ba84e..6403406413db9 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-group-by.sql.out @@ -2,101 +2,101 @@ -- Number of queries: 52 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (1, 1), (1, 2), (2, 1), (2, 2), (3, 1), (3, 2), (null, 1), (3, null), (null, null) AS testData(a, b) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT udf(a), udf(COUNT(b)) FROM testData --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and 'testdata.`a`' is not an aggregate function. 
Wrap '(CAST(udf(cast(count(b) as string)) AS BIGINT) AS `CAST(udf(cast(count(b) as string)) AS BIGINT)`)' in windowing function(s) or wrap 'testdata.`a`' in first() (or first_value) if you don't care which value you get.; --- !query 2 +-- !query SELECT COUNT(udf(a)), udf(COUNT(b)) FROM testData --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 7 7 --- !query 3 +-- !query SELECT udf(a), COUNT(udf(b)) FROM testData GROUP BY a --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 2 2 3 2 NULL 1 --- !query 4 +-- !query SELECT udf(a), udf(COUNT(udf(b))) FROM testData GROUP BY b --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output org.apache.spark.sql.AnalysisException expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 5 +-- !query SELECT COUNT(udf(a)), COUNT(udf(b)) FROM testData GROUP BY udf(a) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0 1 2 2 2 2 3 2 --- !query 6 +-- !query SELECT 'foo', COUNT(udf(a)) FROM testData GROUP BY 1 --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output foo 7 --- !query 7 +-- !query SELECT 'foo' FROM testData WHERE a = 0 GROUP BY udf(1) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT 'foo', udf(APPROX_COUNT_DISTINCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output --- !query 9 +-- !query SELECT 'foo', MAX(STRUCT(udf(a))) FROM testData WHERE a = 0 GROUP BY udf(1) --- !query 9 schema +-- !query schema struct> --- !query 9 output +-- !query output --- !query 10 +-- !query SELECT udf(a + b), udf(COUNT(b)) FROM testData GROUP BY a + b --- !query 10 schema +-- !query schema struct --- !query 
10 output +-- !query output 2 1 3 2 4 2 @@ -104,132 +104,132 @@ struct --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException expression 'testdata.`a`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 12 +-- !query SELECT udf(a + 1) + 1, udf(COUNT(b)) FROM testData GROUP BY udf(a + 1) --- !query 12 schema +-- !query schema struct<(CAST(udf(cast((a + 1) as string)) AS INT) + 1):int,CAST(udf(cast(count(b) as string)) AS BIGINT):bigint> --- !query 12 output +-- !query output 3 2 4 2 5 2 NULL 1 --- !query 13 +-- !query SELECT SKEWNESS(udf(a)), udf(KURTOSIS(a)), udf(MIN(a)), MAX(udf(a)), udf(AVG(udf(a))), udf(VARIANCE(a)), STDDEV(udf(a)), udf(SUM(a)), udf(COUNT(a)) FROM testData --- !query 13 schema -struct --- !query 13 output +-- !query schema +struct +-- !query output -0.2723801058145729 -1.5069204152249134 1 3 2.142857142857143 0.8095238095238094 0.8997354108424372 15 7 --- !query 14 +-- !query SELECT COUNT(DISTINCT udf(b)), udf(COUNT(DISTINCT b, c)) FROM (SELECT 1 AS a, 2 AS b, 3 AS c) GROUP BY udf(a) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output 1 1 --- !query 15 +-- !query SELECT udf(a) AS k, COUNT(udf(b)) FROM testData GROUP BY k --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 1 2 2 2 3 2 NULL 1 --- !query 16 +-- !query SELECT a AS k, udf(COUNT(b)) FROM testData GROUP BY k HAVING k > 1 --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2 2 3 2 --- !query 17 +-- !query SELECT udf(COUNT(b)) AS k FROM testData GROUP BY k --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException aggregate functions are not allowed in GROUP BY, but found CAST(udf(cast(count(b) as string)) AS BIGINT); --- !query 18 +-- !query CREATE OR REPLACE TEMPORARY VIEW 
testDataHasSameNameWithAlias AS SELECT * FROM VALUES (1, 1, 3), (1, 2, 1) AS testDataHasSameNameWithAlias(k, a, v) --- !query 18 schema +-- !query schema struct<> --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT k AS a, udf(COUNT(udf(v))) FROM testDataHasSameNameWithAlias GROUP BY udf(a) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException expression 'testdatahassamenamewithalias.`k`' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.; --- !query 20 +-- !query set spark.sql.groupByAliases=false --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output spark.sql.groupByAliases false --- !query 21 +-- !query SELECT a AS k, udf(COUNT(udf(b))) FROM testData GROUP BY k --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`k`' given input columns: [testdata.a, testdata.b]; line 1 pos 57 --- !query 22 +-- !query SELECT udf(a), COUNT(udf(1)) FROM testData WHERE false GROUP BY udf(a) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output --- !query 23 +-- !query SELECT udf(COUNT(1)) FROM testData WHERE false --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 0 --- !query 24 +-- !query SELECT 1 FROM (SELECT udf(COUNT(1)) FROM testData WHERE false) t --- !query 24 schema +-- !query schema struct<1:int> --- !query 24 output +-- !query output 1 --- !query 25 +-- !query SELECT 1 from ( SELECT 1 AS z, udf(MIN(a.x)) @@ -237,88 +237,88 @@ SELECT 1 from ( WHERE false ) b where b.z != b.z --- !query 25 schema +-- !query schema struct<1:int> --- !query 25 output +-- !query output --- !query 26 +-- !query SELECT corr(DISTINCT x, y), udf(corr(DISTINCT y, x)), count(*) FROM (VALUES (1, 1), (2, 2), (2, 2)) t(x, 
y) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 1.0 1.0 3 --- !query 27 +-- !query SELECT udf(1) FROM range(10) HAVING true --- !query 27 schema +-- !query schema struct --- !query 27 output +-- !query output 1 --- !query 28 +-- !query SELECT udf(udf(1)) FROM range(10) HAVING MAX(id) > 0 --- !query 28 schema +-- !query schema struct --- !query 28 output +-- !query output 1 --- !query 29 +-- !query SELECT udf(id) FROM range(10) HAVING id > 0 --- !query 29 schema +-- !query schema struct<> --- !query 29 output +-- !query output org.apache.spark.sql.AnalysisException grouping expressions sequence is empty, and '`id`' is not an aggregate function. Wrap '()' in windowing function(s) or wrap '`id`' in first() (or first_value) if you don't care which value you get.; --- !query 30 +-- !query CREATE OR REPLACE TEMPORARY VIEW test_agg AS SELECT * FROM VALUES (1, true), (1, false), (2, true), (3, false), (3, null), (4, null), (4, null), (5, null), (5, true), (5, false) AS test_agg(k, v) --- !query 30 schema +-- !query schema struct<> --- !query 30 output +-- !query output --- !query 31 +-- !query SELECT udf(every(v)), udf(some(v)), any(v) FROM test_agg WHERE 1 = 0 --- !query 31 schema +-- !query schema struct --- !query 31 output +-- !query output NULL NULL NULL --- !query 32 +-- !query SELECT udf(every(udf(v))), some(v), any(v) FROM test_agg WHERE k = 4 --- !query 32 schema +-- !query schema struct --- !query 32 output +-- !query output NULL NULL NULL --- !query 33 +-- !query SELECT every(v), udf(some(v)), any(v) FROM test_agg WHERE k = 5 --- !query 33 schema +-- !query schema struct --- !query 33 output +-- !query output false true true --- !query 34 +-- !query SELECT udf(k), every(v), udf(some(v)), any(v) FROM test_agg GROUP BY udf(k) --- !query 34 schema +-- !query schema struct --- !query 34 output +-- !query output 1 false true true 2 true true true 3 false false false @@ -326,25 +326,25 @@ struct --- !query 35 output +-- !query 
output 1 false 3 false 5 false --- !query 36 +-- !query SELECT udf(k), udf(every(v)) FROM test_agg GROUP BY udf(k) HAVING every(v) IS NULL --- !query 36 schema +-- !query schema struct --- !query 36 output +-- !query output 4 NULL --- !query 37 +-- !query SELECT udf(k), udf(Every(v)) AS every FROM test_agg @@ -353,13 +353,13 @@ WHERE k = 2 FROM test_agg WHERE k = 1) GROUP BY udf(k) --- !query 37 schema +-- !query schema struct --- !query 37 output +-- !query output 2 true --- !query 38 +-- !query SELECT udf(udf(k)), Every(v) AS every FROM test_agg @@ -368,53 +368,53 @@ WHERE k = 2 FROM test_agg WHERE k = 1) GROUP BY udf(udf(k)) --- !query 38 schema +-- !query schema struct --- !query 38 output +-- !query output --- !query 39 +-- !query SELECT every(udf(1)) --- !query 39 schema +-- !query schema struct<> --- !query 39 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'every(CAST(udf(cast(1 as string)) AS INT))' due to data type mismatch: Input to function 'every' should have been boolean, but it's [int].; line 1 pos 7 --- !query 40 +-- !query SELECT some(udf(1S)) --- !query 40 schema +-- !query schema struct<> --- !query 40 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'some(CAST(udf(cast(1 as string)) AS SMALLINT))' due to data type mismatch: Input to function 'some' should have been boolean, but it's [smallint].; line 1 pos 7 --- !query 41 +-- !query SELECT any(udf(1L)) --- !query 41 schema +-- !query schema struct<> --- !query 41 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'any(CAST(udf(cast(1 as string)) AS BIGINT))' due to data type mismatch: Input to function 'any' should have been boolean, but it's [bigint].; line 1 pos 7 --- !query 42 +-- !query SELECT udf(every("true")) --- !query 42 schema +-- !query schema struct<> --- !query 42 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'every('true')' due to data type mismatch: Input to 
function 'every' should have been boolean, but it's [string].; line 1 pos 11 --- !query 43 +-- !query SELECT k, v, every(v) OVER (PARTITION BY k ORDER BY v) FROM test_agg --- !query 43 schema +-- !query schema struct --- !query 43 output +-- !query output 1 false false 1 true false 2 true true @@ -427,11 +427,11 @@ struct --- !query 44 output +-- !query output 1 false false 1 true true 2 true true @@ -444,11 +444,11 @@ struct --- !query 45 output +-- !query output 1 false false 1 true true 2 true true @@ -461,37 +461,37 @@ struct 1L --- !query 46 schema +-- !query schema struct --- !query 46 output +-- !query output 10 --- !query 47 +-- !query SELECT k, udf(max(v)) FROM test_agg GROUP BY k HAVING max(v) = true --- !query 47 schema +-- !query schema struct --- !query 47 output +-- !query output 1 true 2 true 5 true --- !query 48 +-- !query SELECT * FROM (SELECT udf(COUNT(*)) AS cnt FROM test_agg) WHERE cnt > 1L --- !query 48 schema +-- !query schema struct --- !query 48 output +-- !query output 10 --- !query 49 +-- !query SELECT udf(count(*)) FROM test_agg WHERE count(*) > 1L --- !query 49 schema +-- !query schema struct<> --- !query 49 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. @@ -499,11 +499,11 @@ Expression in where clause: [(count(1) > 1L)] Invalid expressions: [count(1)]; --- !query 50 +-- !query SELECT udf(count(*)) FROM test_agg WHERE count(*) + 1L > 1L --- !query 50 schema +-- !query schema struct<> --- !query 50 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. 
@@ -511,11 +511,11 @@ Expression in where clause: [((count(1) + 1L) > 1L)] Invalid expressions: [count(1)]; --- !query 51 +-- !query SELECT udf(count(*)) FROM test_agg WHERE k = 1 or k = 2 or count(*) + 1L > 1L or max(k) > 1 --- !query 51 schema +-- !query schema struct<> --- !query 51 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate/Window/Generate expressions are not valid in where clause of the query. diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-having.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-having.sql.out index 1effcc8470e19..9be27bb77f81a 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-having.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-having.sql.out @@ -2,48 +2,48 @@ -- Number of queries: 5 --- !query 0 +-- !query create temporary view hav as select * from values ("one", 1), ("two", 2), ("three", 3), ("one", 5) as hav(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT udf(k) AS k, udf(sum(v)) FROM hav GROUP BY k HAVING udf(sum(v)) > 2 --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output one 6 three 3 --- !query 2 +-- !query SELECT udf(count(udf(k))) FROM hav GROUP BY v + 1 HAVING v + 1 = udf(2) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 --- !query 3 +-- !query SELECT udf(MIN(t.v)) FROM (SELECT * FROM hav WHERE v > 0) t HAVING(udf(COUNT(udf(1))) > 0) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 --- !query 4 +-- !query SELECT udf(a + b) FROM VALUES (1L, 2), (3L, 4) AS T(a, b) GROUP BY a + b HAVING a + b > udf(1) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 3 7 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-inline-table.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-inline-table.sql.out index 
2cf24e50c80a5..d78d347bc9802 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-inline-table.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-inline-table.sql.out @@ -2,152 +2,152 @@ -- Number of queries: 17 --- !query 0 +-- !query select udf(col1), udf(col2) from values ("one", 1) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output one 1 --- !query 1 +-- !query select udf(col1), udf(udf(col2)) from values ("one", 1) as data --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output one 1 --- !query 2 +-- !query select udf(a), b from values ("one", 1) as data(a, b) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 --- !query 3 +-- !query select udf(a) from values 1, 2, 3 as data(a) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 3 --- !query 4 +-- !query select udf(a), b from values ("one", 1), ("two", 2), ("three", null) as data(a, b) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 three NULL two 2 --- !query 5 +-- !query select udf(a), b from values ("one", null), ("two", null) as data(a, b) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output one NULL two NULL --- !query 6 +-- !query select udf(a), b from values ("one", 1), ("two", 2L) as data(a, b) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output one 1 two 2 --- !query 7 +-- !query select udf(udf(a)), udf(b) from values ("one", 1 + 0), ("two", 1 + 3L) as data(a, b) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output one 1 two 4 --- !query 8 +-- !query select udf(a), b from values ("one", array(0, 1)), ("two", array(2, 3)) as data(a, b) --- !query 8 schema +-- !query schema struct> --- !query 8 output +-- !query output one [0,1] two [2,3] --- !query 9 +-- !query select udf(a), b from values ("one", 2.0), ("two", 
3.0D) as data(a, b) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output one 2.0 two 3.0 --- !query 10 +-- !query select udf(a), b from values ("one", rand(5)), ("two", 3.0D) as data(a, b) --- !query 10 schema +-- !query schema struct<> --- !query 10 output +-- !query output org.apache.spark.sql.AnalysisException cannot evaluate expression rand(5) in inline table definition; line 1 pos 37 --- !query 11 +-- !query select udf(a), udf(b) from values ("one", 2.0), ("two") as data(a, b) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException expected 2 columns but found 1 columns in row 1; line 1 pos 27 --- !query 12 +-- !query select udf(a), udf(b) from values ("one", array(0, 1)), ("two", struct(1, 2)) as data(a, b) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException incompatible types found in column b for inline table; line 1 pos 27 --- !query 13 +-- !query select udf(a), udf(b) from values ("one"), ("two") as data(a, b) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException expected 2 columns but found 1 columns in row 0; line 1 pos 27 --- !query 14 +-- !query select udf(a), udf(b) from values ("one", random_not_exist_func(1)), ("two", 2) as data(a, b) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Undefined function: 'random_not_exist_func'. 
This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 42 --- !query 15 +-- !query select udf(a), udf(b) from values ("one", count(1)), ("two", 2) as data(a, b) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot evaluate expression count(1) in inline table definition; line 1 pos 42 --- !query 16 +-- !query select udf(a), b from values (timestamp('1991-12-06 00:00:00.0'), array(timestamp('1991-12-06 01:00:00.0'), timestamp('1991-12-06 12:00:00.0'))) as data(a, b) --- !query 16 schema +-- !query schema struct> --- !query 16 output -1991-12-06 00:00:00 [1991-12-06 01:00:00.0,1991-12-06 12:00:00.0] +-- !query output +1991-12-06 00:00:00 [1991-12-06 01:00:00,1991-12-06 12:00:00] diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-inner-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-inner-join.sql.out index 120f2d39f73dc..107fe9eb2fe55 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-inner-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-inner-join.sql.out @@ -2,65 +2,65 @@ -- Number of queries: 7 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW t3 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query CREATE TEMPORARY VIEW t4 AS SELECT * FROM VALUES (1), (1) AS GROUPING(a) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE 
TEMPORARY VIEW ta AS SELECT udf(a) AS a, udf('a') AS tag FROM t1 UNION ALL SELECT udf(a) AS a, udf('b') AS tag FROM t2 --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query CREATE TEMPORARY VIEW tb AS SELECT udf(a) AS a, udf('a') AS tag FROM t3 UNION ALL SELECT udf(a) AS a, udf('b') AS tag FROM t4 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT tb.* FROM ta INNER JOIN tb ON ta.a = tb.a AND ta.tag = tb.tag --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 a 1 a 1 b diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out index 0cb82be2da228..b3735ae153267 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-intersect-all.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 22 --- !query 0 +-- !query CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -12,13 +12,13 @@ CREATE TEMPORARY VIEW tab1 AS SELECT * FROM VALUES (null, null), (null, null) AS tab1(k, v) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (1, 2), (1, 2), @@ -27,19 +27,19 @@ CREATE TEMPORARY VIEW tab2 AS SELECT * FROM VALUES (null, null), (null, null) AS tab2(k, v) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT udf(k), v FROM tab1 INTERSECT ALL SELECT k, udf(v) FROM tab2 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 2 1 2 2 3 @@ -47,80 +47,80 @@ NULL NULL NULL NULL --- !query 3 +-- !query SELECT k, udf(v) FROM tab1 INTERSECT ALL SELECT udf(k), v FROM tab1 WHERE udf(k) = 1 --- !query 3 schema +-- !query schema struct 
--- !query 3 output +-- !query output 1 2 1 2 1 3 1 3 --- !query 4 +-- !query SELECT udf(k), udf(v) FROM tab1 WHERE k > udf(2) INTERSECT ALL SELECT udf(k), udf(v) FROM tab2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT udf(k), v FROM tab1 INTERSECT ALL SELECT udf(k), v FROM tab2 WHERE udf(udf(k)) > 3 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output --- !query 6 +-- !query SELECT udf(k), v FROM tab1 INTERSECT ALL SELECT CAST(udf(1) AS BIGINT), CAST(udf(2) AS BIGINT) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 1 2 --- !query 7 +-- !query SELECT k, udf(v) FROM tab1 INTERSECT ALL SELECT array(1), udf(2) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output org.apache.spark.sql.AnalysisException IntersectAll can only be performed on tables with the compatible column types. array <> int at the first column of the second table; --- !query 8 +-- !query SELECT udf(k) FROM tab1 INTERSECT ALL SELECT udf(k), udf(v) FROM tab2 --- !query 8 schema +-- !query schema struct<> --- !query 8 output +-- !query output org.apache.spark.sql.AnalysisException IntersectAll can only be performed on tables with the same number of columns, but the first table has 1 columns and the second table has 2 columns; --- !query 9 +-- !query SELECT udf(k), v FROM tab2 INTERSECT ALL SELECT k, udf(v) FROM tab1 INTERSECT ALL SELECT udf(k), udf(v) FROM tab2 --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 2 1 2 2 3 @@ -128,7 +128,7 @@ NULL NULL NULL NULL --- !query 10 +-- !query SELECT udf(k), v FROM tab1 EXCEPT SELECT k, udf(v) FROM tab2 @@ -136,9 +136,9 @@ UNION ALL SELECT k, udf(udf(v)) FROM tab1 INTERSECT ALL SELECT udf(k), v FROM tab2 --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 1 2 1 2 1 3 @@ -147,7 +147,7 @@ NULL NULL NULL NULL --- !query 11 +-- !query 
SELECT udf(k), udf(v) FROM tab1 EXCEPT SELECT udf(k), v FROM tab2 @@ -155,13 +155,13 @@ EXCEPT SELECT k, udf(v) FROM tab1 INTERSECT ALL SELECT udf(k), udf(udf(v)) FROM tab2 --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output 1 3 --- !query 12 +-- !query ( ( ( @@ -175,13 +175,13 @@ struct --- !query 12 output +-- !query output --- !query 13 +-- !query SELECT * FROM (SELECT udf(tab1.k), udf(tab2.v) @@ -195,9 +195,9 @@ FROM (SELECT udf(tab1.k), FROM tab1 JOIN tab2 ON udf(tab1.k) = udf(udf(tab2.k))) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output 1 2 1 2 1 2 @@ -209,7 +209,7 @@ struct --- !query 14 output +-- !query output --- !query 15 +-- !query SELECT udf(v) FROM tab1 GROUP BY v INTERSECT ALL SELECT udf(udf(k)) FROM tab2 GROUP BY k --- !query 15 schema +-- !query schema struct --- !query 15 output +-- !query output 2 3 NULL --- !query 16 +-- !query SET spark.sql.legacy.setopsPrecedence.enabled= true --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output spark.sql.legacy.setopsPrecedence.enabled true --- !query 17 +-- !query SELECT udf(k), v FROM tab1 EXCEPT SELECT k, udf(v) FROM tab2 @@ -257,9 +257,9 @@ UNION ALL SELECT udf(k), udf(v) FROM tab1 INTERSECT ALL SELECT udf(udf(k)), udf(v) FROM tab2 --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output 1 2 1 2 2 3 @@ -267,7 +267,7 @@ NULL NULL NULL NULL --- !query 18 +-- !query SELECT k, udf(v) FROM tab1 EXCEPT SELECT udf(k), v FROM tab2 @@ -275,33 +275,33 @@ UNION ALL SELECT udf(k), udf(v) FROM tab1 INTERSECT SELECT udf(k), udf(udf(v)) FROM tab2 --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 2 2 3 NULL NULL --- !query 19 +-- !query SET spark.sql.legacy.setopsPrecedence.enabled = false --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output spark.sql.legacy.setopsPrecedence.enabled false --- !query 20 +-- !query DROP VIEW IF 
EXISTS tab1 --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output --- !query 21 +-- !query DROP VIEW IF EXISTS tab2 --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-join-empty-relation.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-join-empty-relation.sql.out index e79d01fb14d60..0802eb9a9f62b 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-join-empty-relation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-join-empty-relation.sql.out @@ -2,193 +2,193 @@ -- Number of queries: 24 --- !query 0 +-- !query CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT udf(t1.a), udf(empty_table.a) FROM t1 INNER JOIN empty_table ON (udf(t1.a) = udf(udf(empty_table.a))) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output --- !query 4 +-- !query SELECT udf(t1.a), udf(udf(empty_table.a)) FROM t1 CROSS JOIN empty_table ON (udf(udf(t1.a)) = udf(empty_table.a)) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT udf(udf(t1.a)), empty_table.a FROM t1 LEFT OUTER JOIN empty_table ON (udf(t1.a) = udf(empty_table.a)) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 1 NULL --- !query 6 +-- !query SELECT udf(t1.a), udf(empty_table.a) FROM t1 RIGHT OUTER JOIN empty_table ON (udf(t1.a) = 
udf(empty_table.a)) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output --- !query 7 +-- !query SELECT udf(t1.a), empty_table.a FROM t1 FULL OUTER JOIN empty_table ON (udf(t1.a) = udf(empty_table.a)) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 1 NULL --- !query 8 +-- !query SELECT udf(udf(t1.a)) FROM t1 LEFT SEMI JOIN empty_table ON (udf(t1.a) = udf(udf(empty_table.a))) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output --- !query 9 +-- !query SELECT udf(t1.a) FROM t1 LEFT ANTI JOIN empty_table ON (udf(t1.a) = udf(empty_table.a)) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 1 --- !query 10 +-- !query SELECT udf(empty_table.a), udf(t1.a) FROM empty_table INNER JOIN t1 ON (udf(udf(empty_table.a)) = udf(t1.a)) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output --- !query 11 +-- !query SELECT udf(empty_table.a), udf(udf(t1.a)) FROM empty_table CROSS JOIN t1 ON (udf(empty_table.a) = udf(udf(t1.a))) --- !query 11 schema +-- !query schema struct --- !query 11 output +-- !query output --- !query 12 +-- !query SELECT udf(udf(empty_table.a)), udf(t1.a) FROM empty_table LEFT OUTER JOIN t1 ON (udf(empty_table.a) = udf(t1.a)) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output --- !query 13 +-- !query SELECT empty_table.a, udf(t1.a) FROM empty_table RIGHT OUTER JOIN t1 ON (udf(empty_table.a) = udf(t1.a)) --- !query 13 schema +-- !query schema struct --- !query 13 output +-- !query output NULL 1 --- !query 14 +-- !query SELECT empty_table.a, udf(udf(t1.a)) FROM empty_table FULL OUTER JOIN t1 ON (udf(empty_table.a) = udf(t1.a)) --- !query 14 schema +-- !query schema struct --- !query 14 output +-- !query output NULL 1 --- !query 15 +-- !query SELECT udf(udf(empty_table.a)) FROM empty_table LEFT SEMI JOIN t1 ON (udf(empty_table.a) = udf(udf(t1.a))) --- !query 15 schema +-- 
!query schema struct --- !query 15 output +-- !query output --- !query 16 +-- !query SELECT empty_table.a FROM empty_table LEFT ANTI JOIN t1 ON (udf(empty_table.a) = udf(t1.a)) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output --- !query 17 +-- !query SELECT udf(empty_table.a) FROM empty_table INNER JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(udf(empty_table2.a))) --- !query 17 schema +-- !query schema struct --- !query 17 output +-- !query output --- !query 18 +-- !query SELECT udf(udf(empty_table.a)) FROM empty_table CROSS JOIN empty_table AS empty_table2 ON (udf(udf(empty_table.a)) = udf(empty_table2.a)) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output --- !query 19 +-- !query SELECT udf(empty_table.a) FROM empty_table LEFT OUTER JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(empty_table2.a)) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output --- !query 20 +-- !query SELECT udf(udf(empty_table.a)) FROM empty_table RIGHT OUTER JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(udf(empty_table2.a))) --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output --- !query 21 +-- !query SELECT udf(empty_table.a) FROM empty_table FULL OUTER JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(empty_table2.a)) --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output --- !query 22 +-- !query SELECT udf(udf(empty_table.a)) FROM empty_table LEFT SEMI JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(empty_table2.a)) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output --- !query 23 +-- !query SELECT udf(empty_table.a) FROM empty_table LEFT ANTI JOIN empty_table AS empty_table2 ON (udf(empty_table.a) = udf(empty_table2.a)) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-natural-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-natural-join.sql.out index 950809ddcaf25..a8233a0e398b0 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-natural-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-natural-join.sql.out @@ -2,63 +2,63 @@ -- Number of queries: 6 --- !query 0 +-- !query create temporary view nt1 as select * from values ("one", 1), ("two", 2), ("three", 3) as nt1(k, v1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view nt2 as select * from values ("one", 1), ("two", 22), ("one", 5) as nt2(k, v2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM nt1 natural join nt2 where udf(k) = "one" --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output one 1 1 one 1 5 --- !query 3 +-- !query SELECT * FROM nt1 natural left join nt2 where k <> udf("") order by v1, v2 --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output one 1 1 one 1 5 two 2 22 three 3 NULL --- !query 4 +-- !query SELECT * FROM nt1 natural right join nt2 where udf(k) <> udf("") order by v1, v2 --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output one 1 1 one 1 5 two 2 22 --- !query 5 +-- !query SELECT udf(count(*)) FROM nt1 natural full outer join nt2 --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 4 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out index 819f786070882..afebbb0c1da92 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-outer-join.sql.out @@ -2,27 +2,27 @@ -- Number of queries: 
8 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (-234), (145), (367), (975), (298) as t1(int_col1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (-769, -244), (-800, -409), (940, 86), (-507, 304), (-367, 158) as t2(int_col0, int_col1) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT (udf(SUM(udf(COALESCE(t1.int_col1, t2.int_col0))))), (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) @@ -33,40 +33,40 @@ GROUP BY udf(GREATEST(COALESCE(udf(t2.int_col1), 109), COALESCE(t1.int_col1, udf COALESCE(t1.int_col1, t2.int_col0) HAVING (udf(SUM(COALESCE(udf(t1.int_col1), udf(t2.int_col0))))) > (udf(COALESCE(t1.int_col1, t2.int_col0)) * 2) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output -367 -734 -507 -1014 -769 -1538 -800 -1600 --- !query 3 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (97) as t1(int_col1) --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output --- !query 4 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (0) as t2(int_col1) --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query set spark.sql.crossJoin.enabled = true --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output spark.sql.crossJoin.enabled true --- !query 6 +-- !query SELECT * FROM ( SELECT @@ -74,15 +74,15 @@ SELECT FROM t1 LEFT JOIN t2 ON false ) t where (udf(t.int_col)) is not null --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 97 --- !query 7 +-- !query set spark.sql.crossJoin.enabled = false --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output spark.sql.crossJoin.enabled false diff --git 
a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out index cb9e4d736c9a0..087b4ed9302d8 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-pivot.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 30 --- !query 0 +-- !query create temporary view courseSales as select * from values ("dotNET", 2012, 10000), ("Java", 2012, 20000), @@ -10,35 +10,35 @@ create temporary view courseSales as select * from values ("dotNET", 2013, 48000), ("Java", 2013, 30000) as courseSales(course, year, earnings) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query create temporary view years as select * from values (2012, 1), (2013, 2) as years(y, s) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query create temporary view yearsWithComplexTypes as select * from values (2012, array(1, 1), map('1', 1), struct(1, 'a')), (2013, array(2, 2), map('2', 2), struct(2, 'b')) as yearsWithComplexTypes(y, a, m, s) --- !query 2 schema +-- !query schema struct<> --- !query 2 output +-- !query output --- !query 3 +-- !query SELECT * FROM ( SELECT udf(year), course, earnings FROM courseSales ) @@ -46,27 +46,27 @@ PIVOT ( udf(sum(earnings)) FOR course IN ('dotNET', 'Java') ) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 2012 15000 20000 2013 48000 30000 --- !query 4 +-- !query SELECT * FROM courseSales PIVOT ( udf(sum(earnings)) FOR year IN (2012, 2013) ) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output Java 20000 30000 dotNET 15000 48000 --- !query 5 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -74,14 +74,14 @@ PIVOT ( udf(sum(earnings)), udf(avg(earnings)) FOR course IN ('dotNET', 'Java') ) --- !query 5 schema +-- !query 
schema struct --- !query 5 output +-- !query output 2012 15000 7500.0 20000 20000.0 2013 48000 48000.0 30000 30000.0 --- !query 6 +-- !query SELECT * FROM ( SELECT udf(course) as course, earnings FROM courseSales ) @@ -89,13 +89,13 @@ PIVOT ( udf(sum(earnings)) FOR course IN ('dotNET', 'Java') ) --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output 63000 50000 --- !query 7 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -103,13 +103,13 @@ PIVOT ( udf(sum(udf(earnings))), udf(min(year)) FOR course IN ('dotNET', 'Java') ) --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output 63000 2012 50000 2012 --- !query 8 +-- !query SELECT * FROM ( SELECT course, year, earnings, udf(s) as s FROM courseSales @@ -119,16 +119,16 @@ PIVOT ( udf(sum(earnings)) FOR s IN (1, 2) ) --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output Java 2012 20000 NULL Java 2013 NULL 30000 dotNET 2012 15000 NULL dotNET 2013 NULL 48000 --- !query 9 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -138,14 +138,14 @@ PIVOT ( udf(sum(earnings)), udf(min(s)) FOR course IN ('dotNET', 'Java') ) --- !query 9 schema +-- !query schema struct --- !query 9 output +-- !query output 2012 15000 1 20000 1 2013 48000 2 30000 2 --- !query 10 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -155,14 +155,14 @@ PIVOT ( udf(sum(earnings * s)) FOR course IN ('dotNET', 'Java') ) --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output 2012 15000 20000 2013 96000 60000 --- !query 11 +-- !query SELECT 2012_s, 2013_s, 2012_a, 2013_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) @@ -170,14 +170,14 @@ PIVOT ( udf(sum(e)) s, udf(avg(e)) a FOR y IN (2012, 2013) ) --- !query 11 schema +-- !query schema struct<2012_s:bigint,2013_s:bigint,2012_a:double,2013_a:double,c:string> --- !query 11 output +-- !query 
output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java --- !query 12 +-- !query SELECT firstYear_s, secondYear_s, firstYear_a, secondYear_a, c FROM ( SELECT year y, course c, earnings e FROM courseSales ) @@ -185,27 +185,27 @@ PIVOT ( udf(sum(e)) s, udf(avg(e)) a FOR y IN (2012 as firstYear, 2013 secondYear) ) --- !query 12 schema +-- !query schema struct --- !query 12 output +-- !query output 15000 48000 7500.0 48000.0 dotNET 20000 30000 20000.0 30000.0 Java --- !query 13 +-- !query SELECT * FROM courseSales PIVOT ( udf(abs(earnings)) FOR year IN (2012, 2013) ) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate expression required for pivot, but 'coursesales.`earnings`' did not appear in any aggregate function.; --- !query 14 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -213,14 +213,14 @@ PIVOT ( udf(sum(earnings)), year FOR course IN ('dotNET', 'Java') ) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException Aggregate expression required for pivot, but '__auto_generated_subquery_name.`year`' did not appear in any aggregate function.; --- !query 15 +-- !query SELECT * FROM ( SELECT course, earnings FROM courseSales ) @@ -228,14 +228,14 @@ PIVOT ( udf(sum(earnings)) FOR year IN (2012, 2013) ) --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`year`' given input columns: [__auto_generated_subquery_name.course, __auto_generated_subquery_name.earnings]; line 4 pos 0 --- !query 16 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -243,14 +243,14 @@ PIVOT ( udf(ceil(udf(sum(earnings)))), avg(earnings) + 1 as a1 FOR course IN ('dotNET', 'Java') ) --- !query 16 schema +-- !query schema struct --- !query 16 output +-- !query output 2012 15000 7501.0 
20000 20001.0 2013 48000 48001.0 30000 30001.0 --- !query 17 +-- !query SELECT * FROM ( SELECT year, course, earnings FROM courseSales ) @@ -258,14 +258,14 @@ PIVOT ( sum(udf(avg(earnings))) FOR course IN ('dotNET', 'Java') ) --- !query 17 schema +-- !query schema struct<> --- !query 17 output +-- !query output org.apache.spark.sql.AnalysisException It is not allowed to use an aggregate function in the argument of another aggregate function. Please use the inner aggregate function in a sub-query.; --- !query 18 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -275,14 +275,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, year) IN (('dotNET', 2012), ('Java', 2013)) ) --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output 1 15000 NULL 2 NULL 30000 --- !query 19 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -292,14 +292,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, s) IN (('dotNET', 2) as c1, ('Java', 1) as c2) ) --- !query 19 schema +-- !query schema struct --- !query 19 output +-- !query output 2012 NULL 20000 2013 48000 NULL --- !query 20 +-- !query SELECT * FROM ( SELECT course, year, earnings, s FROM courseSales @@ -309,40 +309,40 @@ PIVOT ( udf(sum(earnings)) FOR (course, year) IN ('dotNET', 'Java') ) --- !query 20 schema +-- !query schema struct<> --- !query 20 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot value 'dotNET': value data type string does not match pivot column data type struct; --- !query 21 +-- !query SELECT * FROM courseSales PIVOT ( udf(sum(earnings)) FOR year IN (s, 2013) ) --- !query 21 schema +-- !query schema struct<> --- !query 21 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '`s`' given input columns: [coursesales.course, coursesales.earnings, coursesales.year]; line 4 pos 15 --- !query 22 +-- !query SELECT * FROM courseSales PIVOT ( udf(sum(earnings)) FOR year IN (course, 2013) ) --- 
!query 22 schema +-- !query schema struct<> --- !query 22 output +-- !query output org.apache.spark.sql.AnalysisException Literal expressions required for pivot values, found 'course#x'; --- !query 23 +-- !query SELECT * FROM ( SELECT earnings, year, a FROM courseSales @@ -352,14 +352,14 @@ PIVOT ( udf(sum(earnings)) FOR a IN (array(1, 1), array(2, 2)) ) --- !query 23 schema +-- !query schema struct --- !query 23 output +-- !query output 2012 35000 NULL 2013 NULL 78000 --- !query 24 +-- !query SELECT * FROM ( SELECT course, earnings, udf(year) as year, a FROM courseSales @@ -369,14 +369,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, a) IN (('dotNET', array(1, 1)), ('Java', array(2, 2))) ) --- !query 24 schema +-- !query schema struct --- !query 24 output +-- !query output 2012 15000 NULL 2013 NULL 30000 --- !query 25 +-- !query SELECT * FROM ( SELECT earnings, year, s FROM courseSales @@ -386,14 +386,14 @@ PIVOT ( udf(sum(earnings)) FOR s IN ((1, 'a'), (2, 'b')) ) --- !query 25 schema +-- !query schema struct --- !query 25 output +-- !query output 2012 35000 NULL 2013 NULL 78000 --- !query 26 +-- !query SELECT * FROM ( SELECT course, earnings, year, s FROM courseSales @@ -403,14 +403,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, s) IN (('dotNET', (1, 'a')), ('Java', (2, 'b'))) ) --- !query 26 schema +-- !query schema struct --- !query 26 output +-- !query output 2012 15000 NULL 2013 NULL 30000 --- !query 27 +-- !query SELECT * FROM ( SELECT earnings, year, m FROM courseSales @@ -420,14 +420,14 @@ PIVOT ( udf(sum(earnings)) FOR m IN (map('1', 1), map('2', 2)) ) --- !query 27 schema +-- !query schema struct<> --- !query 27 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot column 'm#x'. 
Pivot columns must be comparable.; --- !query 28 +-- !query SELECT * FROM ( SELECT course, earnings, year, m FROM courseSales @@ -437,14 +437,14 @@ PIVOT ( udf(sum(earnings)) FOR (course, m) IN (('dotNET', map('1', 1)), ('Java', map('2', 2))) ) --- !query 28 schema +-- !query schema struct<> --- !query 28 output +-- !query output org.apache.spark.sql.AnalysisException Invalid pivot column 'named_struct(course, course#x, m, m#x)'. Pivot columns must be comparable.; --- !query 29 +-- !query SELECT * FROM ( SELECT course, earnings, udf("a") as a, udf("z") as z, udf("b") as b, udf("y") as y, udf("c") as c, udf("x") as x, udf("d") as d, udf("w") as w @@ -454,7 +454,7 @@ PIVOT ( udf(sum(Earnings)) FOR Course IN ('dotNET', 'Java') ) --- !query 29 schema +-- !query schema struct --- !query 29 output +-- !query output a z b y c x d w 63000 50000 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-special-values.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-special-values.sql.out index 7b2b5dbe578cc..5e5c79172bb7a 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-special-values.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-special-values.sql.out @@ -2,61 +2,61 @@ -- Number of queries: 6 --- !query 0 +-- !query SELECT udf(x) FROM (VALUES (1), (2), (NULL)) v(x) --- !query 0 schema +-- !query schema struct --- !query 0 output +-- !query output 1 2 NULL --- !query 1 +-- !query SELECT udf(x) FROM (VALUES ('A'), ('B'), (NULL)) v(x) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output A B NULL --- !query 2 +-- !query SELECT udf(x) FROM (VALUES ('NaN'), ('1'), ('2')) v(x) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 2 NaN --- !query 3 +-- !query SELECT udf(x) FROM (VALUES ('Infinity'), ('1'), ('2')) v(x) --- !query 3 schema +-- !query schema struct --- !query 3 output +-- !query output 1 2 Infinity --- !query 4 +-- !query SELECT udf(x) FROM 
(VALUES ('-Infinity'), ('1'), ('2')) v(x) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output -Infinity 1 2 --- !query 5 +-- !query SELECT udf(x) FROM (VALUES 0.00000001, 0.00000002, 0.00000003) v(x) --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output 0.00000001 0.00000002 0.00000003 diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out index f8e5fe6a62f33..19221947b4a88 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-udaf.sql.out @@ -2,69 +2,69 @@ -- Number of queries: 8 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1), (2), (3), (4) as t1(int_col1) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE FUNCTION myDoubleAvg AS 'test.org.apache.spark.sql.MyDoubleAvg' --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT default.myDoubleAvg(udf(int_col1)) as my_avg, udf(default.myDoubleAvg(udf(int_col1))) as my_avg2, udf(default.myDoubleAvg(int_col1)) as my_avg3 from t1 --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 102.5 102.5 102.5 --- !query 3 +-- !query SELECT default.myDoubleAvg(udf(int_col1), udf(3)) as my_avg from t1 --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException Invalid number of arguments for function default.myDoubleAvg. 
Expected: 1; Found: 2; line 1 pos 7 --- !query 4 +-- !query CREATE FUNCTION udaf1 AS 'test.non.existent.udaf' --- !query 4 schema +-- !query schema struct<> --- !query 4 output +-- !query output --- !query 5 +-- !query SELECT default.udaf1(udf(int_col1)) as udaf1, udf(default.udaf1(udf(int_col1))) as udaf2, udf(default.udaf1(int_col1)) as udaf3 from t1 --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output org.apache.spark.sql.AnalysisException Can not load class 'test.non.existent.udaf' when registering the function 'default.udaf1', please make sure it is on the classpath; line 1 pos 94 --- !query 6 +-- !query DROP FUNCTION myDoubleAvg --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query DROP FUNCTION udaf1 --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-union.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-union.sql.out index 84b5e10dbeb8e..c06c35d34cd74 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-union.sql.out @@ -2,93 +2,93 @@ -- Number of queries: 16 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (1, 'a'), (2, 'b') tbl(c1, c2) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (1.0, 1), (2.0, 4) tbl(c1, c2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT udf(c1) as c1, udf(c2) as c2 FROM (SELECT udf(c1) as c1, udf(c2) as c2 FROM t1 UNION ALL SELECT udf(c1) as c1, udf(c2) as c2 FROM t1) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 a 1 a 2 b 2 b --- !query 3 +-- !query SELECT udf(c1) as c1, udf(c2) as c2 FROM (SELECT udf(c1) as c1, 
udf(c2) as c2 FROM t1 UNION ALL SELECT udf(c1) as c1, udf(c2) as c2 FROM t2 UNION ALL SELECT udf(c1) as c1, udf(c2) as c2 FROM t2) --- !query 3 schema +-- !query schema struct --- !query 3 output -1 1 -1 1 -1 a -2 4 -2 4 -2 b +-- !query output +1.0 1 +1.0 1 +1.0 a +2.0 4 +2.0 4 +2.0 b --- !query 4 +-- !query SELECT udf(udf(a)) as a FROM (SELECT udf(0) a, udf(0) b UNION ALL SELECT udf(SUM(1)) a, udf(CAST(0 AS BIGINT)) b UNION ALL SELECT udf(0) a, udf(0) b) T --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 0 0 1 --- !query 5 +-- !query CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query output --- !query 8 +-- !query SELECT udf(1) AS x, udf(col) as col FROM (SELECT udf(col) AS col @@ -97,70 +97,70 @@ FROM (SELECT udf(col) AS col UNION ALL SELECT udf(col) FROM p3) T1) T2 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 1 1 1 --- !query 9 +-- !query SELECT map(1, 2), udf('str') as str UNION ALL SELECT map(1, 2, 3, NULL), udf(1) --- !query 9 schema +-- !query schema struct,str:string> --- !query 9 output +-- !query output {1:2,3:null} 1 {1:2} str --- !query 10 +-- !query SELECT array(1, 2), udf('str') as str UNION ALL SELECT array(1, 2, 3, NULL), udf(1) --- !query 10 schema +-- !query schema struct,str:string> --- !query 10 output +-- !query output [1,2,3,null] 1 [1,2] str --- !query 11 +-- !query DROP VIEW IF EXISTS t1 --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query DROP VIEW IF EXISTS t2 --- !query 12 schema +-- !query schema struct<> 
--- !query 12 output +-- !query output --- !query 13 +-- !query DROP VIEW IF EXISTS p1 --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query DROP VIEW IF EXISTS p2 --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query DROP VIEW IF EXISTS p3 --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/udf/udf-window.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/udf-window.sql.out index 9354d5e3117f4..a915c1bd6c717 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/udf-window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/udf-window.sql.out @@ -2,7 +2,7 @@ -- Number of queries: 23 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (null, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), "a"), (1, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), "a"), @@ -14,18 +14,18 @@ CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (null, null, null, null, null, null), (3, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), null) AS testData(val, val_long, val_double, val_date, val_timestamp, cate) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY cate ORDER BY udf(val) ROWS CURRENT ROW) FROM testData ORDER BY cate, udf(val) --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL NULL 0 3 NULL 1 NULL a 0 @@ -37,12 +37,12 @@ NULL a 0 3 b 1 --- !query 2 +-- !query SELECT udf(val), cate, sum(val) OVER(PARTITION BY cate ORDER BY udf(val) ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) FROM testData ORDER BY cate, udf(val) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output NULL NULL 3 3 NULL 3 NULL a 1 @@ 
-54,22 +54,22 @@ NULL a 1 3 b 6 --- !query 3 +-- !query SELECT val_long, udf(cate), sum(val_long) OVER(PARTITION BY cate ORDER BY udf(val_long) ROWS BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY udf(cate), val_long --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'ROWS BETWEEN CURRENT ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'bigint' does not match the expected data type 'int'.; line 1 pos 46 --- !query 4 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ORDER BY val RANGE 1 PRECEDING) FROM testData ORDER BY cate, udf(val) --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output NULL NULL 0 3 NULL 1 NULL a 0 @@ -81,12 +81,12 @@ NULL a 0 3 b 2 --- !query 5 +-- !query SELECT val, udf(cate), sum(val) OVER(PARTITION BY udf(cate) ORDER BY val RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY udf(cate), val --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output NULL NULL NULL 3 NULL 3 NULL a NULL @@ -98,12 +98,12 @@ NULL a NULL 3 b 3 --- !query 6 +-- !query SELECT val_long, udf(cate), sum(val_long) OVER(PARTITION BY udf(cate) ORDER BY val_long RANGE BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY udf(cate), val_long --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL NULL NULL 1 NULL 1 1 a 4 @@ -115,12 +115,12 @@ NULL b NULL 2147483650 b 2147483650 --- !query 7 +-- !query SELECT val_double, udf(cate), sum(val_double) OVER(PARTITION BY udf(cate) ORDER BY val_double RANGE BETWEEN CURRENT ROW AND 2.5 FOLLOWING) FROM testData ORDER BY udf(cate), val_double --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL NULL NULL 1.0 NULL 1.0 1.0 a 4.5 @@ -132,12 +132,12 @@ NULL NULL NULL 100.001 b 100.001 --- !query 8 +-- !query SELECT 
val_date, udf(cate), max(val_date) OVER(PARTITION BY udf(cate) ORDER BY val_date RANGE BETWEEN CURRENT ROW AND 2 FOLLOWING) FROM testData ORDER BY udf(cate), val_date --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL NULL NULL 2017-08-01 NULL 2017-08-01 2017-08-01 a 2017-08-02 @@ -149,13 +149,13 @@ NULL NULL NULL 2020-12-31 b 2020-12-31 --- !query 9 +-- !query SELECT val_timestamp, udf(cate), avg(val_timestamp) OVER(PARTITION BY udf(cate) ORDER BY val_timestamp RANGE BETWEEN CURRENT ROW AND interval 23 days 4 hours FOLLOWING) FROM testData ORDER BY udf(cate), val_timestamp --- !query 9 schema -struct --- !query 9 output +-- !query schema +struct +-- !query output NULL NULL NULL 2017-07-31 17:00:00 NULL 1.5015456E9 2017-07-31 17:00:00 a 1.5016970666666667E9 @@ -167,12 +167,12 @@ NULL NULL NULL 2020-12-30 16:00:00 b 1.6093728E9 --- !query 10 +-- !query SELECT val, udf(cate), sum(val) OVER(PARTITION BY cate ORDER BY val DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL NULL NULL 3 NULL 3 NULL a NULL @@ -184,62 +184,62 @@ NULL a NULL 3 b 5 --- !query 11 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ROWS BETWEEN UNBOUNDED FOLLOWING AND 1 FOLLOWING) FROM testData ORDER BY cate, udf(val) --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'ROWS BETWEEN UNBOUNDED FOLLOWING AND 1 FOLLOWING' due to data type mismatch: Window frame upper bound '1' does not follow the lower bound 'unboundedfollowing$()'.; line 1 pos 38 --- !query 12 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, udf(val) --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException 
cannot resolve '(PARTITION BY CAST(udf(cast(cate as string)) AS STRING) RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 38 --- !query 13 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ORDER BY udf(val), cate RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, udf(val) --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY CAST(udf(cast(cate as string)) AS STRING) ORDER BY CAST(udf(cast(val as string)) AS INT) ASC NULLS FIRST, testdata.`cate` ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: cast(udf(cast(val#x as string)) as int) ASC NULLS FIRST,cate#x ASC NULLS FIRST; line 1 pos 38 --- !query 14 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ORDER BY current_timestamp RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, udf(val) --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY CAST(udf(cast(cate as string)) AS STRING) ORDER BY current_timestamp() ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 38 --- !query 15 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ORDER BY val RANGE BETWEEN 1 FOLLOWING AND 1 PRECEDING) FROM testData ORDER BY udf(cate), val --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'RANGE BETWEEN 1 
FOLLOWING AND 1 PRECEDING' due to data type mismatch: The lower bound of a window frame must be less than or equal to the upper bound; line 1 pos 38 --- !query 16 +-- !query SELECT udf(val), cate, count(val) OVER(PARTITION BY udf(cate) ORDER BY udf(val) RANGE BETWEEN CURRENT ROW AND current_date PRECEDING) FROM testData ORDER BY cate, val(val) --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Frame bound value must be a literal.(line 2, pos 30) @@ -250,7 +250,7 @@ RANGE BETWEEN CURRENT ROW AND current_date PRECEDING) FROM testData ORDER BY cat ------------------------------^^^ --- !query 17 +-- !query SELECT udf(val), cate, max(udf(val)) OVER w AS max, min(udf(val)) OVER w AS min, @@ -285,9 +285,9 @@ kurtosis(udf(val_double)) OVER w AS kurtosis FROM testData WINDOW w AS (PARTITION BY udf(cate) ORDER BY udf(val)) ORDER BY cate, udf(val) --- !query 17 schema +-- !query schema struct,collect_set:array,skewness:double,kurtosis:double> --- !query 17 output +-- !query output NULL NULL NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.5 0.0 1 1 NULL NULL 0 NULL NULL NULL NULL [] [] NULL NULL 3 NULL 3 3 3 1 3 3.0 NaN NULL 3 NULL 3 3 3 2 2 1.0 1.0 2 2 0.0 NaN 1 0.0 NaN NaN 0.0 [3] [3] NaN NaN NULL a NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.25 0.0 1 1 NULL NULL 0 NULL NULL NULL NULL [] [] NaN NaN @@ -299,11 +299,11 @@ NULL a NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.25 0. 
3 b 3 1 1 3 6 2.0 1.0 1 1 1 3 3 3 3 3 1.0 1.0 2 3 0.6666666666666666 1.0 3 5.3687091175E8 1.0 1.0 0.816496580927726 [1,2,3] [1,2,3] 0.7057890433107311 -1.4999999999999984 --- !query 18 +-- !query SELECT udf(val), cate, avg(null) OVER(PARTITION BY cate ORDER BY val) FROM testData ORDER BY cate, val --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output NULL NULL NULL 3 NULL NULL NULL a NULL @@ -315,20 +315,20 @@ NULL a NULL 3 b NULL --- !query 19 +-- !query SELECT udf(val), cate, row_number() OVER(PARTITION BY cate) FROM testData ORDER BY cate, udf(val) --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException Window function row_number() requires window to be ordered, please add ORDER BY clause. For example SELECT row_number()(value_expr) OVER (PARTITION BY window_partition ORDER BY window_ordering) from table; --- !query 20 +-- !query SELECT udf(val), cate, sum(val) OVER(), avg(val) OVER() FROM testData ORDER BY cate, val --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL NULL 13 1.8571428571428572 3 NULL 13 1.8571428571428572 NULL a 13 1.8571428571428572 @@ -340,7 +340,7 @@ NULL a 13 1.8571428571428572 3 b 13 1.8571428571428572 --- !query 21 +-- !query SELECT udf(val), cate, first_value(false) OVER w AS first_value, first_value(true, true) OVER w AS first_value_ignore_null, @@ -351,9 +351,9 @@ last_value(false, false) OVER w AS last_value_contain_null FROM testData WINDOW w AS () ORDER BY cate, val --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL NULL false true false false true false 3 NULL false true false false true false NULL a false true false false true false @@ -365,14 +365,14 @@ NULL a false true false false true false 3 b false true false false true false --- !query 22 +-- !query SELECT udf(cate), sum(val) OVER (w) FROM testData WHERE val is not null WINDOW w AS (PARTITION BY 
cate ORDER BY val) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL 3 a 2 a 2 diff --git a/sql/core/src/test/resources/sql-tests/results/union.sql.out b/sql/core/src/test/resources/sql-tests/results/union.sql.out index b023df825d814..44002406836a4 100644 --- a/sql/core/src/test/resources/sql-tests/results/union.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/union.sql.out @@ -2,93 +2,93 @@ -- Number of queries: 16 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW t1 AS VALUES (1, 'a'), (2, 'b') tbl(c1, c2) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query CREATE OR REPLACE TEMPORARY VIEW t2 AS VALUES (1.0, 1), (2.0, 4) tbl(c1, c2) --- !query 1 schema +-- !query schema struct<> --- !query 1 output +-- !query output --- !query 2 +-- !query SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t1) --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output 1 a 1 a 2 b 2 b --- !query 3 +-- !query SELECT * FROM (SELECT * FROM t1 UNION ALL SELECT * FROM t2 UNION ALL SELECT * FROM t2) --- !query 3 schema +-- !query schema struct --- !query 3 output -1 1 -1 1 -1 a -2 4 -2 4 -2 b +-- !query output +1.0 1 +1.0 1 +1.0 a +2.0 4 +2.0 4 +2.0 b --- !query 4 +-- !query SELECT a FROM (SELECT 0 a, 0 b UNION ALL SELECT SUM(1) a, CAST(0 AS BIGINT) b UNION ALL SELECT 0 a, 0 b) T --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output 0 0 1 --- !query 5 +-- !query CREATE OR REPLACE TEMPORARY VIEW p1 AS VALUES 1 T(col) --- !query 5 schema +-- !query schema struct<> --- !query 5 output +-- !query output --- !query 6 +-- !query CREATE OR REPLACE TEMPORARY VIEW p2 AS VALUES 1 T(col) --- !query 6 schema +-- !query schema struct<> --- !query 6 output +-- !query output --- !query 7 +-- !query CREATE OR REPLACE TEMPORARY VIEW p3 AS VALUES 1 T(col) --- !query 7 schema +-- !query schema struct<> --- !query 7 output +-- !query 
output --- !query 8 +-- !query SELECT 1 AS x, col FROM (SELECT col AS col @@ -97,70 +97,70 @@ FROM (SELECT col AS col UNION ALL SELECT col FROM p3) T1) T2 --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output 1 1 1 1 --- !query 9 +-- !query SELECT map(1, 2), 'str' UNION ALL SELECT map(1, 2, 3, NULL), 1 --- !query 9 schema +-- !query schema struct,str:string> --- !query 9 output +-- !query output {1:2,3:null} 1 {1:2} str --- !query 10 +-- !query SELECT array(1, 2), 'str' UNION ALL SELECT array(1, 2, 3, NULL), 1 --- !query 10 schema +-- !query schema struct,str:string> --- !query 10 output +-- !query output [1,2,3,null] 1 [1,2] str --- !query 11 +-- !query DROP VIEW IF EXISTS t1 --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output --- !query 12 +-- !query DROP VIEW IF EXISTS t2 --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output --- !query 13 +-- !query DROP VIEW IF EXISTS p1 --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output --- !query 14 +-- !query DROP VIEW IF EXISTS p2 --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output --- !query 15 +-- !query DROP VIEW IF EXISTS p3 --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index 367dc4f513635..625088f90ced9 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -1,8 +1,8 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 23 +-- Number of queries: 24 --- !query 0 +-- !query CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (null, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), "a"), (1, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), "a"), @@ -14,18 
+14,18 @@ CREATE OR REPLACE TEMPORARY VIEW testData AS SELECT * FROM VALUES (null, null, null, null, null, null), (3, 1L, 1.0D, date("2017-08-01"), timestamp(1501545600), null) AS testData(val, val_long, val_double, val_date, val_timestamp, cate) --- !query 0 schema +-- !query schema struct<> --- !query 0 output +-- !query output --- !query 1 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY val ROWS CURRENT ROW) FROM testData ORDER BY cate, val --- !query 1 schema +-- !query schema struct --- !query 1 output +-- !query output NULL NULL 0 3 NULL 1 NULL a 0 @@ -37,12 +37,12 @@ NULL a 0 3 b 1 --- !query 2 +-- !query SELECT val, cate, sum(val) OVER(PARTITION BY cate ORDER BY val ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 2 schema +-- !query schema struct --- !query 2 output +-- !query output NULL NULL 3 3 NULL 3 NULL a 1 @@ -54,22 +54,22 @@ NULL a 1 3 b 6 --- !query 3 +-- !query SELECT val_long, cate, sum(val_long) OVER(PARTITION BY cate ORDER BY val_long ROWS BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY cate, val_long --- !query 3 schema +-- !query schema struct<> --- !query 3 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'ROWS BETWEEN CURRENT ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'bigint' does not match the expected data type 'int'.; line 1 pos 41 --- !query 4 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY val RANGE 1 PRECEDING) FROM testData ORDER BY cate, val --- !query 4 schema +-- !query schema struct --- !query 4 output +-- !query output NULL NULL 0 3 NULL 1 NULL a 0 @@ -81,12 +81,12 @@ NULL a 0 3 b 2 --- !query 5 +-- !query SELECT val, cate, sum(val) OVER(PARTITION BY cate ORDER BY val RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 5 schema +-- !query schema struct --- !query 5 output +-- !query output NULL NULL NULL 
3 NULL 3 NULL a NULL @@ -98,12 +98,12 @@ NULL a NULL 3 b 3 --- !query 6 +-- !query SELECT val_long, cate, sum(val_long) OVER(PARTITION BY cate ORDER BY val_long RANGE BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY cate, val_long --- !query 6 schema +-- !query schema struct --- !query 6 output +-- !query output NULL NULL NULL 1 NULL 1 1 a 4 @@ -115,12 +115,12 @@ NULL b NULL 2147483650 b 2147483650 --- !query 7 +-- !query SELECT val_double, cate, sum(val_double) OVER(PARTITION BY cate ORDER BY val_double RANGE BETWEEN CURRENT ROW AND 2.5 FOLLOWING) FROM testData ORDER BY cate, val_double --- !query 7 schema +-- !query schema struct --- !query 7 output +-- !query output NULL NULL NULL 1.0 NULL 1.0 1.0 a 4.5 @@ -132,12 +132,12 @@ NULL NULL NULL 100.001 b 100.001 --- !query 8 +-- !query SELECT val_date, cate, max(val_date) OVER(PARTITION BY cate ORDER BY val_date RANGE BETWEEN CURRENT ROW AND 2 FOLLOWING) FROM testData ORDER BY cate, val_date --- !query 8 schema +-- !query schema struct --- !query 8 output +-- !query output NULL NULL NULL 2017-08-01 NULL 2017-08-01 2017-08-01 a 2017-08-02 @@ -149,13 +149,13 @@ NULL NULL NULL 2020-12-31 b 2020-12-31 --- !query 9 +-- !query SELECT val_timestamp, cate, avg(val_timestamp) OVER(PARTITION BY cate ORDER BY val_timestamp RANGE BETWEEN CURRENT ROW AND interval 23 days 4 hours FOLLOWING) FROM testData ORDER BY cate, val_timestamp --- !query 9 schema -struct --- !query 9 output +-- !query schema +struct +-- !query output NULL NULL NULL 2017-07-31 17:00:00 NULL 1.5015456E9 2017-07-31 17:00:00 a 1.5016970666666667E9 @@ -167,12 +167,12 @@ NULL NULL NULL 2020-12-30 16:00:00 b 1.6093728E9 --- !query 10 +-- !query SELECT val, cate, sum(val) OVER(PARTITION BY cate ORDER BY val DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 10 schema +-- !query schema struct --- !query 10 output +-- !query output NULL NULL NULL 3 NULL 3 NULL a NULL @@ -184,62 +184,62 @@ NULL a NULL 3 b 5 
--- !query 11 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ROWS BETWEEN UNBOUNDED FOLLOWING AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 11 schema +-- !query schema struct<> --- !query 11 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'ROWS BETWEEN UNBOUNDED FOLLOWING AND 1 FOLLOWING' due to data type mismatch: Window frame upper bound '1' does not follow the lower bound 'unboundedfollowing$()'.; line 1 pos 33 --- !query 12 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 12 schema +-- !query schema struct<> --- !query 12 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY testdata.`cate` RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.; line 1 pos 33 --- !query 13 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY val, cate RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 13 schema +-- !query schema struct<> --- !query 13 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY testdata.`cate` ORDER BY testdata.`val` ASC NULLS FIRST, testdata.`cate` ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: val#x ASC NULLS FIRST,cate#x ASC NULLS FIRST; line 1 pos 33 --- !query 14 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY current_timestamp RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val --- !query 14 schema +-- !query schema struct<> --- !query 14 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve '(PARTITION BY testdata.`cate` ORDER BY 
current_timestamp() ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 33 --- !query 15 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY val RANGE BETWEEN 1 FOLLOWING AND 1 PRECEDING) FROM testData ORDER BY cate, val --- !query 15 schema +-- !query schema struct<> --- !query 15 output +-- !query output org.apache.spark.sql.AnalysisException cannot resolve 'RANGE BETWEEN 1 FOLLOWING AND 1 PRECEDING' due to data type mismatch: The lower bound of a window frame must be less than or equal to the upper bound; line 1 pos 33 --- !query 16 +-- !query SELECT val, cate, count(val) OVER(PARTITION BY cate ORDER BY val RANGE BETWEEN CURRENT ROW AND current_date PRECEDING) FROM testData ORDER BY cate, val --- !query 16 schema +-- !query schema struct<> --- !query 16 output +-- !query output org.apache.spark.sql.catalyst.parser.ParseException Frame bound value must be a literal.(line 2, pos 30) @@ -250,7 +250,7 @@ RANGE BETWEEN CURRENT ROW AND current_date PRECEDING) FROM testData ORDER BY cat ------------------------------^^^ --- !query 17 +-- !query SELECT val, cate, max(val) OVER w AS max, min(val) OVER w AS min, @@ -285,9 +285,9 @@ kurtosis(val_double) OVER w AS kurtosis FROM testData WINDOW w AS (PARTITION BY cate ORDER BY val) ORDER BY cate, val --- !query 17 schema +-- !query schema struct,collect_set:array,skewness:double,kurtosis:double> --- !query 17 output +-- !query output NULL NULL NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.5 0.0 1 1 NULL NULL 0 NULL NULL NULL NULL [] [] NULL NULL 3 NULL 3 3 3 1 3 3.0 NaN NULL 3 NULL 3 3 3 2 2 1.0 1.0 2 2 0.0 NaN 1 0.0 NaN NaN 0.0 [3] [3] NaN NaN NULL a NULL NULL NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.25 0.0 1 1 NULL NULL 0 NULL NULL NULL NULL [] [] NaN NaN @@ -299,11 +299,11 @@ NULL a NULL NULL 
NULL 0 NULL NULL NULL NULL NULL NULL NULL NULL NULL 1 1 0.25 0. 3 b 3 1 1 3 6 2.0 1.0 1 1 1 3 3 3 3 3 1.0 1.0 2 3 0.6666666666666666 1.0 3 5.3687091175E8 1.0 1.0 0.816496580927726 [1,2,3] [1,2,3] 0.7057890433107311 -1.4999999999999984 --- !query 18 +-- !query SELECT val, cate, avg(null) OVER(PARTITION BY cate ORDER BY val) FROM testData ORDER BY cate, val --- !query 18 schema +-- !query schema struct --- !query 18 output +-- !query output NULL NULL NULL 3 NULL NULL NULL a NULL @@ -315,20 +315,20 @@ NULL a NULL 3 b NULL --- !query 19 +-- !query SELECT val, cate, row_number() OVER(PARTITION BY cate) FROM testData ORDER BY cate, val --- !query 19 schema +-- !query schema struct<> --- !query 19 output +-- !query output org.apache.spark.sql.AnalysisException Window function row_number() requires window to be ordered, please add ORDER BY clause. For example SELECT row_number()(value_expr) OVER (PARTITION BY window_partition ORDER BY window_ordering) from table; --- !query 20 +-- !query SELECT val, cate, sum(val) OVER(), avg(val) OVER() FROM testData ORDER BY cate, val --- !query 20 schema +-- !query schema struct --- !query 20 output +-- !query output NULL NULL 13 1.8571428571428572 3 NULL 13 1.8571428571428572 NULL a 13 1.8571428571428572 @@ -340,7 +340,7 @@ NULL a 13 1.8571428571428572 3 b 13 1.8571428571428572 --- !query 21 +-- !query SELECT val, cate, first_value(false) OVER w AS first_value, first_value(true, true) OVER w AS first_value_ignore_null, @@ -351,9 +351,9 @@ last_value(false, false) OVER w AS last_value_contain_null FROM testData WINDOW w AS () ORDER BY cate, val --- !query 21 schema +-- !query schema struct --- !query 21 output +-- !query output NULL NULL false true false false true false 3 NULL false true false false true false NULL a false true false false true false @@ -365,14 +365,14 @@ NULL a false true false false true false 3 b false true false false true false --- !query 22 +-- !query SELECT cate, sum(val) OVER (w) FROM testData WHERE val is not 
null WINDOW w AS (PARTITION BY cate ORDER BY val) --- !query 22 schema +-- !query schema struct --- !query 22 output +-- !query output NULL 3 a 2 a 2 @@ -380,3 +380,14 @@ a 4 b 1 b 3 b 6 + + +-- !query +SELECT val, cate, +count(val) FILTER (WHERE val > 1) OVER(PARTITION BY cate) +FROM testData ORDER BY cate, val +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +window aggregate function with filter predicate is not supported yet.; diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/commits/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/commits/0 new file mode 100644 index 0000000000000..9c1e3021c3ead --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/commits/0 @@ -0,0 +1,2 @@ +v1 +{"nextBatchWatermarkMs":0} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/metadata b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/metadata new file mode 100644 index 0000000000000..543f156048abe --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/metadata @@ -0,0 +1 @@ +{"id":"1ab1ee6f-993c-4a51-824c-1c7cc8202f62"} \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/offsets/0 b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/offsets/0 new file mode 100644 index 0000000000000..63dba425b7e16 --- /dev/null +++ b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/offsets/0 @@ -0,0 +1,4 @@ +v1 
+{"batchWatermarkMs":0,"batchTimestampMs":1548845804202,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"5"}} +0 +0 \ No newline at end of file diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/0/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..2cdf645d3a406 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..9c69d01231196 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/1/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..edc7a97408aaa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..4e421cd377fb6 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..edc7a97408aaa Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/2/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..859c2b1315a5e Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..7535621b3adb2 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..6352978051846 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/3/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyToNumValues/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..f17037b3c5218 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/left-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyToNumValues/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyToNumValues/1.delta new file mode 100644 index 0000000000000..0bdaf341003b9 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyToNumValues/1.delta differ diff --git 
a/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta new file mode 100644 index 0000000000000..f17037b3c5218 Binary files /dev/null and b/sql/core/src/test/resources/structured-streaming/checkpoint-version-2.4.0-streaming-join/state/0/4/right-keyWithIndexToValue/1.delta differ diff --git a/sql/core/src/test/resources/test-data/bad_after_good.csv b/sql/core/src/test/resources/test-data/bad_after_good.csv index 4621a7d23714d..1a7c2651a11a7 100644 --- a/sql/core/src/test/resources/test-data/bad_after_good.csv +++ b/sql/core/src/test/resources/test-data/bad_after_good.csv @@ -1,2 +1,2 @@ "good record",1999-08-01 -"bad record",1999-088-01 +"bad record",1999-088_01 diff --git a/sql/core/src/test/resources/test-data/cars-multichar-delim-crazy.csv b/sql/core/src/test/resources/test-data/cars-multichar-delim-crazy.csv new file mode 100644 index 0000000000000..cabb50e9608e6 --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-multichar-delim-crazy.csv @@ -0,0 +1,4 @@ +year_/-\_make_/-\_model_/-\_comment_/-\_blank +'2012'_/-\_'Tesla'_/-\_'S'_/-\_'No comment'_/-\_ +1997_/-\_Ford_/-\_E350_/-\_'Go get one now they are going fast'_/-\_ +2015_/-\_Chevy_/-\_Volt diff --git a/sql/core/src/test/resources/test-data/cars-multichar-delim.csv b/sql/core/src/test/resources/test-data/cars-multichar-delim.csv new file mode 100644 index 0000000000000..4309edbf04418 --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-multichar-delim.csv @@ -0,0 +1,4 @@ +year, make, model, comment, blank +'2012', 'Tesla', 'S', No comment, +1997, Ford, E350, 'Go get one now they are going fast', +2015, Chevy, Volt diff --git a/sql/core/src/test/resources/test-data/malformedRow.csv b/sql/core/src/test/resources/test-data/malformedRow.csv new file mode 100644 index 
0000000000000..8cfb3eefb982c --- /dev/null +++ b/sql/core/src/test/resources/test-data/malformedRow.csv @@ -0,0 +1,5 @@ +fruit,color,price,quantity +apple,red,1,3 +banana,yellow,2,4 +orange,orange,3,5 +malformedrow diff --git a/sql/core/src/test/resources/test-data/value-malformed.csv b/sql/core/src/test/resources/test-data/value-malformed.csv index 8945ed73d2e83..6e6f08fca6df8 100644 --- a/sql/core/src/test/resources/test-data/value-malformed.csv +++ b/sql/core/src/test/resources/test-data/value-malformed.csv @@ -1,2 +1,2 @@ -0,2013-111-11 12:13:14 +0,2013-111_11 12:13:14 1,1983-08-04 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala index a4b142b7ab78e..2b4abed645910 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ApproximatePercentileQuerySuite.scala @@ -124,20 +124,24 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession test("percentile_approx, with different accuracies") { withTempView(table) { - (1 to 1000).toDF("col").createOrReplaceTempView(table) + val tableCount = 1000 + (1 to tableCount).toDF("col").createOrReplaceTempView(table) // With different accuracies - val expectedPercentile = 250D val accuracies = Array(1, 10, 100, 1000, 10000) - val errors = accuracies.map { accuracy => - val df = spark.sql(s"SELECT percentile_approx(col, 0.25, $accuracy) FROM $table") - val approximatePercentile = df.collect().head.getInt(0) - val error = Math.abs(approximatePercentile - expectedPercentile) - error + val expectedPercentiles = Array(100D, 200D, 250D, 314D, 777D) + for (accuracy <- accuracies) { + for (expectedPercentile <- expectedPercentiles) { + val df = spark.sql( + s"""SELECT + | percentile_approx(col, $expectedPercentile/$tableCount, $accuracy) + |FROM $table + """.stripMargin) + val 
approximatePercentile = df.collect().head.getInt(0) + val error = Math.abs(approximatePercentile - expectedPercentile) + assert(error <= math.floor(tableCount.toDouble / accuracy.toDouble)) + } } - - // The larger accuracy value we use, the smaller error we get - assert(errors.sorted.sameElements(errors.reverse)) } } @@ -145,7 +149,7 @@ class ApproximatePercentileQuerySuite extends QueryTest with SharedSparkSession withTempView(table) { (1 to 1000).toDF("col").createOrReplaceTempView(table) checkAnswer( - spark.sql(s"SELECT percentile_approx(col, array(0.25 + 0.25D), 200 + 800D) FROM $table"), + spark.sql(s"SELECT percentile_approx(col, array(0.25 + 0.25D), 200 + 800) FROM $table"), Row(Seq(499)) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala index 3fcb9892800b6..07afd4195c3d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql import org.apache.spark.internal.config.Tests.IS_TESTING -import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} +import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeFormatter, CodeGenerator} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.test.SharedSparkSession @@ -43,12 +43,12 @@ abstract class BenchmarkQueryTest extends QueryTest with SharedSparkSession { } } - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() RuleExecutor.resetMetrics() } - protected def checkGeneratedCode(plan: SparkPlan): Unit = { + protected def checkGeneratedCode(plan: SparkPlan, checkMethodCodeSize: Boolean = true): Unit = { val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() plan 
foreach { case s: WholeStageCodegenExec => @@ -57,7 +57,7 @@ abstract class BenchmarkQueryTest extends QueryTest with SharedSparkSession { } codegenSubtrees.toSeq.foreach { subtree => val code = subtree.doCodeGen()._2 - try { + val (_, ByteCodeStats(maxMethodCodeSize, _, _)) = try { // Just check the generated code can be properly compiled CodeGenerator.compile(code) } catch { @@ -72,6 +72,11 @@ abstract class BenchmarkQueryTest extends QueryTest with SharedSparkSession { """.stripMargin throw new Exception(msg, e) } + + assert(!checkMethodCodeSize || + maxMethodCodeSize <= CodeGenerator.DEFAULT_JVM_HUGE_METHOD_LIMIT, + s"too long generated codes found in the WholeStageCodegenExec subtree (id=${subtree.id}) " + + s"and JIT optimization might not work:\n${subtree.treeString}") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 6e1ee6da9200d..cd2c681dd7e0e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -27,7 +27,9 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{BROADCAST, Join, JoinStrategyHint, SHUFFLE_HASH} -import org.apache.spark.sql.execution.{RDDScanExec, SparkPlan} +import org.apache.spark.sql.catalyst.util.DateTimeConstants +import org.apache.spark.sql.execution.{ExecSubqueryExpression, RDDScanExec, SparkPlan} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.functions._ @@ -36,11 +38,14 @@ import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} import 
org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.storage.{RDDBlockId, StorageLevel} import org.apache.spark.storage.StorageLevel.{MEMORY_AND_DISK_2, MEMORY_ONLY} +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.{AccumulatorContext, Utils} private case class BigData(s: String) -class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSession { +class CachedTableSuite extends QueryTest with SQLTestUtils + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ setupTestData() @@ -87,16 +92,25 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi sum } + private def getNumInMemoryTablesInSubquery(plan: SparkPlan): Int = { + plan.expressions.flatMap(_.collect { + case sub: ExecSubqueryExpression => getNumInMemoryTablesRecursively(sub.plan) + }).sum + } + private def getNumInMemoryTablesRecursively(plan: SparkPlan): Int = { - plan.collect { - case InMemoryTableScanExec(_, _, relation) => - getNumInMemoryTablesRecursively(relation.cachedPlan) + 1 + collect(plan) { + case inMemoryTable @ InMemoryTableScanExec(_, _, relation) => + getNumInMemoryTablesRecursively(relation.cachedPlan) + + getNumInMemoryTablesInSubquery(inMemoryTable) + 1 + case p => + getNumInMemoryTablesInSubquery(p) }.sum } test("cache temp table") { withTempView("tempTable") { - testData.select('key).createOrReplaceTempView("tempTable") + testData.select("key").createOrReplaceTempView("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable"), 0) spark.catalog.cacheTable("tempTable") assertCached(sql("SELECT COUNT(*) FROM tempTable")) @@ -127,8 +141,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi } test("uncaching temp table") { - testData.select('key).createOrReplaceTempView("tempTable1") - testData.select('key).createOrReplaceTempView("tempTable2") + testData.select("key").createOrReplaceTempView("tempTable1") + 
testData.select("key").createOrReplaceTempView("tempTable2") spark.catalog.cacheTable("tempTable1") assertCached(sql("SELECT COUNT(*) FROM tempTable1")) @@ -361,15 +375,15 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi } test("Drops temporary table") { - testData.select('key).createOrReplaceTempView("t1") + testData.select("key").createOrReplaceTempView("t1") spark.table("t1") spark.catalog.dropTempView("t1") intercept[AnalysisException](spark.table("t1")) } test("Drops cached temporary table") { - testData.select('key).createOrReplaceTempView("t1") - testData.select('key).createOrReplaceTempView("t2") + testData.select("key").createOrReplaceTempView("t1") + testData.select("key").createOrReplaceTempView("t2") spark.catalog.cacheTable("t1") assert(spark.catalog.isCached("t1")) @@ -464,7 +478,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi */ private def verifyNumExchanges(df: DataFrame, expected: Int): Unit = { assert( - df.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => e }.size == expected) + collect(df.queryExecution.executedPlan) { case e: ShuffleExchangeExec => e }.size == expected) } test("A cached table preserves the partitioning and ordering of its cached SparkPlan") { @@ -515,7 +529,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi val query = sql("SELECT key, value, a, b FROM t1 t1 JOIN t2 t2 ON t1.key = t2.a") verifyNumExchanges(query, 1) - assert(query.queryExecution.executedPlan.outputPartitioning.numPartitions === 6) + assert(stripAQEPlan(query.queryExecution.executedPlan).outputPartitioning.numPartitions === 6) checkAnswer( query, testData.join(testData2, $"key" === $"a").select($"key", $"value", $"a", $"b")) @@ -532,7 +546,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi val query = sql("SELECT key, value, a, b FROM t1 t1 JOIN t2 t2 ON t1.key = t2.a") verifyNumExchanges(query, 1) - 
assert(query.queryExecution.executedPlan.outputPartitioning.numPartitions === 6) + assert(stripAQEPlan(query.queryExecution.executedPlan).outputPartitioning.numPartitions === 6) checkAnswer( query, testData.join(testData2, $"key" === $"a").select($"key", $"value", $"a", $"b")) @@ -548,7 +562,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi val query = sql("SELECT key, value, a, b FROM t1 t1 JOIN t2 t2 ON t1.key = t2.a") verifyNumExchanges(query, 1) - assert(query.queryExecution.executedPlan.outputPartitioning.numPartitions === 12) + assert(stripAQEPlan(query.queryExecution.executedPlan). + outputPartitioning.numPartitions === 12) checkAnswer( query, testData.join(testData2, $"key" === $"a").select($"key", $"value", $"a", $"b")) @@ -603,7 +618,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi val query = sql("SELECT key, value, a, b FROM t1 t1 JOIN t2 t2 ON t1.key = t2.a and t1.value = t2.b") verifyNumExchanges(query, 1) - assert(query.queryExecution.executedPlan.outputPartitioning.numPartitions === 6) + assert(stripAQEPlan(query.queryExecution.executedPlan).outputPartitioning.numPartitions === 6) checkAnswer( query, df1.join(df2, $"key" === $"a" && $"value" === $"b").select($"key", $"value", $"a", $"b")) @@ -849,7 +864,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi sparkContext.addSparkListener(jobListener) try { val result = f - sparkContext.listenerBus.waitUntilEmpty(10000L) + sparkContext.listenerBus.waitUntilEmpty() assert(numJobTrigered === 0) result } finally { @@ -859,7 +874,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi test("SPARK-23880 table cache should be lazy and don't trigger any jobs") { val cachedData = checkIfNoJobTriggered { - spark.range(1002).filter('id > 1000).orderBy('id.desc).cache() + spark.range(1002).filter($"id" > 1000).orderBy($"id".desc).cache() } assert(cachedData.collect === Seq(1001)) 
} @@ -891,7 +906,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi test("SPARK-24596 Non-cascading Cache Invalidation - drop persistent view") { withTable("t") { - spark.range(1, 10).toDF("key").withColumn("value", 'key * 2) + spark.range(1, 10).toDF("key").withColumn("value", $"key" * 2) .write.format("json").saveAsTable("t") withView("t1") { withTempView("t2") { @@ -911,7 +926,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi test("SPARK-24596 Non-cascading Cache Invalidation - uncache table") { withTable("t") { - spark.range(1, 10).toDF("key").withColumn("value", 'key * 2) + spark.range(1, 10).toDF("key").withColumn("value", $"key" * 2) .write.format("json").saveAsTable("t") withTempView("t1", "t2") { sql("CACHE TABLE t") @@ -1094,4 +1109,17 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSparkSessi } } } + + test("cache supports for intervals") { + withTable("interval_cache") { + Seq((1, "1 second"), (2, "2 seconds"), (2, null)) + .toDF("k", "v").write.saveAsTable("interval_cache") + sql("CACHE TABLE t1 AS SELECT k, cast(v as interval) FROM interval_cache") + assert(spark.catalog.isCached("t1")) + checkAnswer(sql("SELECT * FROM t1 WHERE k = 1"), + Row(1, new CalendarInterval(0, 0, DateTimeConstants.MICROS_PER_SECOND))) + sql("UNCACHE TABLE t1") + assert(!spark.catalog.isCached("t1")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala index a52c6d503d147..a9ee25b10dc02 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat} import org.scalatest.Matchers._ -import 
org.apache.spark.sql.catalyst.expressions.NamedExpression +import org.apache.spark.sql.catalyst.expressions.{In, InSet, NamedExpression} import org.apache.spark.sql.execution.ProjectExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -454,25 +454,36 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { test("isInCollection: Scala Collection") { val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") - // Test with different types of collections - checkAnswer(df.filter($"a".isInCollection(Seq(3, 1))), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet)), - df.collect().toSeq.filter(r => r.getInt(0) == 1 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".isInCollection(Seq(3, 2).toArray)), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) - checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList)), - df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) - val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") + Seq(1, 2).foreach { conf => + withSQLConf(SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> conf.toString) { + if (conf <= 1) { + assert($"a".isInCollection(Seq(3, 1)).expr.isInstanceOf[InSet], "Expect expr to be InSet") + } else { + assert($"a".isInCollection(Seq(3, 1)).expr.isInstanceOf[In], "Expect expr to be In") + } - val e = intercept[AnalysisException] { - df2.filter($"a".isInCollection(Seq($"b"))) - } - Seq("cannot resolve", "due to data type mismatch: Arguments must be same type but were") - .foreach { s => - assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) + // Test with different types of collections + checkAnswer(df.filter($"a".isInCollection(Seq(3, 1))), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) + checkAnswer(df.filter($"a".isInCollection(Seq(1, 2).toSet)), + df.collect().toSeq.filter(r 
=> r.getInt(0) == 1 || r.getInt(0) == 2)) + checkAnswer(df.filter($"a".isInCollection(Seq(3, 2).toArray)), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 2)) + checkAnswer(df.filter($"a".isInCollection(Seq(3, 1).toList)), + df.collect().toSeq.filter(r => r.getInt(0) == 3 || r.getInt(0) == 1)) + + val df2 = Seq((1, Seq(1)), (2, Seq(2)), (3, Seq(3))).toDF("a", "b") + + val e = intercept[AnalysisException] { + df2.filter($"a".isInCollection(Seq($"b"))) + } + Seq("cannot resolve", + "due to data type mismatch: Arguments must be same type but were").foreach { s => + assert(e.getMessage.toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))) + } } + } } test("&&") { @@ -526,12 +537,12 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { test("sqrt") { checkAnswer( - testData.select(sqrt('key)).orderBy('key.asc), + testData.select(sqrt($"key")).orderBy($"key".asc), (1 to 100).map(n => Row(math.sqrt(n))) ) checkAnswer( - testData.select(sqrt('value), 'key).orderBy('key.asc, 'value.asc), + testData.select(sqrt($"value"), $"key").orderBy($"key".asc, $"value".asc), (1 to 100).map(n => Row(math.sqrt(n), n)) ) @@ -543,12 +554,12 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { test("upper") { checkAnswer( - lowerCaseData.select(upper('l)), + lowerCaseData.select(upper($"l")), ('a' to 'd').map(c => Row(c.toString.toUpperCase(Locale.ROOT))) ) checkAnswer( - testData.select(upper('value), 'key), + testData.select(upper($"value"), $"key"), (1 to 100).map(n => Row(n.toString, n)) ) @@ -564,12 +575,12 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { test("lower") { checkAnswer( - upperCaseData.select(lower('L)), + upperCaseData.select(lower($"L")), ('A' to 'F').map(c => Row(c.toString.toLowerCase(Locale.ROOT))) ) checkAnswer( - testData.select(lower('value), 'key), + testData.select(lower($"value"), $"key"), (1 to 100).map(n => Row(n.toString, n)) ) @@ -742,8 +753,8 @@ class 
ColumnExpressionSuite extends QueryTest with SharedSparkSession { } test("columns can be compared") { - assert('key.desc == 'key.desc) - assert('key.desc != 'key.asc) + assert($"key".desc == $"key".desc) + assert($"key".desc != $"key".asc) } test("alias with metadata") { @@ -806,7 +817,7 @@ class ColumnExpressionSuite extends QueryTest with SharedSparkSession { } test("randn") { - val randCol = testData.select('key, randn(5L).as("rand")) + val randCol = testData.select($"key", randn(5L).as("rand")) randCol.columns.length should be (2) val rows = randCol.collect() rows.foreach { row => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala index 4d0eb04be751b..6b503334f9f23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ComplexTypesSuite.scala @@ -23,14 +23,14 @@ import org.apache.spark.sql.test.SharedSparkSession class ComplexTypesSuite extends QueryTest with SharedSparkSession { - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() spark.range(10).selectExpr( "id + 1 as i1", "id + 2 as i2", "id + 3 as i3", "id + 4 as i4", "id + 5 as i5") .write.saveAsTable("tab") } - override def afterAll() { + override def afterAll(): Unit = { try { spark.sql("DROP TABLE IF EXISTS tab") } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala index 431e797e1686e..c3dbbb325d842 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ConfigBehaviorSuite.scala @@ -41,7 +41,7 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession { // Trigger a sort // Range has range partitioning in its output now. To have a range shuffle, we // need to run a repartition first. 
- val data = spark.range(0, n, 1, 1).repartition(10).sort('id.desc) + val data = spark.range(0, n, 1, 1).repartition(10).sort($"id".desc) .selectExpr("SPARK_PARTITION_ID() pid", "id").as[(Int, Long)].collect() // Compute histogram for the number of records per partition post sort @@ -53,7 +53,11 @@ class ConfigBehaviorSuite extends QueryTest with SharedSparkSession { dist) } - withSQLConf(SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString) { + // When enable AQE, the post partition number is changed. + // And the ChiSquareTest result is also need updated. So disable AQE. + withSQLConf( + SQLConf.SHUFFLE_PARTITIONS.key -> numPartitions.toString, + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { // The default chi-sq value should be low assert(computeChiSquareTest() < 100) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala index 52cf91cfade51..61f0e138cc358 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.util.Locale @@ -152,7 +153,7 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { test("infers schemas of a CSV string and pass to to from_csv") { val in = Seq("""0.123456789,987654321,"San Francisco"""").toDS() val options = Map.empty[String, String].asJava - val out = in.select(from_csv('value, schema_of_csv("0.1,1,a"), options) as "parsed") + val out = in.select(from_csv($"value", schema_of_csv("0.1,1,a"), options) as "parsed") val expected = StructType(Seq(StructField( "parsed", StructType(Seq( @@ -181,4 +182,22 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(df, Row(Row(java.sql.Timestamp.valueOf("2018-11-06 18:00:00.0")))) } } + + test("special timestamp values") { 
+ Seq("now", "today", "epoch", "tomorrow", "yesterday").foreach { specialValue => + val input = Seq(specialValue).toDS() + val readback = input.select(from_csv($"value", lit("t timestamp"), + Map.empty[String, String].asJava)).collect() + assert(readback(0).getAs[Row](0).getAs[Timestamp](0).getTime >= 0) + } + } + + test("special date values") { + Seq("now", "today", "epoch", "tomorrow", "yesterday").foreach { specialValue => + val input = Seq(specialValue).toDS() + val readback = input.select(from_csv($"value", lit("d date"), + Map.empty[String, String].asJava)).collect() + assert(readback(0).getAs[Row](0).getAs[Date](0).getTime >= 0) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index ec7b636c8f695..d7df75fd0e2c3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -22,6 +22,7 @@ import scala.util.Random import org.scalatest.Matchers.the import org.apache.spark.sql.execution.WholeStageCodegenExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.expressions.Window @@ -29,11 +30,14 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.test.SQLTestData.DecimalData -import org.apache.spark.sql.types.DecimalType +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) -class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { +class 
DataFrameAggregateSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ val absTol = 1e-8 @@ -44,7 +48,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { Seq(Row(1, 3), Row(2, 3), Row(3, 3)) ) checkAnswer( - testData2.groupBy("a").agg(sum($"b").as("totB")).agg(sum('totB)), + testData2.groupBy("a").agg(sum($"b").as("totB")).agg(sum($"totB")), Row(9) ) checkAnswer( @@ -110,7 +114,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { val df = Seq(("some[thing]", "random-string")).toDF("key", "val") checkAnswer( - df.groupBy(regexp_extract('key, "([a-z]+)\\[", 1)).count(), + df.groupBy(regexp_extract($"key", "([a-z]+)\\[", 1)).count(), Row("some", 1) :: Nil ) } @@ -276,7 +280,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("agg without groups") { checkAnswer( - testData2.agg(sum('b)), + testData2.agg(sum($"b")), Row(9) ) } @@ -290,52 +294,53 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("average") { checkAnswer( - testData2.agg(avg('a), mean('a)), + testData2.agg(avg($"a"), mean($"a")), Row(2.0, 2.0)) checkAnswer( - testData2.agg(avg('a), sumDistinct('a)), // non-partial + testData2.agg(avg($"a"), sumDistinct($"a")), // non-partial Row(2.0, 6.0) :: Nil) checkAnswer( - decimalData.agg(avg('a)), + decimalData.agg(avg($"a")), Row(new java.math.BigDecimal(2))) checkAnswer( - decimalData.agg(avg('a), sumDistinct('a)), // non-partial + decimalData.agg(avg($"a"), sumDistinct($"a")), // non-partial Row(new java.math.BigDecimal(2), new java.math.BigDecimal(6)) :: Nil) checkAnswer( - decimalData.agg(avg('a cast DecimalType(10, 2))), + decimalData.agg(avg($"a" cast DecimalType(10, 2))), Row(new java.math.BigDecimal(2))) // non-partial checkAnswer( - decimalData.agg(avg('a cast DecimalType(10, 2)), sumDistinct('a cast DecimalType(10, 2))), + decimalData.agg( + avg($"a" cast DecimalType(10, 2)), 
sumDistinct($"a" cast DecimalType(10, 2))), Row(new java.math.BigDecimal(2), new java.math.BigDecimal(6)) :: Nil) } test("null average") { checkAnswer( - testData3.agg(avg('b)), + testData3.agg(avg($"b")), Row(2.0)) checkAnswer( - testData3.agg(avg('b), countDistinct('b)), + testData3.agg(avg($"b"), countDistinct($"b")), Row(2.0, 1)) checkAnswer( - testData3.agg(avg('b), sumDistinct('b)), // non-partial + testData3.agg(avg($"b"), sumDistinct($"b")), // non-partial Row(2.0, 2.0)) } test("zero average") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( - emptyTableData.agg(avg('a)), + emptyTableData.agg(avg($"a")), Row(null)) checkAnswer( - emptyTableData.agg(avg('a), sumDistinct('b)), // non-partial + emptyTableData.agg(avg($"a"), sumDistinct($"b")), // non-partial Row(null, null)) } @@ -343,28 +348,29 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { assert(testData2.count() === testData2.rdd.map(_ => 1).count()) checkAnswer( - testData2.agg(count('a), sumDistinct('a)), // non-partial + testData2.agg(count($"a"), sumDistinct($"a")), // non-partial Row(6, 6.0)) } test("null count") { checkAnswer( - testData3.groupBy('a).agg(count('b)), + testData3.groupBy($"a").agg(count($"b")), Seq(Row(1, 0), Row(2, 1)) ) checkAnswer( - testData3.groupBy('a).agg(count('a + 'b)), + testData3.groupBy($"a").agg(count($"a" + $"b")), Seq(Row(1, 0), Row(2, 1)) ) checkAnswer( - testData3.agg(count('a), count('b), count(lit(1)), countDistinct('a), countDistinct('b)), + testData3.agg( + count($"a"), count($"b"), count(lit(1)), countDistinct($"a"), countDistinct($"b")), Row(2, 1, 2, 2, 1) ) checkAnswer( - testData3.agg(count('b), countDistinct('b), sumDistinct('b)), // non-partial + testData3.agg(count($"b"), countDistinct($"b"), sumDistinct($"b")), // non-partial Row(1, 1, 2) ) } @@ -379,17 +385,17 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { .toDF("key1", "key2", "key3") checkAnswer( - 
df1.agg(countDistinct('key1, 'key2)), + df1.agg(countDistinct($"key1", $"key2")), Row(3) ) checkAnswer( - df1.agg(countDistinct('key1, 'key2, 'key3)), + df1.agg(countDistinct($"key1", $"key2", $"key3")), Row(3) ) checkAnswer( - df1.groupBy('key1).agg(countDistinct('key2, 'key3)), + df1.groupBy($"key1").agg(countDistinct($"key2", $"key3")), Seq(Row("a", 2), Row("x", 1)) ) } @@ -397,14 +403,14 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("zero count") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( - emptyTableData.agg(count('a), sumDistinct('a)), // non-partial + emptyTableData.agg(count($"a"), sumDistinct($"a")), // non-partial Row(0, null)) } test("stddev") { val testData2ADev = math.sqrt(4.0 / 5.0) checkAnswer( - testData2.agg(stddev('a), stddev_pop('a), stddev_samp('a)), + testData2.agg(stddev($"a"), stddev_pop($"a"), stddev_samp($"a")), Row(testData2ADev, math.sqrt(4 / 6.0), testData2ADev)) checkAnswer( testData2.agg(stddev("a"), stddev_pop("a"), stddev_samp("a")), @@ -414,47 +420,47 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("zero stddev") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( - emptyTableData.agg(stddev('a), stddev_pop('a), stddev_samp('a)), + emptyTableData.agg(stddev($"a"), stddev_pop($"a"), stddev_samp($"a")), Row(null, null, null)) } test("zero sum") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( - emptyTableData.agg(sum('a)), + emptyTableData.agg(sum($"a")), Row(null)) } test("zero sum distinct") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") checkAnswer( - emptyTableData.agg(sumDistinct('a)), + emptyTableData.agg(sumDistinct($"a")), Row(null)) } test("moments") { - val sparkVariance = testData2.agg(variance('a)) + val sparkVariance = testData2.agg(variance($"a")) checkAggregatesWithTol(sparkVariance, Row(4.0 / 5.0), absTol) - val sparkVariancePop = testData2.agg(var_pop('a)) + val 
sparkVariancePop = testData2.agg(var_pop($"a")) checkAggregatesWithTol(sparkVariancePop, Row(4.0 / 6.0), absTol) - val sparkVarianceSamp = testData2.agg(var_samp('a)) + val sparkVarianceSamp = testData2.agg(var_samp($"a")) checkAggregatesWithTol(sparkVarianceSamp, Row(4.0 / 5.0), absTol) - val sparkSkewness = testData2.agg(skewness('a)) + val sparkSkewness = testData2.agg(skewness($"a")) checkAggregatesWithTol(sparkSkewness, Row(0.0), absTol) - val sparkKurtosis = testData2.agg(kurtosis('a)) + val sparkKurtosis = testData2.agg(kurtosis($"a")) checkAggregatesWithTol(sparkKurtosis, Row(-1.5), absTol) } test("zero moments") { val input = Seq((1, 2)).toDF("a", "b") checkAnswer( - input.agg(stddev('a), stddev_samp('a), stddev_pop('a), variance('a), - var_samp('a), var_pop('a), skewness('a), kurtosis('a)), + input.agg(stddev($"a"), stddev_samp($"a"), stddev_pop($"a"), variance($"a"), + var_samp($"a"), var_pop($"a"), skewness($"a"), kurtosis($"a")), Row(Double.NaN, Double.NaN, 0.0, Double.NaN, Double.NaN, 0.0, Double.NaN, Double.NaN)) @@ -474,8 +480,8 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("null moments") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") - checkAnswer( - emptyTableData.agg(variance('a), var_samp('a), var_pop('a), skewness('a), kurtosis('a)), + checkAnswer(emptyTableData.agg( + variance($"a"), var_samp($"a"), var_pop($"a"), skewness($"a"), kurtosis($"a")), Row(null, null, null, null, null)) checkAnswer( @@ -546,6 +552,14 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { ) } + test("collect functions should be able to cast to array type with no null values") { + val df = Seq(1, 2).toDF("a") + checkAnswer(df.select(collect_list("a") cast ArrayType(IntegerType, false)), + Seq(Row(Seq(1, 2)))) + checkAnswer(df.select(collect_set("a") cast ArrayType(FloatType, false)), + Seq(Row(Seq(1.0, 2.0)))) + } + test("SPARK-14664: Decimal sum/avg over window should work.") { checkAnswer( 
spark.sql("select sum(a) over () from values 1.0, 2.0, 3.0 T(a)"), @@ -557,7 +571,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("SQL decimal test (used for catching certain decimal handling bugs in aggregates)") { checkAnswer( - decimalData.groupBy('a cast DecimalType(10, 2)).agg(avg('b cast DecimalType(10, 2))), + decimalData.groupBy($"a" cast DecimalType(10, 2)).agg(avg($"b" cast DecimalType(10, 2))), Seq(Row(new java.math.BigDecimal(1), new java.math.BigDecimal("1.5")), Row(new java.math.BigDecimal(2), new java.math.BigDecimal("1.5")), Row(new java.math.BigDecimal(3), new java.math.BigDecimal("1.5")))) @@ -607,26 +621,27 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { // test case for HashAggregate val hashAggDF = df.groupBy("x").agg(c, sum("y")) + hashAggDF.collect() val hashAggPlan = hashAggDF.queryExecution.executedPlan if (wholeStage) { - assert(hashAggPlan.find { + assert(find(hashAggPlan) { case WholeStageCodegenExec(_: HashAggregateExec) => true case _ => false }.isDefined) } else { - assert(hashAggPlan.isInstanceOf[HashAggregateExec]) + assert(stripAQEPlan(hashAggPlan).isInstanceOf[HashAggregateExec]) } - hashAggDF.collect() // test case for ObjectHashAggregate and SortAggregate val objHashAggOrSortAggDF = df.groupBy("x").agg(c, collect_list("y")) - val objHashAggOrSortAggPlan = objHashAggOrSortAggDF.queryExecution.executedPlan + objHashAggOrSortAggDF.collect() + val objHashAggOrSortAggPlan = + stripAQEPlan(objHashAggOrSortAggDF.queryExecution.executedPlan) if (useObjectHashAgg) { assert(objHashAggOrSortAggPlan.isInstanceOf[ObjectHashAggregateExec]) } else { assert(objHashAggOrSortAggPlan.isInstanceOf[SortAggregateExec]) } - objHashAggOrSortAggDF.collect() } } } @@ -644,7 +659,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { testData2.groupBy(lit(3), lit(4)).agg(lit(6), lit(7), sum("b")), Seq(Row(3, 4, 6, 7, 9))) checkAnswer( - testData2.groupBy(lit(3), 
lit(4)).agg(lit(6), 'b, sum("b")), + testData2.groupBy(lit(3), lit(4)).agg(lit(6), $"b", sum("b")), Seq(Row(3, 4, 6, 1, 3), Row(3, 4, 6, 2, 6))) checkAnswer( @@ -667,17 +682,17 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { .groupBy("a").agg(collect_list("f").as("g")) val aggPlan = objHashAggDF.queryExecution.executedPlan - val sortAggPlans = aggPlan.collect { + val sortAggPlans = collect(aggPlan) { case sortAgg: SortAggregateExec => sortAgg } assert(sortAggPlans.isEmpty) - val objHashAggPlans = aggPlan.collect { + val objHashAggPlans = collect(aggPlan) { case objHashAgg: ObjectHashAggregateExec => objHashAgg } assert(objHashAggPlans.nonEmpty) - val exchangePlans = aggPlan.collect { + val exchangePlans = collect(aggPlan) { case shuffle: ShuffleExchangeExec => shuffle } assert(exchangePlans.length == 1) @@ -707,14 +722,14 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { assert(thrownException.message.contains("not allowed to use a window function")) } - checkWindowError(testData2.select(min(avg('b).over(Window.partitionBy('a))))) - checkWindowError(testData2.agg(sum('b), max(rank().over(Window.orderBy('a))))) - checkWindowError(testData2.groupBy('a).agg(sum('b), max(rank().over(Window.orderBy('b))))) - checkWindowError(testData2.groupBy('a).agg(max(sum(sum('b)).over(Window.orderBy('a))))) - checkWindowError( - testData2.groupBy('a).agg(sum('b).as("s"), max(count("*").over())).where('s === 3)) - checkAnswer( - testData2.groupBy('a).agg(max('b), sum('b).as("s"), count("*").over()).where('s === 3), + checkWindowError(testData2.select(min(avg($"b").over(Window.partitionBy($"a"))))) + checkWindowError(testData2.agg(sum($"b"), max(rank().over(Window.orderBy($"a"))))) + checkWindowError(testData2.groupBy($"a").agg(sum($"b"), max(rank().over(Window.orderBy($"b"))))) + checkWindowError(testData2.groupBy($"a").agg(max(sum(sum($"b")).over(Window.orderBy($"a"))))) + checkWindowError(testData2.groupBy($"a").agg( + 
sum($"b").as("s"), max(count("*").over())).where($"s" === 3)) + checkAnswer(testData2.groupBy($"a").agg( + max($"b"), sum($"b").as("s"), count("*").over()).where($"s" === 3), Row(1, 2, 3, 3) :: Row(2, 2, 3, 3) :: Row(3, 2, 3, 3) :: Nil) checkWindowError(sql("SELECT MIN(AVG(b) OVER(PARTITION BY a)) FROM testData2")) @@ -730,7 +745,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { test("SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs should not fail") { // Checks if these raise no exception - assert(testData.groupBy('key).toString.contains( + assert(testData.groupBy($"key").toString.contains( "[grouping expressions: [key], value: [key: int, value: string], type: GroupBy]")) assert(testData.groupBy(col("key")).toString.contains( "[grouping expressions: [key], value: [key: int, value: string], type: GroupBy]")) @@ -942,4 +957,17 @@ class DataFrameAggregateSuite extends QueryTest with SharedSparkSession { assert(error.message.contains("function count_if requires boolean type")) } } + + test("calendar interval agg support hash aggregate") { + val df1 = Seq((1, "1 day"), (2, "2 day"), (3, "3 day"), (3, null)).toDF("a", "b") + val df2 = df1.select(avg($"b" cast CalendarIntervalType)) + checkAnswer(df2, Row(new CalendarInterval(0, 2, 0)) :: Nil) + assert(find(df2.queryExecution.executedPlan)(_.isInstanceOf[HashAggregateExec]).isDefined) + val df3 = df1.groupBy($"a").agg(avg($"b" cast CalendarIntervalType)) + checkAnswer(df3, + Row(1, new CalendarInterval(0, 1, 0)) :: + Row(2, new CalendarInterval(0, 2, 0)) :: + Row(3, new CalendarInterval(0, 3, 0)) :: Nil) + assert(find(df3.queryExecution.executedPlan)(_.isInstanceOf[HashAggregateExec]).isDefined) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala index e9179a39d3b6d..4f25642906628 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameComplexTypeSuite.scala @@ -18,8 +18,12 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.DefinedByConstructorParams +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.expressions.objects.MapObjects import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.ArrayType /** * A test suite to test DataFrame/SQL functionalities with complex types (i.e. array, struct, map). @@ -64,6 +68,24 @@ class DataFrameComplexTypeSuite extends QueryTest with SharedSparkSession { val ds100_5 = Seq(S100_5()).toDS() ds100_5.rdd.count } + + test("SPARK-29503 nest unsafe struct inside safe array") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { + val df = spark.sparkContext.parallelize(Seq(Seq(1, 2, 3))).toDF("items") + + // items: Seq[Int] => items.map { item => Seq(Struct(item)) } + val result = df.select( + new Column(MapObjects( + (item: Expression) => array(struct(new Column(item))).expr, + $"items".expr, + df.schema("items").dataType.asInstanceOf[ArrayType].elementType + )) as "items" + ).collect() + + assert(result.size === 1) + assert(result === Row(Seq(Seq(Row(1)), Seq(Row(2)), Seq(Row(3)))) :: Nil) + } + } } class S100( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 7d044638db571..f7531ea446015 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -278,15 +278,15 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { test("pmod") { val intData = Seq((7, 3), (-7, 3)).toDF("a", "b") 
checkAnswer( - intData.select(pmod('a, 'b)), + intData.select(pmod($"a", $"b")), Seq(Row(1), Row(2)) ) checkAnswer( - intData.select(pmod('a, lit(3))), + intData.select(pmod($"a", lit(3))), Seq(Row(1), Row(2)) ) checkAnswer( - intData.select(pmod(lit(-7), 'b)), + intData.select(pmod(lit(-7), $"b")), Seq(Row(2), Row(2)) ) checkAnswer( @@ -303,7 +303,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) val doubleData = Seq((7.2, 4.1)).toDF("a", "b") checkAnswer( - doubleData.select(pmod('a, 'b)), + doubleData.select(pmod($"a", $"b")), Seq(Row(3.1000000000000005)) // same as hive ) checkAnswer( @@ -312,6 +312,86 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) } + test("array_sort with lambda functions") { + + spark.udf.register("fAsc", (x: Int, y: Int) => { + if (x < y) -1 + else if (x == y) 0 + else 1 + }) + + spark.udf.register("fDesc", (x: Int, y: Int) => { + if (x < y) 1 + else if (x == y) 0 + else -1 + }) + + spark.udf.register("fString", (x: String, y: String) => { + if (x == null && y == null) 0 + else if (x == null) 1 + else if (y == null) -1 + else if (x < y) 1 + else if (x == y) 0 + else -1 + }) + + spark.udf.register("fStringLength", (x: String, y: String) => { + if (x == null && y == null) 0 + else if (x == null) 1 + else if (y == null) -1 + else if (x.length < y.length) -1 + else if (x.length == y.length) 0 + else 1 + }) + + val df1 = Seq(Array[Int](3, 2, 5, 1, 2)).toDF("a") + checkAnswer( + df1.selectExpr("array_sort(a, (x, y) -> fAsc(x, y))"), + Seq( + Row(Seq(1, 2, 2, 3, 5))) + ) + + checkAnswer( + df1.selectExpr("array_sort(a, (x, y) -> fDesc(x, y))"), + Seq( + Row(Seq(5, 3, 2, 2, 1))) + ) + + val df2 = Seq(Array[String]("bc", "ab", "dc")).toDF("a") + checkAnswer( + df2.selectExpr("array_sort(a, (x, y) -> fString(x, y))"), + Seq( + Row(Seq("dc", "bc", "ab"))) + ) + + val df3 = Seq(Array[String]("a", "abcd", "abc")).toDF("a") + checkAnswer( + df3.selectExpr("array_sort(a, (x, y) -> 
fStringLength(x, y))"), + Seq( + Row(Seq("a", "abc", "abcd"))) + ) + + val df4 = Seq((Array[Array[Int]](Array(2, 3, 1), Array(4, 2, 1, 4), + Array(1, 2)), "x")).toDF("a", "b") + checkAnswer( + df4.selectExpr("array_sort(a, (x, y) -> fAsc(cardinality(x), cardinality(y)))"), + Seq( + Row(Seq[Seq[Int]](Seq(1, 2), Seq(2, 3, 1), Seq(4, 2, 1, 4)))) + ) + + val df5 = Seq(Array[String]("bc", null, "ab", "dc")).toDF("a") + checkAnswer( + df5.selectExpr("array_sort(a, (x, y) -> fString(x, y))"), + Seq( + Row(Seq("dc", "bc", "ab", null))) + ) + + spark.sql("drop temporary function fAsc") + spark.sql("drop temporary function fDesc") + spark.sql("drop temporary function fString") + spark.sql("drop temporary function fStringLength") + } + test("sort_array/array_sort functions") { val df = Seq( (Array[Int](2, 1, 3), Array("b", "c", "a")), @@ -383,7 +463,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(intercept[AnalysisException] { df3.selectExpr("array_sort(a)").collect() - }.getMessage().contains("only supports array input")) + }.getMessage().contains("argument 1 requires array type, however, '`a`' is of string type")) } def testSizeOfArray(sizeOfNull: Any): Unit = { @@ -520,7 +600,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) def testPrimitiveType(): Unit = { - checkAnswer(idf.select(map_entries('m)), iExpected) + checkAnswer(idf.select(map_entries($"m")), iExpected) checkAnswer(idf.selectExpr("map_entries(m)"), iExpected) checkAnswer(idf.selectExpr("map_entries(map(1, null, 2, null))"), Seq.fill(iExpected.length)(Row(Seq(Row(1, null), Row(2, null))))) @@ -547,7 +627,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) def testNonPrimitiveType(): Unit = { - checkAnswer(sdf.select(map_entries('m)), sExpected) + checkAnswer(sdf.select(map_entries($"m")), sExpected) checkAnswer(sdf.selectExpr("map_entries(m)"), sExpected) } @@ -572,7 +652,7 @@ class DataFrameFunctionsSuite extends 
QueryTest with SharedSparkSession { ) checkAnswer(df1.selectExpr("map_concat(map1, map2)"), expected1a) - checkAnswer(df1.select(map_concat('map1, 'map2)), expected1a) + checkAnswer(df1.select(map_concat($"map1", $"map2")), expected1a) val expected1b = Seq( Row(Map(1 -> 100, 2 -> 200)), @@ -581,7 +661,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) checkAnswer(df1.selectExpr("map_concat(map1)"), expected1b) - checkAnswer(df1.select(map_concat('map1)), expected1b) + checkAnswer(df1.select(map_concat($"map1")), expected1b) val df2 = Seq( ( @@ -613,7 +693,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) checkAnswer(df3.selectExpr("map_concat(map1, map2)"), expected3) - checkAnswer(df3.select(map_concat('map1, 'map2)), expected3) + checkAnswer(df3.select(map_concat($"map1", $"map2")), expected3) val expectedMessage1 = "input to function map_concat should all be the same type" @@ -622,7 +702,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { }.getMessage().contains(expectedMessage1)) assert(intercept[AnalysisException] { - df2.select(map_concat('map1, 'map2)).collect() + df2.select(map_concat($"map1", $"map2")).collect() }.getMessage().contains(expectedMessage1)) val expectedMessage2 = "input to function map_concat should all be of type map" @@ -632,7 +712,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { }.getMessage().contains(expectedMessage2)) assert(intercept[AnalysisException] { - df2.select(map_concat('map1, lit(12))).collect() + df2.select(map_concat($"map1", lit(12))).collect() }.getMessage().contains(expectedMessage2)) } @@ -651,7 +731,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(null)) def testPrimitiveType(): Unit = { - checkAnswer(idf.select(map_from_entries('a)), iExpected) + checkAnswer(idf.select(map_from_entries($"a")), iExpected) checkAnswer(idf.selectExpr("map_from_entries(a)"), iExpected) 
checkAnswer(idf.selectExpr("map_from_entries(array(struct(1, null), struct(2, null)))"), Seq.fill(iExpected.length)(Row(Map(1 -> null, 2 -> null)))) @@ -679,7 +759,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(null)) def testNonPrimitiveType(): Unit = { - checkAnswer(sdf.select(map_from_entries('a)), sExpected) + checkAnswer(sdf.select(map_from_entries($"a")), sExpected) checkAnswer(sdf.selectExpr("map_from_entries(a)"), sExpected) } @@ -770,7 +850,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { val errorMsg1 = s""" |Input to function array_contains should have been array followed by a - |value with same element type, but it's [array, decimal(29,29)]. + |value with same element type, but it's [array, decimal(38,29)]. """.stripMargin.replace("\n", " ").trim() assert(e1.message.contains(errorMsg1)) @@ -785,6 +865,23 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(e2.message.contains(errorMsg2)) } + test("SPARK-29600: ArrayContains function may return incorrect result for DecimalType") { + checkAnswer( + sql("select array_contains(array(1.10), 1.1)"), + Seq(Row(true)) + ) + + checkAnswer( + sql("SELECT array_contains(array(1.1), 1.10)"), + Seq(Row(true)) + ) + + checkAnswer( + sql("SELECT array_contains(array(1.11), 1.1)"), + Seq(Row(false)) + ) + } + test("arrays_overlap function") { val df = Seq( (Seq[Option[Int]](Some(1), Some(2)), Seq[Option[Int]](Some(-1), Some(10))), @@ -899,8 +996,10 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } test("sequence") { - checkAnswer(Seq((-2, 2)).toDF().select(sequence('_1, '_2)), Seq(Row(Array(-2, -1, 0, 1, 2)))) - checkAnswer(Seq((7, 2, -2)).toDF().select(sequence('_1, '_2, '_3)), Seq(Row(Array(7, 5, 3)))) + checkAnswer(Seq((-2, 2)).toDF().select(sequence($"_1", $"_2")), + Seq(Row(Array(-2, -1, 0, 1, 2)))) + checkAnswer(Seq((7, 2, -2)).toDF().select(sequence($"_1", $"_2", $"_3")), + 
Seq(Row(Array(7, 5, 3)))) checkAnswer( spark.sql("select sequence(" + @@ -926,7 +1025,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { // test type coercion checkAnswer( - Seq((1.toByte, 3L, 1)).toDF().select(sequence('_1, '_2, '_3)), + Seq((1.toByte, 3L, 1)).toDF().select(sequence($"_1", $"_2", $"_3")), Seq(Row(Array(1L, 2L, 3L)))) checkAnswer( @@ -954,9 +1053,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { test("reverse function - string") { val oneRowDF = Seq(("Spark", 3215)).toDF("s", "i") def testString(): Unit = { - checkAnswer(oneRowDF.select(reverse('s)), Seq(Row("krapS"))) + checkAnswer(oneRowDF.select(reverse($"s")), Seq(Row("krapS"))) checkAnswer(oneRowDF.selectExpr("reverse(s)"), Seq(Row("krapS"))) - checkAnswer(oneRowDF.select(reverse('i)), Seq(Row("5123"))) + checkAnswer(oneRowDF.select(reverse($"i")), Seq(Row("5123"))) checkAnswer(oneRowDF.selectExpr("reverse(i)"), Seq(Row("5123"))) checkAnswer(oneRowDF.selectExpr("reverse(null)"), Seq(Row(null))) } @@ -978,7 +1077,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { def testArrayOfPrimitiveTypeNotContainsNull(): Unit = { checkAnswer( - idfNotContainsNull.select(reverse('i)), + idfNotContainsNull.select(reverse($"i")), Seq(Row(Seq(7, 8, 9, 1)), Row(Seq(2, 7, 9, 8, 5)), Row(Seq.empty), Row(null)) ) checkAnswer( @@ -1004,7 +1103,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { def testArrayOfPrimitiveTypeContainsNull(): Unit = { checkAnswer( - idfContainsNull.select(reverse('i)), + idfContainsNull.select(reverse($"i")), Seq(Row(Seq(7, null, 8, 9, 1)), Row(Seq(2, 7, 9, 8, 5, null)), Row(Seq.empty), Row(null)) ) checkAnswer( @@ -1030,7 +1129,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { def testArrayOfNonPrimitiveType(): Unit = { checkAnswer( - sdf.select(reverse('s)), + sdf.select(reverse($"s")), Seq(Row(Seq("b", "a", "c")), Row(Seq(null, "c", null, "b")), 
Row(Seq.empty), Row(null)) ) checkAnswer( @@ -1735,7 +1834,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("i") def testArrayOfPrimitiveTypeNotContainsNull(): Unit = { - checkShuffleResult(idfNotContainsNull.select(shuffle('i))) + checkShuffleResult(idfNotContainsNull.select(shuffle($"i"))) checkShuffleResult(idfNotContainsNull.selectExpr("shuffle(i)")) } @@ -1755,7 +1854,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("i") def testArrayOfPrimitiveTypeContainsNull(): Unit = { - checkShuffleResult(idfContainsNull.select(shuffle('i))) + checkShuffleResult(idfContainsNull.select(shuffle($"i"))) checkShuffleResult(idfContainsNull.selectExpr("shuffle(i)")) } @@ -1775,7 +1874,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("s") def testNonPrimitiveType(): Unit = { - checkShuffleResult(sdf.select(shuffle('s))) + checkShuffleResult(sdf.select(shuffle($"s"))) checkShuffleResult(sdf.selectExpr("shuffle(s)")) } @@ -1930,6 +2029,18 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq(5, 9, 11, 10, 6)), Row(Seq.empty), Row(null))) + checkAnswer(df.select(transform(col("i"), x => x + 1)), + Seq( + Row(Seq(2, 10, 9, 8)), + Row(Seq(6, 9, 10, 8, 3)), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(transform(col("i"), (x, i) => x + i)), + Seq( + Row(Seq(1, 10, 10, 10)), + Row(Seq(5, 9, 11, 10, 6)), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -1960,6 +2071,18 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq(5, null, 10, 12, 11, 7)), Row(Seq.empty), Row(null))) + checkAnswer(df.select(transform(col("i"), x => x + 1)), + Seq( + Row(Seq(2, 10, 9, null, 8)), + Row(Seq(6, null, 9, 10, 8, 3)), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(transform(col("i"), (x, i) => x + i)), + Seq( + Row(Seq(1, 10, 10, null, 11)), + 
Row(Seq(5, null, 10, 12, 11, 7)), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -1990,6 +2113,18 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq("b0", null, "c2", null)), Row(Seq.empty), Row(null))) + checkAnswer(df.select(transform(col("s"), x => concat(x, x))), + Seq( + Row(Seq("cc", "aa", "bb")), + Row(Seq("bb", null, "cc", null)), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(transform(col("s"), (x, i) => concat(x, i))), + Seq( + Row(Seq("c0", "a1", "b2")), + Row(Seq("b0", null, "c2", null)), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2034,6 +2169,32 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Seq("b", null, "c", null, null))), Row(Seq.empty), Row(null))) + checkAnswer(df.select(transform(col("arg"), arg => arg)), + Seq( + Row(Seq("c", "a", "b")), + Row(Seq("b", null, "c", null)), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(transform(col("arg"), _ => col("arg"))), + Seq( + Row(Seq(Seq("c", "a", "b"), Seq("c", "a", "b"), Seq("c", "a", "b"))), + Row(Seq( + Seq("b", null, "c", null), + Seq("b", null, "c", null), + Seq("b", null, "c", null), + Seq("b", null, "c", null))), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(transform(col("arg"), x => concat(col("arg"), array(x)))), + Seq( + Row(Seq(Seq("c", "a", "b", "c"), Seq("c", "a", "b", "a"), Seq("c", "a", "b", "b"))), + Row(Seq( + Seq("b", null, "c", null, "b"), + Seq("b", null, "c", null, null), + Seq("b", null, "c", null, "c"), + Seq("b", null, "c", null, null))), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2080,6 +2241,14 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Map(), Map(1 -> -1, 2 -> -2, 3 -> -3)), Row(Map(1 -> 10), Map(3 -> -3)))) + checkAnswer(dfInts.select( + 
map_filter(col("m"), (k, v) => k * 10 === v), + map_filter(col("m"), (k, v) => k === (v * -1))), + Seq( + Row(Map(1 -> 10, 2 -> 20, 3 -> 30), Map()), + Row(Map(), Map(1 -> -1, 2 -> -2, 3 -> -3)), + Row(Map(1 -> 10), Map(3 -> -3)))) + val dfComplex = Seq( Map(1 -> Seq(Some(1)), 2 -> Seq(Some(1), Some(2)), 3 -> Seq(Some(1), Some(2), Some(3))), Map(1 -> null, 2 -> Seq(Some(-2), Some(-2)), 3 -> Seq[Option[Int]](None))).toDF("m") @@ -2090,6 +2259,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Map(1 -> Seq(1)), Map(1 -> Seq(1), 2 -> Seq(1, 2), 3 -> Seq(1, 2, 3))), Row(Map(), Map(2 -> Seq(-2, -2))))) + checkAnswer(dfComplex.select( + map_filter(col("m"), (k, v) => k === element_at(v, 1)), + map_filter(col("m"), (k, v) => k === size(v))), + Seq( + Row(Map(1 -> Seq(1)), Map(1 -> Seq(1), 2 -> Seq(1, 2), 3 -> Seq(1, 2, 3))), + Row(Map(), Map(2 -> Seq(-2, -2))))) + // Invalid use cases val df = Seq( (Map(1 -> "a"), 1), @@ -2112,6 +2288,11 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } assert(ex3.getMessage.contains("data type mismatch: argument 1 requires map type")) + val ex3a = intercept[AnalysisException] { + df.select(map_filter(col("i"), (k, v) => k > v)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 1 requires map type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("map_filter(a, (k, v) -> k > v)") } @@ -2133,6 +2314,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq(8, 2)), Row(Seq.empty), Row(null))) + checkAnswer(df.select(filter(col("i"), _ % 2 === 0)), + Seq( + Row(Seq(8)), + Row(Seq(8, 2)), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2157,6 +2344,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq(8, 2)), Row(Seq.empty), Row(null))) + checkAnswer(df.select(filter(col("i"), _ % 2 === 0)), + Seq( + Row(Seq(8)), + Row(Seq(8, 
2)), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2181,6 +2374,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq("b", "c")), Row(Seq.empty), Row(null))) + checkAnswer(df.select(filter(col("s"), x => x.isNotNull)), + Seq( + Row(Seq("c", "a", "b")), + Row(Seq("b", "c")), + Row(Seq.empty), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2190,6 +2389,36 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { testNonPrimitiveType() } + test("filter function - index argument") { + val df = Seq( + Seq("c", "a", "b"), + Seq("b", null, "c", null), + Seq.empty, + null + ).toDF("s") + + def testIndexArgument(): Unit = { + checkAnswer(df.selectExpr("filter(s, (x, i) -> i % 2 == 0)"), + Seq( + Row(Seq("c", "b")), + Row(Seq("b", "c")), + Row(Seq.empty), + Row(null))) + checkAnswer(df.select(filter(col("s"), (x, i) => i % 2 === 0)), + Seq( + Row(Seq("c", "b")), + Row(Seq("b", "c")), + Row(Seq.empty), + Row(null))) + } + + // Test with local relation, the Project will be evaluated without codegen + testIndexArgument() + // Test with cached relation, the Project will be evaluated with codegen + df.cache() + testIndexArgument() + } + test("filter function - invalid") { val df = Seq( (Seq("c", "a", "b"), 1), @@ -2199,20 +2428,30 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("s", "i") val ex1 = intercept[AnalysisException] { - df.selectExpr("filter(s, (x, y) -> x + y)") + df.selectExpr("filter(s, (x, y, z) -> x + y)") } - assert(ex1.getMessage.contains("The number of lambda function arguments '2' does not match")) + assert(ex1.getMessage.contains("The number of lambda function arguments '3' does not match")) val ex2 = intercept[AnalysisException] { df.selectExpr("filter(i, x -> x)") } assert(ex2.getMessage.contains("data type mismatch: argument 1 requires array type")) + val 
ex2a = intercept[AnalysisException] { + df.select(filter(col("i"), x => x)) + } + assert(ex2a.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex3 = intercept[AnalysisException] { df.selectExpr("filter(s, x -> x)") } assert(ex3.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex3a = intercept[AnalysisException] { + df.select(filter(col("s"), x => x)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("filter(a, x -> x)") } @@ -2234,6 +2473,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(false), Row(false), Row(null))) + checkAnswer(df.select(exists(col("i"), _ % 2 === 0)), + Seq( + Row(true), + Row(false), + Row(false), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2260,6 +2505,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(null), Row(false), Row(null))) + checkAnswer(df.select(exists(col("i"), _ % 2 === 0)), + Seq( + Row(true), + Row(false), + Row(null), + Row(false), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2284,6 +2536,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(true), Row(false), Row(null))) + checkAnswer(df.select(exists(col("s"), x => x.isNull)), + Seq( + Row(false), + Row(true), + Row(false), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2311,11 +2569,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } assert(ex2.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex2a = intercept[AnalysisException] { + df.select(exists(col("i"), x => x)) + } + assert(ex2.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex3 = intercept[AnalysisException] { 
df.selectExpr("exists(s, x -> x)") } assert(ex3.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex3a = intercept[AnalysisException] { + df.select(exists(df("s"), x => x)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("exists(a, x -> x)") } @@ -2337,6 +2605,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(true), Row(true), Row(null))) + checkAnswer(df.select(forall(col("i"), x => x % 2 === 0)), + Seq( + Row(false), + Row(true), + Row(true), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2363,6 +2637,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(true), Row(true), Row(null))) + checkAnswer(df.select(forall(col("i"), x => (x % 2 === 0) || x.isNull)), + Seq( + Row(false), + Row(true), + Row(true), + Row(true), + Row(null))) checkAnswer(df.selectExpr("forall(i, x -> x % 2 == 0)"), Seq( Row(false), @@ -2370,6 +2651,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(true), Row(true), Row(null))) + checkAnswer(df.select(forall(col("i"), x => x % 2 === 0)), + Seq( + Row(false), + Row(null), + Row(true), + Row(true), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2394,6 +2682,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(true), Row(true), Row(null))) + checkAnswer(df.select(forall(col("s"), _.isNull)), + Seq( + Row(false), + Row(true), + Row(true), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2421,15 +2715,30 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } assert(ex2.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex2a = intercept[AnalysisException] { + df.select(forall(col("i"), x => x)) + 
} + assert(ex2a.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex3 = intercept[AnalysisException] { df.selectExpr("forall(s, x -> x)") } assert(ex3.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex3a = intercept[AnalysisException] { + df.select(forall(col("s"), x => x)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 2 requires boolean type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("forall(a, x -> x)") } assert(ex4.getMessage.contains("cannot resolve '`a`'")) + + val ex4a = intercept[AnalysisException] { + df.select(forall(col("a"), x => x)) + } + assert(ex4a.getMessage.contains("cannot resolve '`a`'")) } test("aggregate function - array for primitive type not containing null") { @@ -2453,6 +2762,18 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(310), Row(0), Row(null))) + checkAnswer(df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x)), + Seq( + Row(25), + Row(31), + Row(0), + Row(null))) + checkAnswer(df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x, _ * 10)), + Seq( + Row(250), + Row(310), + Row(0), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2484,6 +2805,20 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(0), Row(0), Row(null))) + checkAnswer(df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x)), + Seq( + Row(25), + Row(null), + Row(0), + Row(null))) + checkAnswer( + df.select( + aggregate(col("i"), lit(0), (acc, x) => acc + x, acc => coalesce(acc, lit(0)) * 10)), + Seq( + Row(250), + Row(0), + Row(0), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2515,6 +2850,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(""), Row("c"), Row(null))) + checkAnswer(df.select(aggregate(col("ss"), col("s"), (acc, x) => concat(acc, x))), + Seq( + 
Row("acab"), + Row(null), + Row("c"), + Row(null))) + checkAnswer( + df.select( + aggregate(col("ss"), col("s"), (acc, x) => concat(acc, x), + acc => coalesce(acc, lit("")))), + Seq( + Row("acab"), + Row(""), + Row("c"), + Row(null))) } // Test with local relation, the Project will be evaluated without codegen @@ -2547,11 +2897,21 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } assert(ex3.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex3a = intercept[AnalysisException] { + df.select(aggregate(col("i"), lit(0), (acc, x) => x)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("aggregate(s, 0, (acc, x) -> x)") } assert(ex4.getMessage.contains("data type mismatch: argument 3 requires int type")) + val ex4a = intercept[AnalysisException] { + df.select(aggregate(col("s"), lit(0), (acc, x) => x)) + } + assert(ex4a.getMessage.contains("data type mismatch: argument 3 requires int type")) + val ex5 = intercept[AnalysisException] { df.selectExpr("aggregate(a, 0, (acc, x) -> x)") } @@ -2572,6 +2932,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Map(10 -> null, 8 -> false, 4 -> null)), Row(Map(5 -> null)), Row(null))) + + checkAnswer(df.select(map_zip_with(df("m1"), df("m2"), (k, v1, v2) => k === v1 + v2)), + Seq( + Row(Map(8 -> true, 3 -> false, 6 -> true)), + Row(Map(10 -> null, 8 -> false, 4 -> null)), + Row(Map(5 -> null)), + Row(null))) } test("map_zip_with function - map of non-primitive types") { @@ -2588,6 +2955,13 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Map("b" -> Row("a", null), "c" -> Row("d", "a"), "d" -> Row(null, "k"))), Row(Map("a" -> Row("d", null))), Row(null))) + + checkAnswer(df.select(map_zip_with(col("m1"), col("m2"), (k, v1, v2) => struct(v1, v2))), + Seq( + Row(Map("z" -> Row("a", "c"), "y" -> Row("b", null), "x" 
-> Row("c", "a"))), + Row(Map("b" -> Row("a", null), "c" -> Row("d", "a"), "d" -> Row(null, "k"))), + Row(Map("a" -> Row("d", null))), + Row(null))) } test("map_zip_with function - invalid") { @@ -2606,16 +2980,32 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { assert(ex2.getMessage.contains("The input to function map_zip_with should have " + "been two maps with compatible key types")) + val ex2a = intercept[AnalysisException] { + df.select(map_zip_with(df("mis"), col("mmi"), (x, y, z) => concat(x, y, z))) + } + assert(ex2a.getMessage.contains("The input to function map_zip_with should have " + + "been two maps with compatible key types")) + val ex3 = intercept[AnalysisException] { df.selectExpr("map_zip_with(i, mis, (x, y, z) -> concat(x, y, z))") } assert(ex3.getMessage.contains("type mismatch: argument 1 requires map type")) + val ex3a = intercept[AnalysisException] { + df.select(map_zip_with(col("i"), col("mis"), (x, y, z) => concat(x, y, z))) + } + assert(ex3a.getMessage.contains("type mismatch: argument 1 requires map type")) + val ex4 = intercept[AnalysisException] { df.selectExpr("map_zip_with(mis, i, (x, y, z) -> concat(x, y, z))") } assert(ex4.getMessage.contains("type mismatch: argument 2 requires map type")) + val ex4a = intercept[AnalysisException] { + df.select(map_zip_with(col("mis"), col("i"), (x, y, z) => concat(x, y, z))) + } + assert(ex4a.getMessage.contains("type mismatch: argument 2 requires map type")) + val ex5 = intercept[AnalysisException] { df.selectExpr("map_zip_with(mmi, mmi, (x, y, z) -> x)") } @@ -2644,27 +3034,59 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(dfExample1.selectExpr("transform_keys(i, (k, v) -> k + v)"), Seq(Row(Map(2 -> 1, 18 -> 9, 16 -> 8, 14 -> 7)))) + checkAnswer(dfExample1.select(transform_keys(col("i"), (k, v) => k + v)), + Seq(Row(Map(2 -> 1, 18 -> 9, 16 -> 8, 14 -> 7)))) + checkAnswer(dfExample2.selectExpr("transform_keys(j, " + "(k, v) -> 
map_from_arrays(ARRAY(1, 2, 3), ARRAY('one', 'two', 'three'))[k])"), Seq(Row(Map("one" -> 1.0, "two" -> 1.4, "three" -> 1.7)))) + checkAnswer(dfExample2.select( + transform_keys( + col("j"), + (k, v) => element_at( + map_from_arrays( + array(lit(1), lit(2), lit(3)), + array(lit("one"), lit("two"), lit("three")) + ), + k + ) + ) + ), + Seq(Row(Map("one" -> 1.0, "two" -> 1.4, "three" -> 1.7)))) + checkAnswer(dfExample2.selectExpr("transform_keys(j, (k, v) -> CAST(v * 2 AS BIGINT) + k)"), Seq(Row(Map(3 -> 1.0, 4 -> 1.4, 6 -> 1.7)))) + checkAnswer(dfExample2.select(transform_keys(col("j"), + (k, v) => (v * 2).cast("bigint") + k)), + Seq(Row(Map(3 -> 1.0, 4 -> 1.4, 6 -> 1.7)))) + checkAnswer(dfExample2.selectExpr("transform_keys(j, (k, v) -> k + v)"), Seq(Row(Map(2.0 -> 1.0, 3.4 -> 1.4, 4.7 -> 1.7)))) + checkAnswer(dfExample2.select(transform_keys(col("j"), (k, v) => k + v)), + Seq(Row(Map(2.0 -> 1.0, 3.4 -> 1.4, 4.7 -> 1.7)))) + checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> k % 2 = 0 OR v)"), Seq(Row(Map(true -> true, true -> false)))) + checkAnswer(dfExample3.select(transform_keys(col("x"), (k, v) => k % 2 === 0 || v)), + Seq(Row(Map(true -> true, true -> false)))) + checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> if(v, 2 * k, 3 * k))"), Seq(Row(Map(50 -> true, 78 -> false)))) - checkAnswer(dfExample3.selectExpr("transform_keys(x, (k, v) -> if(v, 2 * k, 3 * k))"), + checkAnswer(dfExample3.select(transform_keys(col("x"), + (k, v) => when(v, k * 2).otherwise(k * 3))), Seq(Row(Map(50 -> true, 78 -> false)))) checkAnswer(dfExample4.selectExpr("transform_keys(y, (k, v) -> array_contains(k, 3) AND v)"), Seq(Row(Map(false -> false)))) + + checkAnswer(dfExample4.select(transform_keys(col("y"), + (k, v) => array_contains(k, lit(3)) && v)), + Seq(Row(Map(false -> false)))) } // Test with local relation, the Project will be evaluated without codegen @@ -2702,6 +3124,11 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } 
assert(ex3.getMessage.contains("Cannot use null as map key")) + val ex3a = intercept[Exception] { + dfExample1.select(transform_keys(col("i"), (k, v) => v)).show() + } + assert(ex3a.getMessage.contains("Cannot use null as map key")) + val ex4 = intercept[AnalysisException] { dfExample2.selectExpr("transform_keys(j, (k, v) -> k + 1)") } @@ -2766,6 +3193,46 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer( dfExample5.selectExpr("transform_values(c, (k, v) -> k + cardinality(v))"), Seq(Row(Map(1 -> 3)))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => k + v)), + Seq(Row(Map(1 -> 2, 9 -> 18, 8 -> 16, 7 -> 14)))) + + checkAnswer(dfExample2.select( + transform_values(col("x"), (k, v) => when(k, v).otherwise(k.cast("string")))), + Seq(Row(Map(false -> "false", true -> "def")))) + + checkAnswer(dfExample2.select(transform_values(col("x"), + (k, v) => (!k) && v === "abc")), + Seq(Row(Map(false -> true, true -> false)))) + + checkAnswer(dfExample3.select(transform_values(col("y"), (k, v) => v * v)), + Seq(Row(Map("a" -> 1, "b" -> 4, "c" -> 9)))) + + checkAnswer(dfExample3.select( + transform_values(col("y"), (k, v) => concat(k, lit(":"), v.cast("string")))), + Seq(Row(Map("a" -> "a:1", "b" -> "b:2", "c" -> "c:3")))) + + checkAnswer( + dfExample3.select(transform_values(col("y"), (k, v) => concat(k, v.cast("string")))), + Seq(Row(Map("a" -> "a1", "b" -> "b2", "c" -> "c3")))) + + val testMap = map_from_arrays( + array(lit(1), lit(2), lit(3)), + array(lit("one"), lit("two"), lit("three")) + ) + + checkAnswer( + dfExample4.select(transform_values(col("z"), + (k, v) => concat(element_at(testMap, k), lit("_"), v.cast("string")))), + Seq(Row(Map(1 -> "one_1.0", 2 -> "two_1.4", 3 ->"three_1.7")))) + + checkAnswer( + dfExample4.select(transform_values(col("z"), (k, v) => k - v)), + Seq(Row(Map(1 -> 0.0, 2 -> 0.6000000000000001, 3 -> 1.3)))) + + checkAnswer( + dfExample5.select(transform_values(col("c"), (k, v) => k + 
size(v))), + Seq(Row(Map(1 -> 3)))) } // Test with local relation, the Project will be evaluated without codegen @@ -2809,6 +3276,28 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(dfExample2.selectExpr("transform_values(j, (k, v) -> k + cast(v as BIGINT))"), Seq(Row(Map.empty[BigInt, BigInt]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), + (k, v) => lit(null).cast("int"))), + Seq(Row(Map.empty[Integer, Integer]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => k)), + Seq(Row(Map.empty[Integer, Integer]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => v)), + Seq(Row(Map.empty[Integer, Integer]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => lit(0))), + Seq(Row(Map.empty[Integer, Integer]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => lit("value"))), + Seq(Row(Map.empty[Integer, String]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => lit(true))), + Seq(Row(Map.empty[Integer, Boolean]))) + + checkAnswer(dfExample1.select(transform_values(col("i"), (k, v) => v.cast("bigint"))), + Seq(Row(Map.empty[BigInt, BigInt]))) } testEmpty() @@ -2833,6 +3322,15 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(dfExample2.selectExpr( "transform_values(b, (k, v) -> IF(v IS NULL, k + 1, k + 2))"), Seq(Row(Map(1 -> 3, 2 -> 4, 3 -> 4)))) + + checkAnswer(dfExample1.select(transform_values(col("a"), + (k, v) => lit(null).cast("int"))), + Seq(Row(Map[Int, Integer](1 -> null, 2 -> null, 3 -> null, 4 -> null)))) + + checkAnswer(dfExample2.select( + transform_values(col("b"), (k, v) => when(v.isNull, k + 1).otherwise(k + 2)) + ), + Seq(Row(Map(1 -> 3, 2 -> 4, 3 -> 4)))) } testNullValue() @@ -2871,6 +3369,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { } assert(ex3.getMessage.contains( "data type mismatch: argument 1 requires map type")) + 
+ val ex3a = intercept[AnalysisException] { + dfExample3.select(transform_values(col("x"), (k, v) => k + 1)) + } + assert(ex3a.getMessage.contains( + "data type mismatch: argument 1 requires map type")) } testInvalidLambdaFunctions() @@ -2897,10 +3401,15 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq.empty), Row(null)) checkAnswer(df1.selectExpr("zip_with(val1, val2, (x, y) -> x + y)"), expectedValue1) + checkAnswer(df1.select(zip_with(df1("val1"), df1("val2"), (x, y) => x + y)), expectedValue1) val expectedValue2 = Seq( Row(Seq(Row(1L, 1), Row(2L, null), Row(null, 3))), Row(Seq(Row(4L, 1), Row(11L, 2), Row(null, 3)))) checkAnswer(df2.selectExpr("zip_with(val1, val2, (x, y) -> (y, x))"), expectedValue2) + checkAnswer( + df2.select(zip_with(df2("val1"), df2("val2"), (x, y) => struct(y, x))), + expectedValue2 + ) } test("arrays zip_with function - for non-primitive types") { @@ -2915,7 +3424,14 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { Row(Seq(Row("x", "a"), Row("y", null))), Row(Seq.empty), Row(null)) - checkAnswer(df.selectExpr("zip_with(val1, val2, (x, y) -> (y, x))"), expectedValue1) + checkAnswer( + df.selectExpr("zip_with(val1, val2, (x, y) -> (y, x))"), + expectedValue1 + ) + checkAnswer( + df.select(zip_with(col("val1"), col("val2"), (x, y) => struct(y, x))), + expectedValue1 + ) } test("arrays zip_with function - invalid") { @@ -2937,6 +3453,10 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { df.selectExpr("zip_with(i, a2, (acc, x) -> x)") } assert(ex3.getMessage.contains("data type mismatch: argument 1 requires array type")) + val ex3a = intercept[AnalysisException] { + df.select(zip_with(df("i"), df("a2"), (acc, x) => x)) + } + assert(ex3a.getMessage.contains("data type mismatch: argument 1 requires array type")) val ex4 = intercept[AnalysisException] { df.selectExpr("zip_with(a1, a, (acc, x) -> x)") } @@ -2979,16 +3499,6 @@ class DataFrameFunctionsSuite 
extends QueryTest with SharedSparkSession { ).foreach(assertValuesDoNotChangeAfterCoalesceOrUnion(_)) } - test("SPARK-21281 use string types by default if array and map have no argument") { - val ds = spark.range(1) - var expectedSchema = new StructType() - .add("x", ArrayType(StringType, containsNull = false), nullable = false) - assert(ds.select(array().as("x")).schema == expectedSchema) - expectedSchema = new StructType() - .add("x", MapType(StringType, StringType, valueContainsNull = false), nullable = false) - assert(ds.select(map().as("x")).schema == expectedSchema) - } - test("SPARK-21281 fails if functions have no argument") { val df = Seq(1).toDF("a") @@ -3042,6 +3552,34 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(df.select("x").filter("exists(i, x -> x % d == 0)"), Seq(Row(1))) } + + test("SPARK-29462: Empty array of NullType for array function with no arguments") { + Seq((true, StringType), (false, NullType)).foreach { + case (arrayDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + arrayDefaultToString.toString) { + val schema = spark.range(1).select(array()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[ArrayType]) + val actualType = schema.head.dataType.asInstanceOf[ArrayType].elementType + assert(actualType === expectedType) + } + } + } + + test("SPARK-30790: Empty map with NullType as key/value type for map function with no argument") { + Seq((true, StringType), (false, NullType)).foreach { + case (mapDefaultToString, expectedType) => + withSQLConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE.key -> + mapDefaultToString.toString) { + val schema = spark.range(1).select(map()).schema + assert(schema.nonEmpty && schema.head.dataType.isInstanceOf[MapType]) + val actualKeyType = schema.head.dataType.asInstanceOf[MapType].keyType + val actualValueType = schema.head.dataType.asInstanceOf[MapType].valueType + 
assert(actualKeyType === expectedType) + assert(actualValueType === expectedType) + } + } + } } object DataFrameFunctionsSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameHintSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameHintSuite.scala index b33c26a0b75a2..37dc8f1bcc7f5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameHintSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameHintSuite.scala @@ -68,5 +68,17 @@ class DataFrameHintSuite extends AnalysisTest with SharedSparkSession { check( df.hint("REPARTITION", 100), UnresolvedHint("REPARTITION", Seq(100), df.logicalPlan)) + + check( + df.hint("REPARTITION", 10, $"id".expr), + UnresolvedHint("REPARTITION", Seq(10, $"id".expr), df.logicalPlan)) + + check( + df.hint("REPARTITION_BY_RANGE", $"id".expr), + UnresolvedHint("REPARTITION_BY_RANGE", Seq($"id".expr), df.logicalPlan)) + + check( + df.hint("REPARTITION_BY_RANGE", 10, $"id".expr), + UnresolvedHint("REPARTITION_BY_RANGE", Seq(10, $"id".expr), df.logicalPlan)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala index 3a217e6e28060..c7545bcad8962 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala @@ -17,14 +17,18 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.plans.{Inner, LeftOuter, RightOuter} -import org.apache.spark.sql.catalyst.plans.logical.Join +import org.apache.spark.sql.catalyst.plans.{Inner, InnerLike, LeftOuter, RightOuter} +import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import 
org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class DataFrameJoinSuite extends QueryTest with SharedSparkSession { +class DataFrameJoinSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ test("join - join using") { @@ -46,13 +50,13 @@ class DataFrameJoinSuite extends QueryTest with SharedSparkSession { } test("join - sorted columns not in join's outputSet") { - val df = Seq((1, 2, "1"), (3, 4, "3")).toDF("int", "int2", "str_sort").as('df1) - val df2 = Seq((1, 3, "1"), (5, 6, "5")).toDF("int", "int2", "str").as('df2) - val df3 = Seq((1, 3, "1"), (5, 6, "5")).toDF("int", "int2", "str").as('df3) + val df = Seq((1, 2, "1"), (3, 4, "3")).toDF("int", "int2", "str_sort").as("df1") + val df2 = Seq((1, 3, "1"), (5, 6, "5")).toDF("int", "int2", "str").as("df2") + val df3 = Seq((1, 3, "1"), (5, 6, "5")).toDF("int", "int2", "str").as("df3") checkAnswer( df.join(df2, $"df1.int" === $"df2.int", "outer").select($"df1.int", $"df2.int2") - .orderBy('str_sort.asc, 'str.asc), + .orderBy(Symbol("str_sort").asc, Symbol("str").asc), Row(null, 6) :: Row(1, 3) :: Row(3, null) :: Nil) checkAnswer( @@ -149,7 +153,7 @@ class DataFrameJoinSuite extends QueryTest with SharedSparkSession { spark.range(10e10.toLong) .join(spark.range(10e10.toLong).hint("broadcast"), "id") .queryExecution.executedPlan - assert(plan2.collect { case p: BroadcastHashJoinExec => p }.size == 1) + assert(collect(plan2) { case p: BroadcastHashJoinExec => p }.size == 1) } test("join - outer join conversion") { @@ -256,4 +260,66 @@ class DataFrameJoinSuite extends QueryTest with SharedSparkSession { df.join(df, df("id") <=> df("id")).queryExecution.optimizedPlan } } + + def extractLeftDeepInnerJoins(plan: LogicalPlan): Seq[LogicalPlan] = plan match { + case j @ Join(left, right, _: InnerLike, _, _) => right +: extractLeftDeepInnerJoins(left) + case Filter(_, child) => 
extractLeftDeepInnerJoins(child) + case Project(_, child) => extractLeftDeepInnerJoins(child) + case _ => Seq(plan) + } + + test("SPARK-24690 enables star schema detection even if CBO disabled") { + withTable("r0", "r1", "r2", "r3") { + withTempDir { dir => + + withSQLConf( + SQLConf.STARSCHEMA_DETECTION.key -> "true", + SQLConf.CBO_ENABLED.key -> "false", + SQLConf.PLAN_STATS_ENABLED.key -> "true") { + + val path = dir.getAbsolutePath + + // Collects column statistics first + spark.range(300).selectExpr("id AS a", "id AS b", "id AS c") + .write.mode("overwrite").parquet(s"$path/r0") + spark.read.parquet(s"$path/r0").write.saveAsTable("r0") + spark.sql("ANALYZE TABLE r0 COMPUTE STATISTICS FOR COLUMNS a, b, c") + + spark.range(10).selectExpr("id AS a", "id AS d") + .write.mode("overwrite").parquet(s"$path/r1") + spark.read.parquet(s"$path/r1").write.saveAsTable("r1") + spark.sql("ANALYZE TABLE r1 COMPUTE STATISTICS FOR COLUMNS a") + + spark.range(50).selectExpr("id AS b", "id AS e") + .write.mode("overwrite").parquet(s"$path/r2") + spark.read.parquet(s"$path/r2").write.saveAsTable("r2") + spark.sql("ANALYZE TABLE r2 COMPUTE STATISTICS FOR COLUMNS b") + + spark.range(1).selectExpr("id AS c", "id AS f") + .write.mode("overwrite").parquet(s"$path/r3") + spark.read.parquet(s"$path/r3").write.saveAsTable("r3") + spark.sql("ANALYZE TABLE r3 COMPUTE STATISTICS FOR COLUMNS c") + + val resultDf = sql( + s"""SELECT * FROM r0, r1, r2, r3 + | WHERE + | r0.a = r1.a AND + | r1.d >= 3 AND + | r0.b = r2.b AND + | r2.e >= 5 AND + | r0.c = r3.c AND + | r3.f <= 100 + """.stripMargin) + + val optimized = resultDf.queryExecution.optimizedPlan + val optJoins = extractLeftDeepInnerJoins(optimized) + val joinOrder = optJoins + .flatMap(_.collect { case p: LogicalRelation => p.catalogTable }.head) + .map(_.identifier.identifier) + + assert(joinOrder === Seq("r2", "r1", "r3", "r0")) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index aeee4577d3483..fb1ca69b6f73f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -21,6 +21,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{StringType, StructType} class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { import testImplicits._ @@ -36,6 +37,14 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { ).toDF("name", "age", "height") } + def createNaNDF(): DataFrame = { + Seq[(java.lang.Integer, java.lang.Long, java.lang.Short, + java.lang.Byte, java.lang.Float, java.lang.Double)]( + (1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0), + (0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) + ).toDF("int", "long", "short", "byte", "float", "double") + } + test("drop") { val input = createDF() val rows = input.collect() @@ -231,6 +240,70 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { } } + def createDFsWithSameFieldsName(): (DataFrame, DataFrame) = { + val df1 = Seq( + ("f1-1", "f2", null), + ("f1-2", null, null), + ("f1-3", "f2", "f3-1"), + ("f1-4", "f2", "f3-1") + ).toDF("f1", "f2", "f3") + val df2 = Seq( + ("f1-1", null, null), + ("f1-2", "f2", null), + ("f1-3", "f2", "f4-1") + ).toDF("f1", "f2", "f4") + (df1, df2) + } + + test("fill unambiguous field for join operation") { + val (df1, df2) = createDFsWithSameFieldsName() + val joined_df = df1.join(df2, Seq("f1"), joinType = "left_outer") + checkAnswer(joined_df.na.fill("", cols = Seq("f4")), + Row("f1-1", "f2", null, null, "") :: + Row("f1-2", null, null, "f2", "") :: + Row("f1-3", "f2", "f3-1", "f2", "f4-1") :: + Row("f1-4", "f2", "f3-1", null, "") :: Nil) + } + + test("fill ambiguous field for join 
operation") { + val (df1, df2) = createDFsWithSameFieldsName() + val joined_df = df1.join(df2, Seq("f1"), joinType = "left_outer") + + val message = intercept[AnalysisException] { + joined_df.na.fill("", cols = Seq("f2")) + }.getMessage + assert(message.contains("Reference 'f2' is ambiguous")) + } + + test("fill/drop with col(*)") { + val df = createDF() + // If columns are specified with "*", they are ignored. + checkAnswer(df.na.fill("new name", Seq("*")), df.collect()) + checkAnswer(df.na.drop("any", Seq("*")), df.collect()) + } + + test("fill/drop with nested columns") { + val schema = new StructType() + .add("c1", new StructType() + .add("c1-1", StringType) + .add("c1-2", StringType)) + + val data = Seq( + Row(Row(null, "a2")), + Row(Row("b1", "b2")), + Row(null)) + + val df = spark.createDataFrame( + spark.sparkContext.parallelize(data), schema) + + checkAnswer(df.select("c1.c1-1"), + Row(null) :: Row("b1") :: Row(null) :: Nil) + + // Nested columns are ignored for fill() and drop(). + checkAnswer(df.na.fill("a1", Seq("c1.c1-1")), data) + checkAnswer(df.na.drop("any", Seq("c1.c1-1")), data) + } + test("replace") { val input = createDF() @@ -305,4 +378,74 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { )).na.drop("name" :: Nil).select("name"), Row("Alice") :: Row("David") :: Nil) } + + test("SPARK-29890: duplicate names are allowed for fill() if column names are not specified.") { + val left = Seq(("1", null), ("3", "4")).toDF("col1", "col2") + val right = Seq(("1", "2"), ("3", null)).toDF("col1", "col2") + val df = left.join(right, Seq("col1")) + + // If column names are specified, the following fails due to ambiguity. + val exception = intercept[AnalysisException] { + df.na.fill("hello", Seq("col2")) + } + assert(exception.getMessage.contains("Reference 'col2' is ambiguous")) + + // If column names are not specified, fill() is applied to all the eligible columns. 
+ checkAnswer( + df.na.fill("hello"), + Row("1", "hello", "2") :: Row("3", "4", "hello") :: Nil) + } + + test("SPARK-30065: duplicate names are allowed for drop() if column names are not specified.") { + val left = Seq(("1", null), ("3", "4"), ("5", "6")).toDF("col1", "col2") + val right = Seq(("1", "2"), ("3", null), ("5", "6")).toDF("col1", "col2") + val df = left.join(right, Seq("col1")) + + // If column names are specified, the following fails due to ambiguity. + val exception = intercept[AnalysisException] { + df.na.drop("any", Seq("col2")) + } + assert(exception.getMessage.contains("Reference 'col2' is ambiguous")) + + // If column names are not specified, drop() is applied to all the eligible rows. + checkAnswer( + df.na.drop("any"), + Row("5", "6", "6") :: Nil) + } + + test("replace nan with float") { + checkAnswer( + createNaNDF().na.replace("*", Map( + Float.NaN -> 10.0f + )), + Row(1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0) :: + Row(0, 0L, 0.toShort, 0.toByte, 10.0f, 10.0) :: Nil) + } + + test("replace nan with double") { + checkAnswer( + createNaNDF().na.replace("*", Map( + Double.NaN -> 10.0 + )), + Row(1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0) :: + Row(0, 0L, 0.toShort, 0.toByte, 10.0f, 10.0) :: Nil) + } + + test("replace float with nan") { + checkAnswer( + createNaNDF().na.replace("*", Map( + 1.0f -> Float.NaN + )), + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Nil) + } + + test("replace double with nan") { + checkAnswer( + createNaNDF().na.replace("*", Map( + 1.0 -> Double.NaN + )), + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Nil) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala index bcd0c3f0d64a7..51c6a835d58d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala 
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFramePivotSuite.scala @@ -46,7 +46,7 @@ class DataFramePivotSuite extends QueryTest with SharedSparkSession { courseSales.groupBy("course").pivot("year", Seq(2012, 2013)).agg(sum($"earnings")), expected) checkAnswer( - courseSales.groupBy('course).pivot('year, Seq(2012, 2013)).agg(sum('earnings)), + courseSales.groupBy($"course").pivot($"year", Seq(2012, 2013)).agg(sum($"earnings")), expected) } @@ -206,7 +206,7 @@ class DataFramePivotSuite extends QueryTest with SharedSparkSession { complexData.groupBy().pivot("b", Seq(true, false)).agg(max("a")), expected) checkAnswer( - complexData.groupBy().pivot('b, Seq(true, false)).agg(max('a)), + complexData.groupBy().pivot($"b", Seq(true, false)).agg(max("a")), expected) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala index 92f1e4306c5b1..250ec7dc0ba5a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala @@ -34,8 +34,8 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { } test("join - self join") { - val df1 = testData.select(testData("key")).as('df1) - val df2 = testData.select(testData("key")).as('df2) + val df1 = testData.select(testData("key")).as("df1") + val df2 = testData.select(testData("key")).as("df2") checkAnswer( df1.join(df2, $"df1.key" === $"df2.key"), @@ -57,11 +57,11 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { test("join - using aliases after self join") { val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") checkAnswer( - df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("x.str").count(), + df.as("x").join(df.as("y"), $"x.str" === $"y.str").groupBy("x.str").count(), Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) checkAnswer( - 
df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").count(), + df.as("x").join(df.as("y"), $"x.str" === $"y.str").groupBy("y.str").count(), Row("1", 1) :: Row("2", 1) :: Row("3", 1) :: Nil) } @@ -96,7 +96,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { val df2 = df1.filter($"id" > 0) withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { // `df1("id") > df2("id")` is always false. checkAnswer(df1.join(df2, df1("id") > df2("id")), Nil) @@ -110,7 +110,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { } withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assertAmbiguousSelfJoin(df1.join(df2, df1("id") > df2("id"))) } @@ -121,7 +121,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { val df2 = df1.filter($"id" > 0) withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assertAmbiguousSelfJoin(df1.join(df2, df1.colRegex("id") > df2.colRegex("id"))) } @@ -132,7 +132,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { val df2 = df1.filter($"a.b" > 0) withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assertAmbiguousSelfJoin(df1.join(df2, df1("a.b") > df2("a.c"))) } @@ -143,7 +143,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { val df2 = df1.filter($"id" > 0) withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { // `df2("id")` actually points to the column of `df1`. 
checkAnswer(df1.join(df2).select(df2("id")), Seq(0, 0, 1, 1, 2, 2).map(Row(_))) @@ -157,7 +157,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { } withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assertAmbiguousSelfJoin(df1.join(df2).select(df2("id"))) } @@ -170,7 +170,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { val df4 = spark.range(1) withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "false", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "false", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { // `df2("id") < df3("id")` is always false checkAnswer(df1.join(df2).join(df3, df2("id") < df3("id")), Nil) @@ -196,7 +196,7 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { } withSQLConf( - SQLConf.FAIL_AMBIGUOUS_SELF_JOIN.key -> "true", + SQLConf.FAIL_AMBIGUOUS_SELF_JOIN_ENABLED.key -> "true", SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assertAmbiguousSelfJoin(df1.join(df2).join(df3, df2("id") < df3("id"))) assertAmbiguousSelfJoin(df1.join(df4).join(df2).select(df2("id"))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala index fbb7e903c3450..bd3f48078374d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSetOperationsSuite.scala @@ -307,7 +307,7 @@ class DataFrameSetOperationsSuite extends QueryTest with SharedSparkSession { val union = df1.union(df2) checkAnswer( - union.filter('i < rand(7) * 10), + union.filter($"i" < rand(7) * 10), expected(union) ) checkAnswer( @@ -321,13 +321,13 @@ class DataFrameSetOperationsSuite extends QueryTest with SharedSparkSession { val intersect = df1.intersect(df2) checkAnswer( - intersect.filter('i < rand(7) 
* 10), + intersect.filter($"i" < rand(7) * 10), expected(intersect) ) val except = df1.except(df2) checkAnswer( - except.filter('i < rand(7) * 10), + except.filter($"i" < rand(7) * 10), expected(except) ) } @@ -375,7 +375,7 @@ class DataFrameSetOperationsSuite extends QueryTest with SharedSparkSession { case j: Union if j.children.size == 5 => j }.size === 1) checkAnswer( - unionDF.agg(avg('key), max('key), min('key), sum('key)), + unionDF.agg(avg("key"), max("key"), min("key"), sum("key")), Row(50.5, 100, 1, 25250) :: Nil ) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 3b57173bd246b..394bad751b5ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -443,9 +443,9 @@ class DataFrameStatSuite extends QueryTest with SharedSparkSession { assert(sketch4.confidence() === 0.99 +- 5e-3) intercept[IllegalArgumentException] { - df.select('id cast DoubleType as 'id) + df.select($"id" cast DoubleType as "id") .stat - .countMinSketch('id, depth = 10, width = 20, seed = 42) + .countMinSketch($"id", depth = 10, width = 20, seed = 42) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index b4ddfecaee469..694e576fcded4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -21,6 +21,7 @@ import java.io.{ByteArrayOutputStream, File} import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import java.util.UUID +import java.util.concurrent.atomic.AtomicLong import scala.util.Random @@ -29,10 +30,12 @@ import org.scalatest.Matchers._ import org.apache.spark.SparkException import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd} import 
org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.Uuid import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation -import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Union} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, OneRowRelation, Union} import org.apache.spark.sql.execution.{FilterExec, QueryExecution, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.functions._ @@ -43,13 +46,15 @@ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom -class DataFrameSuite extends QueryTest with SharedSparkSession { +class DataFrameSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ test("analysis error should be eagerly reported") { - intercept[Exception] { testData.select('nonExistentName) } + intercept[Exception] { testData.select("nonExistentName") } intercept[Exception] { - testData.groupBy('key).agg(Map("nonExistentName" -> "sum")) + testData.groupBy("key").agg(Map("nonExistentName" -> "sum")) } intercept[Exception] { testData.groupBy("nonExistentName").agg(Map("key" -> "sum")) @@ -90,9 +95,10 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { assert(spark.emptyDataFrame.count() === 0) } - test("head and take") { + test("head, take and tail") { assert(testData.take(2) === testData.collect().take(2)) assert(testData.head(2) === testData.collect().take(2)) + assert(testData.tail(2) === testData.collect().takeRight(2)) assert(testData.head(2).head.schema === testData.schema) } @@ -106,8 +112,10 @@ class DataFrameSuite extends QueryTest with 
SharedSparkSession { test("Star Expansion - CreateStruct and CreateArray") { val structDf = testData2.select("a", "b").as("record") // CreateStruct and CreateArray in aggregateExpressions - assert(structDf.groupBy($"a").agg(min(struct($"record.*"))).first() == Row(3, Row(3, 1))) - assert(structDf.groupBy($"a").agg(min(array($"record.*"))).first() == Row(3, Seq(3, 1))) + assert(structDf.groupBy($"a").agg(min(struct($"record.*"))). + sort("a").first() == Row(1, Row(1, 1))) + assert(structDf.groupBy($"a").agg(min(array($"record.*"))). + sort("a").first() == Row(1, Seq(1, 1))) // CreateStruct and CreateArray in project list (unresolved alias) assert(structDf.select(struct($"record.*")).first() == Row(Row(1, 1))) @@ -161,10 +169,10 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { DecimalData(BigDecimal("1"* 20 + ".123"), BigDecimal("1"* 20 + ".123")) :: DecimalData(BigDecimal("9"* 20 + ".123"), BigDecimal("9"* 20 + ".123")) :: Nil).toDF() - Seq(true, false).foreach { nullOnOverflow => - withSQLConf((SQLConf.DECIMAL_OPERATIONS_NULL_ON_OVERFLOW.key, nullOnOverflow.toString)) { + Seq(true, false).foreach { ansiEnabled => + withSQLConf((SQLConf.ANSI_ENABLED.key, ansiEnabled.toString)) { val structDf = largeDecimals.select("a").agg(sum("a")) - if (nullOnOverflow) { + if (!ansiEnabled) { checkAnswer(structDf, Row(null)) } else { val e = intercept[SparkException] { @@ -246,12 +254,12 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("repartition") { intercept[IllegalArgumentException] { - testData.select('key).repartition(0) + testData.select("key").repartition(0) } checkAnswer( - testData.select('key).repartition(10).select('key), - testData.select('key).collect().toSeq) + testData.select("key").repartition(10).select("key"), + testData.select("key").collect().toSeq) } test("repartition with SortOrder") { @@ -313,16 +321,16 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("coalesce") { 
intercept[IllegalArgumentException] { - testData.select('key).coalesce(0) + testData.select("key").coalesce(0) } - assert(testData.select('key).coalesce(1).rdd.partitions.size === 1) + assert(testData.select("key").coalesce(1).rdd.partitions.size === 1) checkAnswer( - testData.select('key).coalesce(1).select('key), - testData.select('key).collect().toSeq) + testData.select("key").coalesce(1).select("key"), + testData.select("key").collect().toSeq) - assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 1) + assert(spark.emptyDataFrame.coalesce(1).rdd.partitions.size === 0) } test("convert $\"attribute name\" into unresolved attribute") { @@ -333,7 +341,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("convert Scala Symbol 'attrname into unresolved attribute") { checkAnswer( - testData.where('key === lit(1)).select('value), + testData.where($"key" === lit(1)).select("value"), Row("1")) } @@ -345,17 +353,17 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("simple select") { checkAnswer( - testData.where('key === lit(1)).select('value), + testData.where($"key" === lit(1)).select("value"), Row("1")) } test("select with functions") { checkAnswer( - testData.select(sum('value), avg('value), count(lit(1))), + testData.select(sum("value"), avg("value"), count(lit(1))), Row(5050.0, 50.5, 100)) checkAnswer( - testData2.select('a + 'b, 'a < 'b), + testData2.select($"a" + $"b", $"a" < $"b"), Seq( Row(2, false), Row(3, true), @@ -365,31 +373,31 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { Row(5, false))) checkAnswer( - testData2.select(sumDistinct('a)), + testData2.select(sumDistinct($"a")), Row(6)) } test("sorting with null ordering") { val data = Seq[java.lang.Integer](2, 1, null).toDF("key") - checkAnswer(data.orderBy('key.asc), Row(null) :: Row(1) :: Row(2) :: Nil) + checkAnswer(data.orderBy($"key".asc), Row(null) :: Row(1) :: Row(2) :: Nil) checkAnswer(data.orderBy(asc("key")), Row(null) :: 
Row(1) :: Row(2) :: Nil) - checkAnswer(data.orderBy('key.asc_nulls_first), Row(null) :: Row(1) :: Row(2) :: Nil) + checkAnswer(data.orderBy($"key".asc_nulls_first), Row(null) :: Row(1) :: Row(2) :: Nil) checkAnswer(data.orderBy(asc_nulls_first("key")), Row(null) :: Row(1) :: Row(2) :: Nil) - checkAnswer(data.orderBy('key.asc_nulls_last), Row(1) :: Row(2) :: Row(null) :: Nil) + checkAnswer(data.orderBy($"key".asc_nulls_last), Row(1) :: Row(2) :: Row(null) :: Nil) checkAnswer(data.orderBy(asc_nulls_last("key")), Row(1) :: Row(2) :: Row(null) :: Nil) - checkAnswer(data.orderBy('key.desc), Row(2) :: Row(1) :: Row(null) :: Nil) + checkAnswer(data.orderBy($"key".desc), Row(2) :: Row(1) :: Row(null) :: Nil) checkAnswer(data.orderBy(desc("key")), Row(2) :: Row(1) :: Row(null) :: Nil) - checkAnswer(data.orderBy('key.desc_nulls_first), Row(null) :: Row(2) :: Row(1) :: Nil) + checkAnswer(data.orderBy($"key".desc_nulls_first), Row(null) :: Row(2) :: Row(1) :: Nil) checkAnswer(data.orderBy(desc_nulls_first("key")), Row(null) :: Row(2) :: Row(1) :: Nil) - checkAnswer(data.orderBy('key.desc_nulls_last), Row(2) :: Row(1) :: Row(null) :: Nil) + checkAnswer(data.orderBy($"key".desc_nulls_last), Row(2) :: Row(1) :: Row(null) :: Nil) checkAnswer(data.orderBy(desc_nulls_last("key")), Row(2) :: Row(1) :: Row(null) :: Nil) } test("global sorting") { checkAnswer( - testData2.orderBy('a.asc, 'b.asc), + testData2.orderBy($"a".asc, $"b".asc), Seq(Row(1, 1), Row(1, 2), Row(2, 1), Row(2, 2), Row(3, 1), Row(3, 2))) checkAnswer( @@ -397,31 +405,31 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { Seq(Row(1, 2), Row(1, 1), Row(2, 2), Row(2, 1), Row(3, 2), Row(3, 1))) checkAnswer( - testData2.orderBy('a.asc, 'b.desc), + testData2.orderBy($"a".asc, $"b".desc), Seq(Row(1, 2), Row(1, 1), Row(2, 2), Row(2, 1), Row(3, 2), Row(3, 1))) checkAnswer( - testData2.orderBy('a.desc, 'b.desc), + testData2.orderBy($"a".desc, $"b".desc), Seq(Row(3, 2), Row(3, 1), Row(2, 2), Row(2, 1), Row(1, 2), 
Row(1, 1))) checkAnswer( - testData2.orderBy('a.desc, 'b.asc), + testData2.orderBy($"a".desc, $"b".asc), Seq(Row(3, 1), Row(3, 2), Row(2, 1), Row(2, 2), Row(1, 1), Row(1, 2))) checkAnswer( - arrayData.toDF().orderBy('data.getItem(0).asc), + arrayData.toDF().orderBy($"data".getItem(0).asc), arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(0)).toSeq) checkAnswer( - arrayData.toDF().orderBy('data.getItem(0).desc), + arrayData.toDF().orderBy($"data".getItem(0).desc), arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(0)).reverse.toSeq) checkAnswer( - arrayData.toDF().orderBy('data.getItem(1).asc), + arrayData.toDF().orderBy($"data".getItem(1).asc), arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(1)).toSeq) checkAnswer( - arrayData.toDF().orderBy('data.getItem(1).desc), + arrayData.toDF().orderBy($"data".getItem(1).desc), arrayData.toDF().collect().sortBy(_.getAs[Seq[Int]](0)(1)).reverse.toSeq) } @@ -450,7 +458,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { checkAnswer( // SELECT *, foo(key, value) FROM testData - testData.select($"*", foo('key, 'value)).limit(3), + testData.select($"*", foo($"key", $"value")).limit(3), Row(1, "1", "11") :: Row(2, "2", "22") :: Row(3, "3", "33") :: Nil ) } @@ -553,7 +561,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { } test("replace column using withColumns") { - val df2 = sparkContext.parallelize(Array((1, 2), (2, 3), (3, 4))).toDF("x", "y") + val df2 = sparkContext.parallelize(Seq((1, 2), (2, 3), (3, 4))).toDF("x", "y") val df3 = df2.withColumns(Seq("x", "newCol1", "newCol2"), Seq(df2("x") + 1, df2("y"), df2("y") + 1)) checkAnswer( @@ -794,7 +802,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("apply on query results (SPARK-5462)") { val df = testData.sparkSession.sql("select key from testData") - checkAnswer(df.select(df("key")), testData.select('key).collect().toSeq) + checkAnswer(df.select(df("key")), testData.select("key").collect().toSeq) } 
test("inputFiles") { @@ -1197,7 +1205,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { } test("SPARK-6899: type should match when using codegen") { - checkAnswer(decimalData.agg(avg('a)), Row(new java.math.BigDecimal(2))) + checkAnswer(decimalData.agg(avg("a")), Row(new java.math.BigDecimal(2))) } test("SPARK-7133: Implement struct, array, and map field accessor") { @@ -1399,7 +1407,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("Sorting columns are not in Filter and Project") { checkAnswer( - upperCaseData.filter('N > 1).select('N).filter('N < 6).orderBy('L.asc), + upperCaseData.filter($"N" > 1).select("N").filter($"N" < 6).orderBy($"L".asc), Row(2) :: Row(3) :: Row(4) :: Row(5) :: Nil) } @@ -1442,17 +1450,17 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("Alias uses internally generated names 'aggOrder' and 'havingCondition'") { val df = Seq(1 -> 2).toDF("i", "j") - val query1 = df.groupBy('i) - .agg(max('j).as("aggOrder")) - .orderBy(sum('j)) + val query1 = df.groupBy("i") + .agg(max("j").as("aggOrder")) + .orderBy(sum("j")) checkAnswer(query1, Row(1, 2)) // In the plan, there are two attributes having the same name 'havingCondition' // One is a user-provided alias name; another is an internally generated one. 
- val query2 = df.groupBy('i) - .agg(max('j).as("havingCondition")) - .where(sum('j) > 0) - .orderBy('havingCondition.asc) + val query2 = df.groupBy("i") + .agg(max("j").as("havingCondition")) + .where(sum("j") > 0) + .orderBy($"havingCondition".asc) checkAnswer(query2, Row(1, 2)) } @@ -1461,7 +1469,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { (1 to 10).toDF("id").write.mode(SaveMode.Overwrite).json(dir.getCanonicalPath) val input = spark.read.json(dir.getCanonicalPath) - val df = input.select($"id", rand(0).as('r)) + val df = input.select($"id", rand(0).as("r")) df.as("a").join(df.filter($"r" < 0.5).as("b"), $"a.id" === $"b.id").collect().foreach { row => assert(row.getDouble(1) - row.getDouble(3) === 0.0 +- 0.001) } @@ -1691,19 +1699,21 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { val plan = join.queryExecution.executedPlan checkAnswer(join, df) assert( - join.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => true }.size === 1) + collect(join.queryExecution.executedPlan) { + case e: ShuffleExchangeExec => true }.size === 1) assert( - join.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size === 1) + collect(join.queryExecution.executedPlan) { case e: ReusedExchangeExec => true }.size === 1) val broadcasted = broadcast(join) val join2 = join.join(broadcasted, "id").join(broadcasted, "id") checkAnswer(join2, df) assert( - join2.queryExecution.executedPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) + collect(join2.queryExecution.executedPlan) { + case e: ShuffleExchangeExec => true }.size == 1) assert( - join2.queryExecution.executedPlan - .collect { case e: BroadcastExchangeExec => true }.size === 1) + collect(join2.queryExecution.executedPlan) { + case e: BroadcastExchangeExec => true }.size === 1) assert( - join2.queryExecution.executedPlan.collect { case e: ReusedExchangeExec => true }.size == 4) + collect(join2.queryExecution.executedPlan) { case e: 
ReusedExchangeExec => true }.size == 4) } } @@ -1749,7 +1759,7 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("assertAnalyzed shouldn't replace original stack trace") { val e = intercept[AnalysisException] { - spark.range(1).select('id as 'a, 'id as 'b).groupBy('a).agg('b) + spark.range(1).select($"id" as "a", $"id" as "b").groupBy("a").agg($"b") } assert(e.getStackTrace.head.getClassName != classOf[QueryExecution].getName) @@ -1982,14 +1992,14 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { test("order-by ordinal.") { checkAnswer( - testData2.select(lit(7), 'a, 'b).orderBy(lit(1), lit(2), lit(3)), + testData2.select(lit(7), $"a", $"b").orderBy(lit(1), lit(2), lit(3)), Seq(Row(7, 1, 1), Row(7, 1, 2), Row(7, 2, 1), Row(7, 2, 2), Row(7, 3, 1), Row(7, 3, 2))) } test("SPARK-22271: mean overflows and returns null for some decimal variables") { val d = 0.034567890 val df = Seq(d, d, d, d, d, d, d, d, d, d).toDF("DecimalCol") - val result = df.select('DecimalCol cast DecimalType(38, 33)) + val result = df.select($"DecimalCol" cast DecimalType(38, 33)) .select(col("DecimalCol")).describe() val mean = result.select("DecimalCol").where($"summary" === "mean") assert(mean.collect().toSet === Set(Row("0.0345678900000000000000000000000000000"))) @@ -2025,24 +2035,25 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { val sourceDF = spark.createDataFrame(rows, schema) def structWhenDF: DataFrame = sourceDF - .select(when('cond, struct(lit("a").as("val1"), lit(10).as("val2"))).otherwise('s) as "res") - .select('res.getField("val1")) + .select(when($"cond", + struct(lit("a").as("val1"), lit(10).as("val2"))).otherwise($"s") as "res") + .select($"res".getField("val1")) def arrayWhenDF: DataFrame = sourceDF - .select(when('cond, array(lit("a"), lit("b"))).otherwise('a) as "res") - .select('res.getItem(0)) + .select(when($"cond", array(lit("a"), lit("b"))).otherwise($"a") as "res") + .select($"res".getItem(0)) def mapWhenDF: 
DataFrame = sourceDF - .select(when('cond, map(lit(0), lit("a"))).otherwise('m) as "res") - .select('res.getItem(0)) + .select(when($"cond", map(lit(0), lit("a"))).otherwise($"m") as "res") + .select($"res".getItem(0)) def structIfDF: DataFrame = sourceDF .select(expr("if(cond, struct('a' as val1, 10 as val2), s)") as "res") - .select('res.getField("val1")) + .select($"res".getField("val1")) def arrayIfDF: DataFrame = sourceDF .select(expr("if(cond, array('a', 'b'), a)") as "res") - .select('res.getItem(0)) + .select($"res".getItem(0)) def mapIfDF: DataFrame = sourceDF .select(expr("if(cond, map(0, 'a'), m)") as "res") - .select('res.getItem(0)) + .select($"res".getItem(0)) def checkResult(): Unit = { checkAnswer(structWhenDF, Seq(Row("a"), Row(null))) @@ -2105,17 +2116,17 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { // partitions. .write.partitionBy("p").option("compression", "gzip").json(path.getCanonicalPath) - var numJobs = 0 + val numJobs = new AtomicLong(0) sparkContext.addSparkListener(new SparkListener { override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { - numJobs += 1 + numJobs.incrementAndGet() } }) val df = spark.read.json(path.getCanonicalPath) assert(df.columns === Array("i", "p")) - spark.sparkContext.listenerBus.waitUntilEmpty(10000) - assert(numJobs == 1) + spark.sparkContext.listenerBus.waitUntilEmpty() + assert(numJobs.get() == 1L) } } @@ -2202,4 +2213,91 @@ class DataFrameSuite extends QueryTest with SharedSparkSession { |*(1) Range (0, 10, step=1, splits=2)""".stripMargin)) } } + + test("SPARK-29442 Set `default` mode should override the existing mode") { + val df = Seq(Tuple1(1)).toDF() + val writer = df.write.mode("overwrite").mode("default") + val modeField = classOf[DataFrameWriter[Tuple1[Int]]].getDeclaredField("mode") + modeField.setAccessible(true) + assert(SaveMode.ErrorIfExists === modeField.get(writer).asInstanceOf[SaveMode]) + } + + test("sample should not duplicated the input data") { + val df1 = 
spark.range(10).select($"id" as "id1", $"id" % 5 as "key1") + val df2 = spark.range(10).select($"id" as "id2", $"id" % 5 as "key2") + val sampled = df1.join(df2, $"key1" === $"key2") + .sample(0.5, 42) + .select("id1", "id2") + val idTuples = sampled.collect().map(row => row.getLong(0) -> row.getLong(1)) + assert(idTuples.length == idTuples.toSet.size) + } + + test("groupBy.as") { + val df1 = Seq((1, 2, 3), (2, 3, 4)).toDF("a", "b", "c") + .repartition($"a", $"b").sortWithinPartitions("a", "b") + val df2 = Seq((1, 2, 4), (2, 3, 5)).toDF("a", "b", "c") + .repartition($"a", $"b").sortWithinPartitions("a", "b") + + implicit val valueEncoder = RowEncoder(df1.schema) + + val df3 = df1.groupBy("a", "b").as[GroupByKey, Row] + .cogroup(df2.groupBy("a", "b").as[GroupByKey, Row]) { case (_, data1, data2) => + data1.zip(data2).map { p => + p._1.getInt(2) + p._2.getInt(2) + } + }.toDF + + checkAnswer(df3.sort("value"), Row(7) :: Row(9) :: Nil) + + // Assert that no extra shuffle introduced by cogroup. 
+ val exchanges = collect(df3.queryExecution.executedPlan) { + case h: ShuffleExchangeExec => h + } + assert(exchanges.size == 2) + } + + test("groupBy.as: custom grouping expressions") { + val df1 = Seq((1, 2, 3), (2, 3, 4)).toDF("a1", "b", "c") + .repartition($"a1", $"b").sortWithinPartitions("a1", "b") + val df2 = Seq((1, 2, 4), (2, 3, 5)).toDF("a1", "b", "c") + .repartition($"a1", $"b").sortWithinPartitions("a1", "b") + + implicit val valueEncoder = RowEncoder(df1.schema) + + val groupedDataset1 = df1.groupBy(($"a1" + 1).as("a"), $"b").as[GroupByKey, Row] + val groupedDataset2 = df2.groupBy(($"a1" + 1).as("a"), $"b").as[GroupByKey, Row] + + val df3 = groupedDataset1 + .cogroup(groupedDataset2) { case (_, data1, data2) => + data1.zip(data2).map { p => + p._1.getInt(2) + p._2.getInt(2) + } + }.toDF + + checkAnswer(df3.sort("value"), Row(7) :: Row(9) :: Nil) + } + + test("groupBy.as: throw AnalysisException for unresolved grouping expr") { + val df = Seq((1, 2, 3), (2, 3, 4)).toDF("a", "b", "c") + + implicit val valueEncoder = RowEncoder(df.schema) + + val err = intercept[AnalysisException] { + df.groupBy($"d", $"b").as[GroupByKey, Row] + } + assert(err.getMessage.contains("cannot resolve '`d`'")) + } + + test("emptyDataFrame should be foldable") { + val emptyDf = spark.emptyDataFrame.withColumn("id", lit(1L)) + val joined = spark.range(10).join(emptyDf, "id") + joined.queryExecution.optimizedPlan match { + case LocalRelation(Seq(id), Nil, _) => + assert(id.name == "id") + case _ => + fail("emptyDataFrame should be foldable") + } + } } + +case class GroupByKey(a: Int, b: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala index fbd399917e390..8c998290b5044 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFramesSuite.scala @@ -111,7 
+111,7 @@ class DataFrameWindowFramesSuite extends QueryTest with SharedSparkSession { checkAnswer( df.select( - 'key, + $"key", first("value").over( window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), first("value").over( @@ -226,7 +226,7 @@ class DataFrameWindowFramesSuite extends QueryTest with SharedSparkSession { checkAnswer( df.select( - 'key, + $"key", sum("value").over(window. rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)), sum("value").over(window. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index 835630bff7099..d398657ec0b6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -21,8 +21,9 @@ import org.scalatest.Matchers.the import org.apache.spark.TestUtils.{assertNotSpilled, assertSpilled} import org.apache.spark.sql.catalyst.optimizer.TransposeWindow +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.exchange.Exchange -import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window} +import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction, Window} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -31,7 +32,9 @@ import org.apache.spark.sql.types._ /** * Window function testing for DataFrame API. 
*/ -class DataFrameWindowFunctionsSuite extends QueryTest with SharedSparkSession { +class DataFrameWindowFunctionsSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper{ import testImplicits._ @@ -412,6 +415,42 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSparkSession { Row("b", 2, 4, 8))) } + test("window function with aggregator") { + val agg = udaf(new Aggregator[(Long, Long), Long, Long] { + def zero: Long = 0L + def reduce(b: Long, a: (Long, Long)): Long = b + (a._1 * a._2) + def merge(b1: Long, b2: Long): Long = b1 + b2 + def finish(r: Long): Long = r + def bufferEncoder: Encoder[Long] = Encoders.scalaLong + def outputEncoder: Encoder[Long] = Encoders.scalaLong + }) + + val df = Seq( + ("a", 1, 1), + ("a", 1, 5), + ("a", 2, 10), + ("a", 2, -1), + ("b", 4, 7), + ("b", 3, 8), + ("b", 2, 4)) + .toDF("key", "a", "b") + val window = Window.partitionBy($"key").orderBy($"a").rangeBetween(Long.MinValue, 0L) + checkAnswer( + df.select( + $"key", + $"a", + $"b", + agg($"a", $"b").over(window)), + Seq( + Row("a", 1, 1, 6), + Row("a", 1, 5, 6), + Row("a", 2, 10, 24), + Row("a", 2, -1, 24), + Row("b", 4, 7, 60), + Row("b", 3, 8, 32), + Row("b", 2, 4, 8))) + } + test("null inputs") { val df = Seq(("a", 1), ("a", 1), ("a", 2), ("a", 2), ("b", 4), ("b", 3), ("b", 2)) .toDF("key", "value") @@ -633,20 +672,20 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSparkSession { assert(thrownException.message.contains("window functions inside WHERE and HAVING clauses")) } - checkAnalysisError(testData2.select('a).where(rank().over(Window.orderBy('b)) === 1)) - checkAnalysisError(testData2.where('b === 2 && rank().over(Window.orderBy('b)) === 1)) + checkAnalysisError(testData2.select("a").where(rank().over(Window.orderBy($"b")) === 1)) + checkAnalysisError(testData2.where($"b" === 2 && rank().over(Window.orderBy($"b")) === 1)) checkAnalysisError( - testData2.groupBy('a) - .agg(avg('b).as("avgb")) - .where('a > 'avgb 
&& rank().over(Window.orderBy('a)) === 1)) + testData2.groupBy($"a") + .agg(avg($"b").as("avgb")) + .where($"a" > $"avgb" && rank().over(Window.orderBy($"a")) === 1)) checkAnalysisError( - testData2.groupBy('a) - .agg(max('b).as("maxb"), sum('b).as("sumb")) - .where(rank().over(Window.orderBy('a)) === 1)) + testData2.groupBy($"a") + .agg(max($"b").as("maxb"), sum($"b").as("sumb")) + .where(rank().over(Window.orderBy($"a")) === 1)) checkAnalysisError( - testData2.groupBy('a) - .agg(max('b).as("maxb"), sum('b).as("sumb")) - .where('sumb === 5 && rank().over(Window.orderBy('a)) === 1)) + testData2.groupBy($"a") + .agg(max($"b").as("maxb"), sum($"b").as("sumb")) + .where($"sumb" === 5 && rank().over(Window.orderBy($"a")) === 1)) checkAnalysisError(sql("SELECT a FROM testData2 WHERE RANK() OVER(ORDER BY b) = 1")) checkAnalysisError(sql("SELECT * FROM testData2 WHERE b = 2 AND RANK() OVER(ORDER BY b) = 1")) @@ -680,7 +719,7 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSparkSession { .select($"sno", $"pno", $"qty", col("sum_qty_2"), sum("qty").over(w1).alias("sum_qty_1")) val expectedNumExchanges = if (transposeWindowEnabled) 1 else 2 - val actualNumExchanges = select.queryExecution.executedPlan.collect { + val actualNumExchanges = stripAQEPlan(select.queryExecution.executedPlan).collect { case e: Exchange => e }.length assert(actualNumExchanges == expectedNumExchanges) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala new file mode 100644 index 0000000000000..cd157086a8b8e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWriterV2Suite.scala @@ -0,0 +1,637 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.sql.Timestamp + +import scala.collection.JavaConverters._ + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic} +import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog} +import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog} +import org.apache.spark.sql.connector.expressions.{BucketTransform, DaysTransform, FieldReference, HoursTransform, IdentityTransform, LiteralValue, MonthsTransform, YearsTransform} +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} +import org.apache.spark.sql.types.TimestampType +import org.apache.spark.sql.util.QueryExecutionListener +import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.Utils + +class DataFrameWriterV2Suite extends QueryTest with SharedSparkSession with BeforeAndAfter { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + import org.apache.spark.sql.functions._ + 
import testImplicits._ + + private def catalog(name: String): TableCatalog = { + spark.sessionState.catalogManager.catalog(name).asTableCatalog + } + + private val defaultOwnership = Map(TableCatalog.PROP_OWNER -> Utils.getCurrentUserName()) + + before { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") + df.createOrReplaceTempView("source") + val df2 = spark.createDataFrame(Seq((4L, "d"), (5L, "e"), (6L, "f"))).toDF("id", "data") + df2.createOrReplaceTempView("source2") + } + + after { + spark.sessionState.catalogManager.reset() + spark.sessionState.conf.clear() + } + + test("DataFrameWriteV2 encode identifiers correctly") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + var plan: LogicalPlan = null + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { + plan = qe.analyzed + + } + override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + } + spark.listenerManager.register(listener) + + spark.table("source").writeTo("testcat.table_name").append() + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[AppendData]) + checkV2Identifiers(plan.asInstanceOf[AppendData].table) + + spark.table("source").writeTo("testcat.table_name").overwrite(lit(true)) + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[OverwriteByExpression]) + checkV2Identifiers(plan.asInstanceOf[OverwriteByExpression].table) + + spark.table("source").writeTo("testcat.table_name").overwritePartitions() + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[OverwritePartitionsDynamic]) + checkV2Identifiers(plan.asInstanceOf[OverwritePartitionsDynamic].table) + } + + private def checkV2Identifiers( + plan: LogicalPlan, + identifier: String = "table_name", + catalogPlugin: 
TableCatalog = catalog("testcat")): Unit = { + assert(plan.isInstanceOf[DataSourceV2Relation]) + val v2 = plan.asInstanceOf[DataSourceV2Relation] + assert(v2.identifier.exists(_.name() == identifier)) + assert(v2.catalog.exists(_ == catalogPlugin)) + } + + test("Append: basic append") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + spark.table("source").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + spark.table("source2").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"), Row(4L, "d"), Row(5L, "e"), Row(6L, "f"))) + } + + test("Append: by name not position") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + val exc = intercept[AnalysisException] { + spark.table("source").withColumnRenamed("data", "d").writeTo("testcat.table_name").append() + } + + assert(exc.getMessage.contains("Cannot find data for output column")) + assert(exc.getMessage.contains("'data'")) + + checkAnswer( + spark.table("testcat.table_name"), + Seq()) + } + + test("Append: fail if table does not exist") { + val exc = intercept[NoSuchTableException] { + spark.table("source").writeTo("testcat.table_name").append() + } + + assert(exc.getMessage.contains("table_name")) + } + + test("Overwrite: overwrite by expression: true") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + spark.table("source").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + 
spark.table("source2").writeTo("testcat.table_name").overwrite(lit(true)) + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d"), Row(5L, "e"), Row(6L, "f"))) + } + + test("Overwrite: overwrite by expression: id = 3") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + spark.table("source").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + spark.table("source2").writeTo("testcat.table_name").overwrite($"id" === 3) + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(4L, "d"), Row(5L, "e"), Row(6L, "f"))) + } + + test("Overwrite: by name not position") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + val exc = intercept[AnalysisException] { + spark.table("source").withColumnRenamed("data", "d") + .writeTo("testcat.table_name").overwrite(lit(true)) + } + + assert(exc.getMessage.contains("Cannot find data for output column")) + assert(exc.getMessage.contains("'data'")) + + checkAnswer( + spark.table("testcat.table_name"), + Seq()) + } + + test("Overwrite: fail if table does not exist") { + val exc = intercept[NoSuchTableException] { + spark.table("source").writeTo("testcat.table_name").overwrite(lit(true)) + } + + assert(exc.getMessage.contains("table_name")) + } + + test("OverwritePartitions: overwrite conflicting partitions") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + spark.table("source").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + 
spark.table("source2").withColumn("id", $"id" - 2) + .writeTo("testcat.table_name").overwritePartitions() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "d"), Row(3L, "e"), Row(4L, "f"))) + } + + test("OverwritePartitions: overwrite all rows if not partitioned") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + spark.table("source").writeTo("testcat.table_name").append() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + spark.table("source2").writeTo("testcat.table_name").overwritePartitions() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d"), Row(5L, "e"), Row(6L, "f"))) + } + + test("OverwritePartitions: by name not position") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + checkAnswer(spark.table("testcat.table_name"), Seq.empty) + + val exc = intercept[AnalysisException] { + spark.table("source").withColumnRenamed("data", "d") + .writeTo("testcat.table_name").overwritePartitions() + } + + assert(exc.getMessage.contains("Cannot find data for output column")) + assert(exc.getMessage.contains("'data'")) + + checkAnswer( + spark.table("testcat.table_name"), + Seq()) + } + + test("OverwritePartitions: fail if table does not exist") { + val exc = intercept[NoSuchTableException] { + spark.table("source").writeTo("testcat.table_name").overwritePartitions() + } + + assert(exc.getMessage.contains("table_name")) + } + + test("Create: basic behavior") { + spark.table("source").writeTo("testcat.table_name").create() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", 
StringType)) + assert(table.partitioning.isEmpty) + assert(table.properties == defaultOwnership.asJava) + } + + test("Create: with using") { + spark.table("source").writeTo("testcat.table_name").using("foo").create() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning.isEmpty) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + } + + test("Create: with property") { + spark.table("source").writeTo("testcat.table_name").tableProperty("prop", "value").create() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning.isEmpty) + assert(table.properties === (Map("prop" -> "value") ++ defaultOwnership).asJava) + } + + test("Create: identity partitioned table") { + spark.table("source").writeTo("testcat.table_name").partitionedBy($"id").create() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning === Seq(IdentityTransform(FieldReference("id")))) + assert(table.properties == defaultOwnership.asJava) + } + + test("Create: partitioned by years(ts)") { + spark.table("source") + .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) + 
.writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(years($"ts")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(YearsTransform(FieldReference("ts")))) + } + + test("Create: partitioned by months(ts)") { + spark.table("source") + .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(months($"ts")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(MonthsTransform(FieldReference("ts")))) + } + + test("Create: partitioned by days(ts)") { + spark.table("source") + .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(days($"ts")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(DaysTransform(FieldReference("ts")))) + } + + test("Create: partitioned by hours(ts)") { + spark.table("source") + .withColumn("ts", lit("2019-06-01 10:00:00.000000").cast("timestamp")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(hours($"ts")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(HoursTransform(FieldReference("ts")))) + } + + test("Create: partitioned by bucket(4, id)") { + spark.table("source") + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(bucket(4, 
$"id")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === + Seq(BucketTransform(LiteralValue(4, IntegerType), Seq(FieldReference("id"))))) + } + + test("Create: fail if table already exists") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + val exc = intercept[TableAlreadyExistsException] { + spark.table("source").writeTo("testcat.table_name").create() + } + + assert(exc.getMessage.contains("table_name")) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // table should not have been changed + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning === Seq(IdentityTransform(FieldReference("id")))) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + } + + test("Replace: basic behavior") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + spark.sql("INSERT INTO TABLE testcat.table_name SELECT * FROM source") + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the initial table + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning === Seq(IdentityTransform(FieldReference("id")))) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + + spark.table("source2") + .withColumn("even_or_odd", when(($"id" % 2) === 0, "even").otherwise("odd")) + .writeTo("testcat.table_name").replace() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d", "even"), Row(5L, "e", 
"odd"), Row(6L, "f", "even"))) + + val replaced = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the replacement table + assert(replaced.name === "testcat.table_name") + assert(replaced.schema === new StructType() + .add("id", LongType) + .add("data", StringType) + .add("even_or_odd", StringType)) + assert(replaced.partitioning.isEmpty) + assert(replaced.properties === defaultOwnership.asJava) + } + + test("Replace: partitioned table") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + spark.sql("INSERT INTO TABLE testcat.table_name SELECT * FROM source") + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the initial table + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning.isEmpty) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + + spark.table("source2") + .withColumn("even_or_odd", when(($"id" % 2) === 0, "even").otherwise("odd")) + .writeTo("testcat.table_name").partitionedBy($"id").replace() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d", "even"), Row(5L, "e", "odd"), Row(6L, "f", "even"))) + + val replaced = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the replacement table + assert(replaced.name === "testcat.table_name") + assert(replaced.schema === new StructType() + .add("id", LongType) + .add("data", StringType) + .add("even_or_odd", StringType)) + assert(replaced.partitioning === Seq(IdentityTransform(FieldReference("id")))) + assert(replaced.properties === defaultOwnership.asJava) + } + + test("Replace: fail if table does not exist") { + val exc = intercept[CannotReplaceMissingTableException] { + 
spark.table("source").writeTo("testcat.table_name").replace() + } + + assert(exc.getMessage.contains("table_name")) + } + + test("CreateOrReplace: table does not exist") { + spark.table("source2").writeTo("testcat.table_name").createOrReplace() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d"), Row(5L, "e"), Row(6L, "f"))) + + val replaced = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the replacement table + assert(replaced.name === "testcat.table_name") + assert(replaced.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(replaced.partitioning.isEmpty) + assert(replaced.properties === defaultOwnership.asJava) + } + + test("CreateOrReplace: table exists") { + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + spark.sql("INSERT INTO TABLE testcat.table_name SELECT * FROM source") + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c"))) + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the initial table + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning === Seq(IdentityTransform(FieldReference("id")))) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + + spark.table("source2") + .withColumn("even_or_odd", when(($"id" % 2) === 0, "even").otherwise("odd")) + .writeTo("testcat.table_name").createOrReplace() + + checkAnswer( + spark.table("testcat.table_name"), + Seq(Row(4L, "d", "even"), Row(5L, "e", "odd"), Row(6L, "f", "even"))) + + val replaced = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + // validate the replacement table + assert(replaced.name === "testcat.table_name") + assert(replaced.schema === new StructType() + .add("id", LongType) + .add("data", 
StringType) + .add("even_or_odd", StringType)) + assert(replaced.partitioning.isEmpty) + assert(replaced.properties === defaultOwnership.asJava) + } + + test("SPARK-30289 Create: partitioned by nested column") { + val schema = new StructType().add("ts", new StructType() + .add("created", TimestampType) + .add("modified", TimestampType) + .add("timezone", StringType)) + + val data = Seq( + Row(Row(Timestamp.valueOf("2019-06-01 10:00:00"), Timestamp.valueOf("2019-09-02 07:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2019-08-26 18:00:00"), Timestamp.valueOf("2019-09-26 18:00:00"), + "America/Los_Angeles")), + Row(Row(Timestamp.valueOf("2018-11-23 18:00:00"), Timestamp.valueOf("2018-12-22 18:00:00"), + "America/New_York"))) + val df = spark.createDataFrame(spark.sparkContext.parallelize(data, 1), schema) + + df.writeTo("testcat.table_name") + .partitionedBy($"ts.timezone") + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + .asInstanceOf[InMemoryTable] + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq(IdentityTransform(FieldReference(Array("ts", "timezone"))))) + checkAnswer(spark.table(table.name), data) + assert(table.dataMap.toArray.length == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/Los_Angeles"))).rows.size == 2) + assert(table.dataMap(Seq(UTF8String.fromString("America/New_York"))).rows.size == 1) + + // TODO: `DataSourceV2Strategy` can not translate nested fields into source filter yet + // so the following sql will fail. 
+ // sql("DELETE FROM testcat.table_name WHERE ts.timezone = \"America/Los_Angeles\"") + } + + test("SPARK-30289 Create: partitioned by multiple transforms on nested columns") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy( + years($"ts.created"), months($"ts.created"), days($"ts.created"), hours($"ts.created"), + years($"ts.modified"), months($"ts.modified"), days($"ts.modified"), hours($"ts.modified") + ) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === Seq( + YearsTransform(FieldReference(Array("ts", "created"))), + MonthsTransform(FieldReference(Array("ts", "created"))), + DaysTransform(FieldReference(Array("ts", "created"))), + HoursTransform(FieldReference(Array("ts", "created"))), + YearsTransform(FieldReference(Array("ts", "modified"))), + MonthsTransform(FieldReference(Array("ts", "modified"))), + DaysTransform(FieldReference(Array("ts", "modified"))), + HoursTransform(FieldReference(Array("ts", "modified"))))) + } + + test("SPARK-30289 Create: partitioned by bucket(4, ts.timezone)") { + spark.table("source") + .withColumn("ts", struct( + lit("2019-06-01 10:00:00.000000").cast("timestamp") as "created", + lit("2019-09-02 07:00:00.000000").cast("timestamp") as "modified", + lit("America/Los_Angeles") as "timezone")) + .writeTo("testcat.table_name") + .tableProperty("allow-unsupported-transforms", "true") + .partitionedBy(bucket(4, $"ts.timezone")) + .create() + + val table = catalog("testcat").loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name === "testcat.table_name") + assert(table.partitioning === 
Seq(BucketTransform(LiteralValue(4, IntegerType), + Seq(FieldReference(Seq("ts", "timezone")))))) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala index 817387b2845f5..6ffe133ee652b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala @@ -19,12 +19,10 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.expressions.Aggregator -import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{BooleanType, IntegerType, StringType, StructType} - object ComplexResultAgg extends Aggregator[(String, Int), (Long, Long), (Long, Long)] { override def zero: (Long, Long) = (0, 0) override def reduce(countAndSum: (Long, Long), input: (String, Int)): (Long, Long) = { @@ -226,25 +224,6 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { private implicit val ordering = Ordering.by((c: AggData) => c.a -> c.b) - test("typed aggregation: TypedAggregator") { - val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() - - checkDataset( - ds.groupByKey(_._1).agg(typed.sum(_._2)), - ("a", 30.0), ("b", 3.0), ("c", 1.0)) - } - - test("typed aggregation: TypedAggregator, expr, expr") { - val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() - - checkDataset( - ds.groupByKey(_._1).agg( - typed.sum(_._2), - expr("sum(_2)").as[Long], - count("*")), - ("a", 30.0, 30L, 2L), ("b", 3.0, 3L, 2L), ("c", 1.0, 1L, 1L)) - } - test("typed aggregation: complex result type") { val ds = Seq("a" -> 1, "a" -> 3, "b" -> 3).toDS() @@ -255,17 +234,6 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { ("a", 2.0, (2L, 
4L)), ("b", 3.0, (1L, 3L))) } - test("typed aggregation: in project list") { - val ds = Seq(1, 3, 2, 5).toDS() - - checkDataset( - ds.select(typed.sum((i: Int) => i)), - 11.0) - checkDataset( - ds.select(typed.sum((i: Int) => i), typed.sum((i: Int) => i * 2)), - 11.0 -> 22.0) - } - test("typed aggregation: class input") { val ds = Seq(AggData(1, "one"), AggData(2, "two")).toDS() @@ -315,14 +283,6 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { ("one", 1), ("two", 1)) } - test("typed aggregate: avg, count, sum") { - val ds = Seq("a" -> 1, "a" -> 3, "b" -> 3).toDS() - checkDataset( - ds.groupByKey(_._1).agg( - typed.avg(_._2), typed.count(_._2), typed.sum(_._2), typed.sumLong(_._2)), - ("a", 2.0, 2L, 4.0, 4L), ("b", 3.0, 1L, 3.0, 3L)) - } - test("generic typed sum") { val ds = Seq("a" -> 1, "a" -> 3, "b" -> 3).toDS() checkDataset( @@ -366,18 +326,6 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { checkAnswer(df2.agg(RowAgg.toColumn as "b").select("b"), Row(6) :: Nil) } - test("spark-15114 shorter system generated alias names") { - val ds = Seq(1, 3, 2, 5).toDS() - assert(ds.select(typed.sum((i: Int) => i)).columns.head === "TypedSumDouble(int)") - val ds2 = ds.select(typed.sum((i: Int) => i), typed.avg((i: Int) => i)) - assert(ds2.columns.head === "TypedSumDouble(int)") - assert(ds2.columns.last === "TypedAverage(int)") - val df = Seq(1 -> "a", 2 -> "b", 3 -> "b").toDF("i", "j") - assert(df.groupBy($"j").agg(RowAgg.toColumn).columns.last == - "RowAgg(org.apache.spark.sql.Row)") - assert(df.groupBy($"j").agg(RowAgg.toColumn as "agg1").columns.last == "agg1") - } - test("SPARK-15814 Aggregator can return null result") { val ds = Seq(AggData(1, "one"), AggData(2, "two")).toDS() checkDatasetUnorderly( @@ -390,15 +338,6 @@ class DatasetAggregatorSuite extends QueryTest with SharedSparkSession { checkDataset(ds.select(MapTypeBufferAgg.toColumn), 1) } - test("SPARK-15204 improve nullability inference for Aggregator") { 
- val ds1 = Seq(1, 3, 2, 5).toDS() - assert(ds1.select(typed.sum((i: Int) => i)).schema.head.nullable === false) - val ds2 = Seq(AggData(1, "a"), AggData(2, "a")).toDS() - assert(ds2.select(SeqAgg.toColumn).schema.head.nullable) - val ds3 = sql("SELECT 'Some String' AS b, 1279869254 AS a").as[AggData] - assert(ds3.select(NameAgg.toColumn).schema.head.nullable) - } - test("SPARK-18147: very complex aggregator result type") { val df = Seq(1 -> "a", 2 -> "b", 2 -> "c").toDF("i", "j") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala index dba906f63aed4..e47a6a68a0a9c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala @@ -55,7 +55,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.map(func) i += 1 } - res.foreach(_ => Unit) + res.foreach(_ => ()) } benchmark.addCase("DataFrame") { iter => @@ -65,7 +65,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.select($"l" + 1 as "l") i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset") { iter => @@ -75,7 +75,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.map(func) i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark @@ -96,7 +96,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.map(func) i += 1 } - res.foreach(_ => Unit) + res.foreach(_ => ()) } benchmark.addCase("DataFrame") { iter => @@ -106,7 +106,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.select($"l" + 1 as "l", $"s") i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset") { iter => @@ -116,7 +116,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.map(func) i 
+= 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark @@ -139,7 +139,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter(func) i += 1 } - res.foreach(_ => Unit) + res.foreach(_ => ()) } benchmark.addCase("DataFrame") { iter => @@ -149,7 +149,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter($"l" % 2L === 0L) i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset") { iter => @@ -159,7 +159,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter(func) i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark @@ -183,7 +183,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter(funcs(i)) i += 1 } - res.foreach(_ => Unit) + res.foreach(_ => ()) } benchmark.addCase("DataFrame") { iter => @@ -193,7 +193,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter($"l" % (100L + i) === 0L) i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset") { iter => @@ -203,7 +203,7 @@ object DatasetBenchmark extends SqlBasedBenchmark { res = res.filter(funcs(i)) i += 1 } - res.queryExecution.toRdd.foreach(_ => Unit) + res.queryExecution.toRdd.foreach(_ => ()) } benchmark @@ -235,15 +235,15 @@ object DatasetBenchmark extends SqlBasedBenchmark { } benchmark.addCase("DataFrame sum") { iter => - df.select(sum($"l")).queryExecution.toRdd.foreach(_ => Unit) + df.select(sum($"l")).queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset sum using Aggregator") { iter => - df.as[Data].select(typed.sumLong((d: Data) => d.l)).queryExecution.toRdd.foreach(_ => Unit) + df.as[Data].select(typed.sumLong((d: Data) => d.l)).queryExecution.toRdd.foreach(_ => ()) } benchmark.addCase("Dataset complex Aggregator") { iter => - 
df.as[Data].select(ComplexAggregator.toColumn).queryExecution.toRdd.foreach(_ => Unit) + df.as[Data].select(ComplexAggregator.toColumn).queryExecution.toRdd.foreach(_ => ()) } benchmark diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala index 33d9def0b44e5..5c144dad23c30 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetCacheSuite.scala @@ -20,13 +20,17 @@ package org.apache.spark.sql import org.scalatest.concurrent.TimeLimits import org.scalatest.time.SpanSugar._ +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.storage.StorageLevel -class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimits { +class DatasetCacheSuite extends QueryTest + with SharedSparkSession + with TimeLimits + with AdaptiveSparkPlanHelper { import testImplicits._ /** @@ -36,7 +40,8 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit val plan = df.queryExecution.withCachedData assert(plan.isInstanceOf[InMemoryRelation]) val internalPlan = plan.asInstanceOf[InMemoryRelation].cacheBuilder.cachedPlan - assert(internalPlan.find(_.isInstanceOf[InMemoryTableScanExec]).size == numOfCachesDependedUpon) + assert(find(internalPlan)(_.isInstanceOf[InMemoryTableScanExec]).size + == numOfCachesDependedUpon) } test("get storage level") { @@ -97,7 +102,7 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit test("persist and then groupBy columns asKey, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(_._1) - val agged = grouped.mapGroups { case (g, iter) => 
(g, iter.map(_._2).sum) } + val agged = grouped.mapGroups { (g, iter) => (g, iter.map(_._2).sum) } agged.persist() checkDataset( @@ -158,8 +163,8 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit test("SPARK-24596 Non-cascading Cache Invalidation") { val df = Seq(("a", 1), ("b", 2)).toDF("s", "i") - val df2 = df.filter('i > 1) - val df3 = df.filter('i < 2) + val df2 = df.filter($"i" > 1) + val df3 = df.filter($"i" < 2) df2.cache() df.cache() @@ -178,8 +183,8 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit val expensiveUDF = udf({ x: Int => Thread.sleep(5000); x }) val df = spark.range(0, 5).toDF("a") val df1 = df.withColumn("b", expensiveUDF($"a")) - val df2 = df1.groupBy('a).agg(sum('b)) - val df3 = df.agg(sum('a)) + val df2 = df1.groupBy($"a").agg(sum($"b")) + val df3 = df.agg(sum($"a")) df1.cache() df2.cache() @@ -192,16 +197,16 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit // df1 un-cached; df2's cache plan stays the same assert(df1.storageLevel == StorageLevel.NONE) - assertCacheDependency(df1.groupBy('a).agg(sum('b))) + assertCacheDependency(df1.groupBy($"a").agg(sum($"b"))) - val df4 = df1.groupBy('a).agg(sum('b)).agg(sum("sum(b)")) + val df4 = df1.groupBy($"a").agg(sum($"b")).agg(sum("sum(b)")) assertCached(df4) // reuse loaded cache failAfter(3.seconds) { checkDataset(df4, Row(10)) } - val df5 = df.agg(sum('a)).filter($"sum(a)" > 1) + val df5 = df.agg(sum($"a")).filter($"sum(a)" > 1) assertCached(df5) // first time use, load cache checkDataset(df5, Row(10)) @@ -209,8 +214,8 @@ class DatasetCacheSuite extends QueryTest with SharedSparkSession with TimeLimit test("SPARK-26708 Cache data and cached plan should stay consistent") { val df = spark.range(0, 5).toDF("a") - val df1 = df.withColumn("b", 'a + 1) - val df2 = df.filter('a > 1) + val df1 = df.withColumn("b", $"a" + 1) + val df2 = df.filter($"a" > 1) df.cache() // Add df1 to the CacheManager; the 
buffer is currently empty. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala index 892122b94b977..0ac99905f35f4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetOptimizationSuite.scala @@ -181,15 +181,6 @@ class DatasetOptimizationSuite extends QueryTest with SharedSparkSession { // codegen cache should work for Datasets of same type. val count3 = getCodegenCount() assert(count3 == count2) - - withSQLConf(SQLConf.OPTIMIZER_REASSIGN_LAMBDA_VARIABLE_ID.key -> "false") { - // trigger codegen for another Dataset of same type - createDataset().collect() - // with the rule disabled, codegen happens again for encoder serializer and encoder - // deserializer - val count4 = getCodegenCount() - assert(count4 == (count3 + 2)) - } } withClue("array type") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala index 91a8f0a26b360..124b58483d24f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala @@ -171,7 +171,7 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSparkSession { test("groupBy function, map") { val ds = Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11).toDS() val grouped = ds.groupByKey(_ % 2) - val agged = grouped.mapGroups { case (g, iter) => + val agged = grouped.mapGroups { (g, iter) => val name = if (g == 0) "even" else "odd" (name, iter.size) } @@ -184,7 +184,7 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSparkSession { test("groupBy function, flatMap") { val ds = Seq("a", "b", "c", "xyz", "hello").toDS() val grouped = ds.groupByKey(_.length) - val agged = grouped.flatMapGroups { case (g, iter) => Iterator(g.toString, 
iter.mkString) } + val agged = grouped.flatMapGroups { (g, iter) => Iterator(g.toString, iter.mkString) } checkDatasetUnorderly( agged, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 42e5ee58954e8..b0bd612e88d98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -20,7 +20,9 @@ package org.apache.spark.sql import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.sql.{Date, Timestamp} +import org.scalatest.Assertions._ import org.scalatest.exceptions.TestFailedException +import org.scalatest.prop.TableDrivenPropertyChecks._ import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.sql.catalyst.ScroogeLikeExample @@ -28,6 +30,7 @@ import org.apache.spark.sql.catalyst.encoders.{OuterScopes, RowEncoder} import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi} import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.{LogicalRDD, RDDScanExec, SQLExecution} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.expressions.UserDefinedFunction @@ -49,7 +52,9 @@ object TestForTypeAlias { def seqOfTupleTypeAlias: SeqOfTwoInt = Seq((1, 1), (2, 2)) } -class DatasetSuite extends QueryTest with SharedSparkSession { +class DatasetSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ private implicit val ordering = Ordering.by((c: ClassData) => c.a -> c.b) @@ -194,6 +199,11 @@ class DatasetSuite extends QueryTest with SharedSparkSession { assert(ds.take(2) === Array(ClassData("a", 1), ClassData("b", 2))) } + test("as case class - tail") { + val ds = Seq((1, "a"), 
(2, "b"), (3, "c")).toDF("b", "a").as[ClassData] + assert(ds.tail(2) === Array(ClassData("b", 2), ClassData("c", 3))) + } + test("as seq of case class - reorder fields by name") { val df = spark.range(3).select(array(struct($"id".cast("int").as("b"), lit("a").as("a")))) val ds = df.as[Seq[ClassData]] @@ -518,7 +528,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { test("groupBy function, map") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(v => (v._1, "word")) - val agged = grouped.mapGroups { case (g, iter) => (g._1, iter.map(_._2).sum) } + val agged = grouped.mapGroups { (g, iter) => (g._1, iter.map(_._2).sum) } checkDatasetUnorderly( agged, @@ -528,7 +538,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { test("groupBy function, flatMap") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val grouped = ds.groupByKey(v => (v._1, "word")) - val agged = grouped.flatMapGroups { case (g, iter) => + val agged = grouped.flatMapGroups { (g, iter) => Iterator(g._1, iter.map(_._2).sum.toString) } @@ -540,11 +550,11 @@ class DatasetSuite extends QueryTest with SharedSparkSession { test("groupBy function, mapValues, flatMap") { val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() val keyValue = ds.groupByKey(_._1).mapValues(_._2) - val agged = keyValue.mapGroups { case (g, iter) => (g, iter.sum) } + val agged = keyValue.mapGroups { (g, iter) => (g, iter.sum) } checkDataset(agged, ("a", 30), ("b", 3), ("c", 1)) val keyValue1 = ds.groupByKey(t => (t._1, "key")).mapValues(t => (t._2, "value")) - val agged1 = keyValue1.mapGroups { case (g, iter) => (g._1, iter.map(_._1).sum) } + val agged1 = keyValue1.mapGroups { (g, iter) => (g._1, iter.map(_._1).sum) } checkDataset(agged1, ("a", 30), ("b", 3), ("c", 1)) } @@ -905,7 +915,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { test("grouping key and grouped value has field with same 
name") { val ds = Seq(ClassData("a", 1), ClassData("a", 2)).toDS() val agged = ds.groupByKey(d => ClassNullableData(d.a, null)).mapGroups { - case (key, values) => key.a + values.map(_.b).sum + (key, values) => key.a + values.map(_.b).sum } checkDataset(agged, "a3") @@ -978,7 +988,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { } test("SPARK-14554: Dataset.map may generate wrong java code for wide table") { - val wideDF = spark.range(10).select(Seq.tabulate(1000) {i => ('id + i).as(s"c$i")} : _*) + val wideDF = spark.range(10).select(Seq.tabulate(1000) {i => ($"id" + i).as(s"c$i")} : _*) // Make sure the generated code for this plan can compile and execute. checkDataset(wideDF.map(_.getLong(0)), 0L until 10 : _*) } @@ -1000,7 +1010,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { .select("user", "item") .as[(Int, Int)] .groupByKey(_._1) - .mapGroups { case (src, ids) => (src, ids.map(_._2).toArray) } + .mapGroups { (src, ids) => (src, ids.map(_._2).toArray) } .toDF("id", "actual") dataset.join(actual, dataset("user") === actual("id")).collect() @@ -1267,10 +1277,10 @@ class DatasetSuite extends QueryTest with SharedSparkSession { checkDataset( df.withColumn("b", lit(0)).as[ClassData] - .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) + .groupByKey(_.a).flatMapGroups { (_, _) => List[Int]() }) checkDataset( df.withColumn("b", expr("0")).as[ClassData] - .groupByKey(_.a).flatMapGroups { case (x, iter) => List[Int]() }) + .groupByKey(_.a).flatMapGroups { (_, _) => List[Int]() }) } test("SPARK-18125: Spark generated code causes CompileException") { @@ -1388,7 +1398,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { } testCheckpointing("basic") { - val ds = spark.range(10).repartition('id % 2).filter('id > 5).orderBy('id.desc) + val ds = spark.range(10).repartition($"id" % 2).filter($"id" > 5).orderBy($"id".desc) val cp = if (reliable) ds.checkpoint(eager) else ds.localCheckpoint(eager) val 
logicalRDD = cp.logicalPlan match { @@ -1423,10 +1433,10 @@ class DatasetSuite extends QueryTest with SharedSparkSession { } testCheckpointing("should preserve partitioning information") { - val ds = spark.range(10).repartition('id % 2) + val ds = spark.range(10).repartition($"id" % 2) val cp = if (reliable) ds.checkpoint(eager) else ds.localCheckpoint(eager) - val agg = cp.groupBy('id % 2).agg(count('id)) + val agg = cp.groupBy($"id" % 2).agg(count($"id")) agg.queryExecution.executedPlan.collectFirst { case ShuffleExchangeExec(_, _: RDDScanExec, _) => @@ -1438,7 +1448,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { ) } - checkAnswer(agg, ds.groupBy('id % 2).agg(count('id))) + checkAnswer(agg, ds.groupBy($"id" % 2).agg(count($"id"))) } } } @@ -1536,11 +1546,9 @@ class DatasetSuite extends QueryTest with SharedSparkSession { checkAnswer(df.sort("id"), expected) checkAnswer(df.sort(col("id")), expected) checkAnswer(df.sort($"id"), expected) - checkAnswer(df.sort('id), expected) checkAnswer(df.orderBy("id"), expected) checkAnswer(df.orderBy(col("id")), expected) checkAnswer(df.orderBy($"id"), expected) - checkAnswer(df.orderBy('id), expected) } test("SPARK-21567: Dataset should work with type alias") { @@ -1695,7 +1703,7 @@ class DatasetSuite extends QueryTest with SharedSparkSession { test("SPARK-24571: filtering of string values by char literal") { val df = Seq("Amsterdam", "San Francisco", "X").toDF("city") - checkAnswer(df.where('city === 'X'), Seq(Row("X"))) + checkAnswer(df.where($"city" === 'X'), Seq(Row("X"))) checkAnswer( df.where($"city".contains(java.lang.Character.valueOf('A'))), Seq(Row("Amsterdam"))) @@ -1841,6 +1849,66 @@ class DatasetSuite extends QueryTest with SharedSparkSession { val instant = java.time.Instant.parse("2019-03-30T09:54:00Z") assert(spark.range(1).map { _ => instant }.head === instant) } + + val dotColumnTestModes = Table( + ("caseSensitive", "colName"), + ("true", "field.1"), + ("false", "Field.1") + ) + + 
test("SPARK-25153: Improve error messages for columns with dots/periods") { + forAll(dotColumnTestModes) { (caseSensitive, colName) => + val ds = Seq(SpecialCharClass("1", "2")).toDS + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive) { + val errorMsg = intercept[AnalysisException] { + ds(colName) + } + assert(errorMsg.getMessage.contains(s"did you mean to quote the `$colName` column?")) + } + } + } + + test("groupBy.as") { + val df1 = Seq(DoubleData(1, "one"), DoubleData(2, "two"), DoubleData(3, "three")).toDS() + .repartition($"id").sortWithinPartitions("id") + val df2 = Seq(DoubleData(5, "one"), DoubleData(1, "two"), DoubleData(3, "three")).toDS() + .repartition($"id").sortWithinPartitions("id") + + val df3 = df1.groupBy("id").as[Int, DoubleData] + .cogroup(df2.groupBy("id").as[Int, DoubleData]) { case (key, data1, data2) => + if (key == 1) { + Iterator(DoubleData(key, (data1 ++ data2).foldLeft("")((cur, next) => cur + next.val1))) + } else Iterator.empty + } + checkDataset(df3, DoubleData(1, "onetwo")) + + // Assert that no extra shuffle introduced by cogroup. 
+ val exchanges = collect(df3.queryExecution.executedPlan) { + case h: ShuffleExchangeExec => h + } + assert(exchanges.size == 2) + } + + test("tail with different numbers") { + Seq(0, 2, 5, 10, 50, 100, 1000).foreach { n => + assert(spark.range(n).tail(6) === (math.max(n - 6, 0) until n)) + } + } + + test("tail should not accept minus value") { + val e = intercept[AnalysisException](spark.range(1).tail(-1)) + e.getMessage.contains("tail expression must be equal to or greater than 0") + } + + test("SparkSession.active should be the same instance after dataset operations") { + val active = SparkSession.getActiveSession.get + val clone = active.cloneSession() + val ds = new Dataset(clone, spark.range(10).queryExecution.logical, Encoders.INT) + + ds.queryExecution.analyzed + + assert(active eq SparkSession.getActiveSession.get) + } } object AssertExecutionId { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 2fef05f97e57c..ba45b9f9b62df 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -19,14 +19,15 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat -import java.time.Instant -import java.util.Locale +import java.time.{Instant, LocalDateTime} +import java.util.{Locale, TimeZone} import java.util.concurrent.TimeUnit -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.DoubleType import org.apache.spark.unsafe.types.CalendarInterval class DateFunctionsSuite extends QueryTest with SharedSparkSession { @@ -95,15 +96,19 @@ class DateFunctionsSuite extends 
QueryTest with SharedSparkSession { } test("date format") { - val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") - - checkAnswer( - df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), - Row("2015", "2015", "2013")) - - checkAnswer( - df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), - Row("2015", "2015", "2013")) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val df = Seq((d, sdf.format(d), ts)).toDF("a", "b", "c") + + checkAnswer( + df.select(date_format($"a", "y"), date_format($"b", "y"), date_format($"c", "y")), + Row("2015", "2015", "2013")) + + checkAnswer( + df.selectExpr("date_format(a, 'y')", "date_format(b, 'y')", "date_format(c, 'y')"), + Row("2015", "2015", "2013")) + } + } } test("year") { @@ -289,15 +294,15 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { val t2 = Timestamp.valueOf("2015-12-31 00:00:00") val d1 = Date.valueOf("2015-07-31") val d2 = Date.valueOf("2015-12-31") - val i = new CalendarInterval(2, 2000000L) + val i = new CalendarInterval(2, 2, 2000000L) val df = Seq((1, t1, d1), (3, t2, d2)).toDF("n", "t", "d") checkAnswer( - df.selectExpr(s"d + $i"), - Seq(Row(Date.valueOf("2015-09-30")), Row(Date.valueOf("2016-02-29")))) + df.selectExpr(s"d + INTERVAL'${i.toString}'"), + Seq(Row(Date.valueOf("2015-10-02")), Row(Date.valueOf("2016-03-02")))) checkAnswer( - df.selectExpr(s"t + $i"), - Seq(Row(Timestamp.valueOf("2015-10-01 00:00:01")), - Row(Timestamp.valueOf("2016-02-29 00:00:02")))) + df.selectExpr(s"t + INTERVAL'${i.toString}'"), + Seq(Row(Timestamp.valueOf("2015-10-03 00:00:01")), + Row(Timestamp.valueOf("2016-03-02 00:00:02")))) } test("time_sub") { @@ -305,15 +310,15 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { val t2 = Timestamp.valueOf("2016-02-29 00:00:02") val d1 = Date.valueOf("2015-09-30") val d2 = Date.valueOf("2016-02-29") - val i 
= new CalendarInterval(2, 2000000L) + val i = new CalendarInterval(2, 2, 2000000L) val df = Seq((1, t1, d1), (3, t2, d2)).toDF("n", "t", "d") checkAnswer( - df.selectExpr(s"d - $i"), - Seq(Row(Date.valueOf("2015-07-29")), Row(Date.valueOf("2015-12-28")))) + df.selectExpr(s"d - INTERVAL'${i.toString}'"), + Seq(Row(Date.valueOf("2015-07-27")), Row(Date.valueOf("2015-12-26")))) checkAnswer( - df.selectExpr(s"t - $i"), - Seq(Row(Timestamp.valueOf("2015-07-31 23:59:59")), - Row(Timestamp.valueOf("2015-12-29 00:00:00")))) + df.selectExpr(s"t - INTERVAL'${i.toString}'"), + Seq(Row(Timestamp.valueOf("2015-07-29 23:59:59")), + Row(Timestamp.valueOf("2015-12-27 00:00:00")))) } test("function add_months") { @@ -524,170 +529,194 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { } test("from_unixtime") { - val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) - val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" - val sdf2 = new SimpleDateFormat(fmt2, Locale.US) - val fmt3 = "yy-MM-dd HH-mm-ss" - val sdf3 = new SimpleDateFormat(fmt3, Locale.US) - val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") - checkAnswer( - df.select(from_unixtime(col("a"))), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt2)), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.select(from_unixtime(col("a"), fmt3)), - Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr("from_unixtime(a)"), - Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt2')"), - Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) - checkAnswer( - df.selectExpr(s"from_unixtime(a, '$fmt3')"), - Seq(Row(sdf3.format(new 
Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) + val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" + val sdf2 = new SimpleDateFormat(fmt2, Locale.US) + val fmt3 = "yy-MM-dd HH-mm-ss" + val sdf3 = new SimpleDateFormat(fmt3, Locale.US) + val df = Seq((1000, "yyyy-MM-dd HH:mm:ss.SSS"), (-1000, "yy-MM-dd HH-mm-ss")).toDF("a", "b") + checkAnswer( + df.select(from_unixtime(col("a"))), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), fmt2)), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.select(from_unixtime(col("a"), fmt3)), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr("from_unixtime(a)"), + Seq(Row(sdf1.format(new Timestamp(1000000))), Row(sdf1.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr(s"from_unixtime(a, '$fmt2')"), + Seq(Row(sdf2.format(new Timestamp(1000000))), Row(sdf2.format(new Timestamp(-1000000))))) + checkAnswer( + df.selectExpr(s"from_unixtime(a, '$fmt3')"), + Seq(Row(sdf3.format(new Timestamp(1000000))), Row(sdf3.format(new Timestamp(-1000000))))) + } + } } private def secs(millis: Long): Long = TimeUnit.MILLISECONDS.toSeconds(millis) test("unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - 
checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = "2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( - Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") 
- checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - val now = sql("select unix_timestamp()").collect().head.getLong(0) - checkAnswer( - sql(s"select cast ($now as timestamp)"), - Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.select(unix_timestamp(col("ts"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("ss"))), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("d"), fmt)), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.select(unix_timestamp(col("s"), fmt)), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(d, '$fmt')"), Seq( + Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 
00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.select(unix_timestamp(col("x"))), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq( + Row(null), Row(secs(ts2.getTime)), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + val now = sql("select unix_timestamp()").collect().head.getLong(0) + checkAnswer( + sql(s"select cast ($now as timestamp)"), + Row(new java.util.Date(TimeUnit.SECONDS.toMillis(now)))) + } + } } test("to_unix_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( - Row(secs(date1.getTime)), 
Row(secs(date2.getTime)))) - checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( - Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) - - val x1 = "2015-07-24 10:00:00" - val x2 = "2015-25-07 02:02:02" - val x3 = "2015-07-24 25:02:02" - val x4 = "2015-24-07 26:02:02" - val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") - val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") - - val df1 = Seq(x1, x2, x3, x4).toDF("x") - checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( - Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( - Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) - - // february - val y1 = "2016-02-29" - val y2 = "2017-02-29" - val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") - val df2 = Seq(y1, y2).toDF("y") - checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( - Row(secs(ts5.getTime)), Row(null))) - - // invalid format - checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( - Row(null), Row(null), Row(null), Row(null))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00.3") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02.2") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + checkAnswer(df.selectExpr("to_unix_timestamp(ts)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr("to_unix_timestamp(ss)"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(d, '$fmt')"), Seq( + 
Row(secs(date1.getTime)), Row(secs(date2.getTime)))) + checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq( + Row(secs(ts1.getTime)), Row(secs(ts2.getTime)))) + + val x1 = "2015-07-24 10:00:00" + val x2 = "2015-25-07 02:02:02" + val x3 = "2015-07-24 25:02:02" + val x4 = "2015-24-07 26:02:02" + val ts3 = Timestamp.valueOf("2015-07-24 02:25:02") + val ts4 = Timestamp.valueOf("2015-07-24 00:10:00") + + val df1 = Seq(x1, x2, x3, x4).toDF("x") + checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq( + Row(secs(ts1.getTime)), Row(null), Row(null), Row(null))) + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq( + Row(secs(ts4.getTime)), Row(null), Row(secs(ts3.getTime)), Row(null))) + + // february + val y1 = "2016-02-29" + val y2 = "2017-02-29" + val ts5 = Timestamp.valueOf("2016-02-29 00:00:00") + val df2 = Seq(y1, y2).toDF("y") + checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq( + Row(secs(ts5.getTime)), Row(null))) + + // invalid format + checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq( + Row(null), Row(null), Row(null), Row(null))) + } + } } test("to_timestamp") { - val date1 = Date.valueOf("2015-07-24") - val date2 = Date.valueOf("2015-07-25") - val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") - val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") - val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") - val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") - val s1 = "2015/07/24 10:00:00.5" - val s2 = "2015/07/25 02:02:02.6" - val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") - val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") - val ss1 = "2015-07-24 10:00:00" - val ss2 = "2015-07-25 02:02:02" - val fmt = "yyyy/MM/dd HH:mm:ss.S" - val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") - - checkAnswer(df.select(to_timestamp(col("ss"))), - df.select(unix_timestamp(col("ss")).cast("timestamp"))) - 
checkAnswer(df.select(to_timestamp(col("ss"))), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( - Row(ts1m), Row(ts2m))) - checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( - Row(ts1), Row(ts2))) - checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( - Row(ts_date1), Row(ts_date2))) + Seq(false, true).foreach { legacyParser => + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) { + val date1 = Date.valueOf("2015-07-24") + val date2 = Date.valueOf("2015-07-25") + val ts_date1 = Timestamp.valueOf("2015-07-24 00:00:00") + val ts_date2 = Timestamp.valueOf("2015-07-25 00:00:00") + val ts1 = Timestamp.valueOf("2015-07-24 10:00:00") + val ts2 = Timestamp.valueOf("2015-07-25 02:02:02") + val s1 = "2015/07/24 10:00:00.5" + val s2 = "2015/07/25 02:02:02.6" + val ts1m = Timestamp.valueOf("2015-07-24 10:00:00.5") + val ts2m = Timestamp.valueOf("2015-07-25 02:02:02.6") + val ss1 = "2015-07-24 10:00:00" + val ss2 = "2015-07-25 02:02:02" + val fmt = "yyyy/MM/dd HH:mm:ss.S" + val df = Seq((date1, ts1, s1, ss1), (date2, ts2, s2, ss2)).toDF("d", "ts", "s", "ss") + + checkAnswer(df.select(to_timestamp(col("ss"))), + df.select(unix_timestamp(col("ss")).cast("timestamp"))) + checkAnswer(df.select(to_timestamp(col("ss"))), Seq( + Row(ts1), Row(ts2))) + if (legacyParser) { + // In Spark 2.4 and earlier, to_timestamp() parses in seconds precision and cuts off + // the fractional part of seconds. The behavior was changed by SPARK-27438. 
+ val legacyFmt = "yyyy/MM/dd HH:mm:ss" + checkAnswer(df.select(to_timestamp(col("s"), legacyFmt)), Seq( + Row(ts1), Row(ts2))) + } else { + checkAnswer(df.select(to_timestamp(col("s"), fmt)), Seq( + Row(ts1m), Row(ts2m))) + } + checkAnswer(df.select(to_timestamp(col("ts"), fmt)), Seq( + Row(ts1), Row(ts2))) + checkAnswer(df.select(to_timestamp(col("d"), "yyyy-MM-dd")), Seq( + Row(ts_date1), Row(ts_date2))) + } + } } test("datediff") { @@ -703,45 +732,55 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(df.selectExpr("datediff(a, d)"), Seq(Row(1), Row(1))) } + test("to_timestamp with microseconds precision") { + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + val timestamp = "1970-01-01T00:00:00.123456Z" + val df = Seq(timestamp).toDF("t") + checkAnswer(df.select(to_timestamp($"t", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSX")), + Seq(Row(Instant.parse(timestamp)))) + } + } + test("from_utc_timestamp with literal zone") { val df = Seq( (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") ).toDF("a", "b") - withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - checkAnswer( - df.select(from_utc_timestamp(col("a"), "PST")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00")))) - checkAnswer( - df.select(from_utc_timestamp(col("b"), "PST")), - Seq( - Row(Timestamp.valueOf("2015-07-23 17:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00")))) - } - val msg = intercept[AnalysisException] { - df.select(from_utc_timestamp(col("a"), "PST")).collect() - }.getMessage - assert(msg.contains(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key)) + checkAnswer( + df.select(from_utc_timestamp(col("a"), "PST")), + Seq( + Row(Timestamp.valueOf("2015-07-23 17:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + checkAnswer( + df.select(from_utc_timestamp(col("b"), "PST")), + Seq( + 
Row(Timestamp.valueOf("2015-07-23 17:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) } test("from_utc_timestamp with column zone") { - withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "CET"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "PST") - ).toDF("a", "b", "c") - checkAnswer( - df.select(from_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00")))) - checkAnswer( - df.select(from_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 02:00:00")), - Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + val df = Seq( + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "CET"), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "PST") + ).toDF("a", "b", "c") + checkAnswer( + df.select(from_utc_timestamp(col("a"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 02:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + checkAnswer( + df.select(from_utc_timestamp(col("b"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 02:00:00")), + Row(Timestamp.valueOf("2015-07-24 17:00:00")))) + } + + test("handling null field by date_part") { + val input = Seq(Date.valueOf("2019-09-20")).toDF("d") + Seq("date_part(null, d)", "date_part(null, date'2019-09-20')").foreach { expr => + val df = input.selectExpr(expr) + assert(df.schema.headOption.get.dataType == DoubleType) + checkAnswer(df, Row(null)) } } @@ -750,50 +789,84 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession { (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00"), (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00") ).toDF("a", "b") - withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - checkAnswer( - df.select(to_utc_timestamp(col("a"), "PST")), - Seq( - 
Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00")))) - checkAnswer( - df.select(to_utc_timestamp(col("b"), "PST")), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-25 07:00:00")))) - } - val msg = intercept[AnalysisException] { - df.select(to_utc_timestamp(col("a"), "PST")).collect() - }.getMessage - assert(msg.contains(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key)) + checkAnswer( + df.select(to_utc_timestamp(col("a"), "PST")), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-25 07:00:00")))) + checkAnswer( + df.select(to_utc_timestamp(col("b"), "PST")), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-25 07:00:00")))) } test("to_utc_timestamp with column zone") { - withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - val df = Seq( - (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "PST"), - (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "CET") - ).toDF("a", "b", "c") - checkAnswer( - df.select(to_utc_timestamp(col("a"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00")))) - checkAnswer( - df.select(to_utc_timestamp(col("b"), col("c"))), - Seq( - Row(Timestamp.valueOf("2015-07-24 07:00:00")), - Row(Timestamp.valueOf("2015-07-24 22:00:00")))) + val df = Seq( + (Timestamp.valueOf("2015-07-24 00:00:00"), "2015-07-24 00:00:00", "PST"), + (Timestamp.valueOf("2015-07-25 00:00:00"), "2015-07-25 00:00:00", "CET") + ).toDF("a", "b", "c") + checkAnswer( + df.select(to_utc_timestamp(col("a"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-24 22:00:00")))) + checkAnswer( + df.select(to_utc_timestamp(col("b"), col("c"))), + Seq( + Row(Timestamp.valueOf("2015-07-24 07:00:00")), + Row(Timestamp.valueOf("2015-07-24 22:00:00")))) + } + + test("SPARK-30668: use 
legacy timestamp parser in to_timestamp") { + def checkTimeZoneParsing(expected: Any): Unit = { + val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts") + checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")), + Row(expected)) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") { + checkTimeZoneParsing(Timestamp.valueOf("2020-01-27 20:06:11.847")) + } + withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "false") { + checkTimeZoneParsing(null) } } + test("SPARK-30752: convert time zones on a daylight saving day") { + val systemTz = "PST" + val sessionTz = "UTC" + val fromTz = "Asia/Hong_Kong" + val fromTs = "2019-11-03T12:00:00" // daylight saving date in PST + val utsTs = "2019-11-03T04:00:00" + val defaultTz = TimeZone.getDefault + try { + TimeZone.setDefault(DateTimeUtils.getTimeZone(systemTz)) + withSQLConf( + SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true", + SQLConf.SESSION_LOCAL_TIMEZONE.key -> sessionTz) { + val expected = LocalDateTime.parse(utsTs) + .atZone(DateTimeUtils.getZoneId(sessionTz)) + .toInstant + val df = Seq(fromTs).toDF("localTs") + checkAnswer( + df.select(to_utc_timestamp(col("localTs"), fromTz)), + Row(expected)) + } + } finally { + TimeZone.setDefault(defaultTz) + } + } - test("to_timestamp with microseconds precision") { - withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { - val timestamp = "1970-01-01T00:00:00.123456Z" - val df = Seq(timestamp).toDF("t") - checkAnswer(df.select(to_timestamp($"t", "yyyy-MM-dd'T'HH:mm:ss.SSSSSSX")), - Seq(Row(Instant.parse(timestamp)))) + test("SPARK-30766: date_trunc of old timestamps to hours and days") { + def checkTrunc(level: String, expected: String): Unit = { + val df = Seq("0010-01-01 01:02:03.123456") + .toDF() + .select($"value".cast("timestamp").as("ts")) + .select(date_trunc(level, $"ts").cast("string")) + checkAnswer(df, Row(expected)) } + + checkTrunc("HOUR", "0010-01-01 01:00:00") + checkTrunc("DAY", "0010-01-01 00:00:00") } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedDatasetAggregatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedDatasetAggregatorSuite.scala new file mode 100644 index 0000000000000..b1d5e80f8563f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DeprecatedDatasetAggregatorSuite.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.sql.expressions.scalalang.typed +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.test.SharedSparkSession + +@deprecated("This test suite will be removed.", "3.0.0") +class DeprecatedDatasetAggregatorSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + test("typed aggregation: TypedAggregator") { + val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() + + checkDataset( + ds.groupByKey(_._1).agg(typed.sum(_._2)), + ("a", 30.0), ("b", 3.0), ("c", 1.0)) + } + + test("typed aggregation: TypedAggregator, expr, expr") { + val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS() + + checkDataset( + ds.groupByKey(_._1).agg( + typed.sum(_._2), + expr("sum(_2)").as[Long], + count("*")), + ("a", 30.0, 30L, 2L), ("b", 3.0, 3L, 2L), ("c", 1.0, 1L, 1L)) + } + + test("typed aggregation: in project list") { + val ds = Seq(1, 3, 2, 5).toDS() + + checkDataset( + ds.select(typed.sum((i: Int) => i)), + 11.0) + checkDataset( + ds.select(typed.sum((i: Int) => i), typed.sum((i: Int) => i * 2)), + 11.0 -> 22.0) + } + + test("typed aggregate: avg, count, sum") { + val ds = Seq("a" -> 1, "a" -> 3, "b" -> 3).toDS() + checkDataset( + ds.groupByKey(_._1).agg( + typed.avg(_._2), typed.count(_._2), typed.sum(_._2), typed.sumLong(_._2)), + ("a", 2.0, 2L, 4.0, 4L), ("b", 3.0, 1L, 3.0, 3L)) + } + + test("spark-15114 shorter system generated alias names") { + val ds = Seq(1, 3, 2, 5).toDS() + assert(ds.select(typed.sum((i: Int) => i)).columns.head === "TypedSumDouble(int)") + val ds2 = ds.select(typed.sum((i: Int) => i), typed.avg((i: Int) => i)) + assert(ds2.columns.head === "TypedSumDouble(int)") + assert(ds2.columns.last === "TypedAverage(int)") + val df = Seq(1 -> "a", 2 -> "b", 3 -> "b").toDF("i", "j") + assert(df.groupBy($"j").agg(RowAgg.toColumn).columns.last == + "RowAgg(org.apache.spark.sql.Row)") + assert(df.groupBy($"j").agg(RowAgg.toColumn as 
"agg1").columns.last == "agg1") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index 09221efe28e15..baa9f5ecafc68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -22,6 +22,8 @@ import org.scalatest.GivenWhenThen import org.apache.spark.sql.catalyst.expressions.{DynamicPruningExpression, Expression} import org.apache.spark.sql.catalyst.plans.ExistenceJoin import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec} import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingQueryWrapper} import org.apache.spark.sql.functions._ @@ -31,18 +33,24 @@ import org.apache.spark.sql.test.SharedSparkSession /** * Test suite for the filtering ratio policy used to trigger dynamic partition pruning (DPP). 
*/ -class DynamicPartitionPruningSuite +abstract class DynamicPartitionPruningSuiteBase extends QueryTest with SharedSparkSession - with GivenWhenThen { + with GivenWhenThen + with AdaptiveSparkPlanHelper { val tableFormat: String = "parquet" import testImplicits._ + val adaptiveExecutionOn: Boolean + override def beforeAll(): Unit = { super.beforeAll() + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, adaptiveExecutionOn) + spark.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY, true) + val factData = Seq[(Int, Int, Int, Int)]( (1000, 1, 1, 10), (1010, 2, 1, 10), @@ -96,7 +104,8 @@ class DynamicPartitionPruningSuite (6, 60) ) - spark.range(1000).select('id as 'product_id, ('id % 10) as 'store_id, ('id + 1) as 'code) + spark.range(1000) + .select($"id" as "product_id", ($"id" % 10) as "store_id", ($"id" + 1) as "code") .write .format(tableFormat) .mode("overwrite") @@ -149,6 +158,8 @@ class DynamicPartitionPruningSuite sql("DROP TABLE IF EXISTS fact_stats") sql("DROP TABLE IF EXISTS dim_stats") } finally { + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED) + spark.sessionState.conf.unsetConf(SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY) super.afterAll() } } @@ -161,22 +172,41 @@ class DynamicPartitionPruningSuite df: DataFrame, withSubquery: Boolean, withBroadcast: Boolean): Unit = { - val dpExprs = collectDynamicPruningExpressions(df.queryExecution.executedPlan) + val plan = df.queryExecution.executedPlan + val dpExprs = collectDynamicPruningExpressions(plan) val hasSubquery = dpExprs.exists { case InSubqueryExec(_, _: SubqueryExec, _, _) => true case _ => false } - val hasSubqueryBroadcast = dpExprs.exists { - case InSubqueryExec(_, _: SubqueryBroadcastExec, _, _) => true - case _ => false + val subqueryBroadcast = dpExprs.collect { + case InSubqueryExec(_, b: SubqueryBroadcastExec, _, _) => b } val hasFilter = if (withSubquery) "Should" else "Shouldn't" assert(hasSubquery == withSubquery, s"$hasFilter trigger 
DPP with a subquery duplicate:\n${df.queryExecution}") val hasBroadcast = if (withBroadcast) "Should" else "Shouldn't" - assert(hasSubqueryBroadcast == withBroadcast, + assert(subqueryBroadcast.nonEmpty == withBroadcast, s"$hasBroadcast trigger DPP with a reused broadcast exchange:\n${df.queryExecution}") + + subqueryBroadcast.foreach { s => + s.child match { + case _: ReusedExchangeExec => // reuse check ok. + case b: BroadcastExchangeExec => + val hasReuse = plan.find { + case ReusedExchangeExec(_, e) => e eq b + case _ => false + }.isDefined + assert(hasReuse, s"$s\nshould have been reused in\n$plan") + case _ => + fail(s"Invalid child node found in\n$s") + } + } + + val isMainQueryAdaptive = plan.isInstanceOf[AdaptiveSparkPlanExec] + subqueriesAll(plan).filterNot(subqueryBroadcast.contains).foreach { s => + assert(s.find(_.isInstanceOf[AdaptiveSparkPlanExec]).isDefined == isMainQueryAdaptive) + } } /** @@ -221,7 +251,8 @@ class DynamicPartitionPruningSuite */ test("simple inner join triggers DPP with mock-up tables") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("df1", "df2") { spark.range(1000) .select(col("id"), col("id").as("k")) @@ -253,7 +284,8 @@ class DynamicPartitionPruningSuite */ test("self-join on a partitioned table should not trigger DPP") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact") { sql( s""" @@ -284,7 +316,8 @@ class DynamicPartitionPruningSuite */ test("static scan metrics") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", - 
SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { withTable("fact", "dim") { spark.range(10) .map { x => Tuple3(x, x + 1, 0) } @@ -304,7 +337,7 @@ class DynamicPartitionPruningSuite def getFactScan(plan: SparkPlan): SparkPlan = { val scanOption = - plan.find { + find(plan) { case s: FileSourceScanExec => s.output.exists(_.find(_.argString(maxFields = 100).contains("fid")).isDefined) case _ => false @@ -352,7 +385,8 @@ class DynamicPartitionPruningSuite test("DPP should not be rewritten as an existential join") { withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "1.5", - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( s""" |SELECT * FROM product p WHERE p.store_id NOT IN @@ -377,7 +411,7 @@ class DynamicPartitionPruningSuite */ test("DPP triggers only for certain types of query") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false") { Given("dynamic partition pruning disabled") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "false") { val df = sql( @@ -408,14 +442,15 @@ class DynamicPartitionPruningSuite """ |SELECT * FROM fact_sk f |JOIN dim_store s - |ON f.date_id = s.store_id + |ON f.store_id = s.store_id """.stripMargin) checkPartitionPruningPredicate(df, false, false) } Given("left-semi join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT * FROM fact_sk f @@ 
-439,7 +474,8 @@ class DynamicPartitionPruningSuite } Given("right outer join with partition column on the left side") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT * FROM fact_sk f RIGHT OUTER JOIN dim_store s @@ -456,7 +492,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy fallback") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("no stats and selective predicate") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { @@ -525,7 +562,8 @@ class DynamicPartitionPruningSuite */ test("filtering ratio policy with stats when the broadcast pruning is disabled") { withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { Given("disabling the use of stats in the DPP heuristic") withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false") { @@ -595,10 +633,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with non-deterministic probe part") { Given("alias with simple join condition, and non-deterministic query") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -612,10 +647,7 @@ class 
DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -633,10 +665,7 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins with aliases") { Given("alias with simple join condition, using attribute names only") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -656,10 +685,7 @@ class DynamicPartitionPruningSuite } Given("alias with expr as join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -679,10 +705,7 @@ class DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -704,10 +727,7 @@ class 
DynamicPartitionPruningSuite } Given("alias over multiple sub-queries with simple join condition") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid_d as pid, f.sid_d as sid FROM @@ -736,10 +756,8 @@ class DynamicPartitionPruningSuite test("partition pruning in broadcast hash joins") { Given("disable broadcast pruning and disable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -759,9 +777,10 @@ class DynamicPartitionPruningSuite Given("disable reuse broadcast results and enable subquery duplication") withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "false", + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5") { + SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0.5", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { val df = sql( """ |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f @@ -780,52 +799,47 @@ class DynamicPartitionPruningSuite } Given("enable reuse broadcast results and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - 
SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, true) + checkPartitionPruningPredicate(df, false, true) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } Given("disable broadcast hash join and disable query duplication") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { - val df = sql( - """ - |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f - |JOIN dim_stats s - |ON f.store_id = s.store_id WHERE s.country = 'DE' - """.stripMargin) + withSQLConf( + SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df = sql( + """ + |SELECT f.date_id, f.product_id, f.units_sold, f.store_id FROM fact_stats f + |JOIN dim_stats s + |ON f.store_id = s.store_id WHERE s.country = 'DE' + """.stripMargin) - checkPartitionPruningPredicate(df, false, false) + checkPartitionPruningPredicate(df, false, false) - checkAnswer(df, - Row(1030, 2, 10, 3) :: - Row(1040, 2, 50, 3) :: - Row(1050, 
2, 50, 3) :: - Row(1060, 2, 50, 3) :: Nil - ) + checkAnswer(df, + Row(1030, 2, 10, 3) :: + Row(1040, 2, 50, 3) :: + Row(1050, 2, 50, 3) :: + Row(1060, 2, 50, 3) :: Nil + ) } Given("disable broadcast hash join and enable query duplication") - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "true") { val df = sql( @@ -847,9 +861,7 @@ class DynamicPartitionPruningSuite } test("broadcast a single key in a HashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -907,9 +919,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in a LongHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -944,9 +954,7 @@ class DynamicPartitionPruningSuite } test("broadcast multiple keys in an UnsafeHashedRelation") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -981,9 +989,7 @@ class 
DynamicPartitionPruningSuite } test("different broadcast subqueries with identical children") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(100).select( $"id", @@ -1022,7 +1028,7 @@ class DynamicPartitionPruningSuite test("no partition pruning when the build side is a stream") { withTable("fact") { val input = MemoryStream[Int] - val stream = input.toDF.select('value as "one", ('value * 3) as "code") + val stream = input.toDF.select($"value" as "one", ($"value" * 3) as "code") spark.range(100).select( $"id", ($"id" + 1).as("one"), @@ -1055,7 +1061,7 @@ class DynamicPartitionPruningSuite } test("avoid reordering broadcast join keys to match input hash partitioning") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { withTable("large", "dimTwo", "dimThree") { spark.range(100).select( @@ -1105,9 +1111,7 @@ class DynamicPartitionPruningSuite * duplicated partitioning keys, also used to uniquely identify the dynamic pruning filters. 
*/ test("dynamic partition pruning ambiguity issue across nested joins") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("store", "date", "item") { spark.range(500) .select((($"id" + 30) % 50).as("ss_item_sk"), @@ -1145,9 +1149,7 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to expression id clashes") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { withTable("fact", "dim") { spark.range(1000).select($"id".as("A"), $"id".as("AA")) .write.partitionBy("A").format(tableFormat).mode("overwrite").saveAsTable("fact") @@ -1168,10 +1170,7 @@ class DynamicPartitionPruningSuite } test("cleanup any DPP filter that isn't pushed down due to non-determinism") { - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.pid, f.sid FROM @@ -1186,9 +1185,8 @@ class DynamicPartitionPruningSuite } test("join key with multiple references on the filtering plan") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") 
{ + // when enable AQE, the reusedExchange is inserted when executed. withTable("fact", "dim") { spark.range(100).select( $"id", @@ -1220,9 +1218,7 @@ class DynamicPartitionPruningSuite } test("Make sure dynamic pruning works on uncorrelated queries") { - withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT d.store_id, @@ -1246,10 +1242,7 @@ class DynamicPartitionPruningSuite test("Plan broadcast pruning only when the broadcast can be reused") { Given("dynamic pruning filter on the build side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT f.date_id, f.store_id, f.product_id, f.units_sold FROM fact_np f @@ -1268,10 +1261,7 @@ class DynamicPartitionPruningSuite } Given("dynamic pruning filter on the probe side") - withSQLConf( - SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST.key -> "true", - SQLConf.DYNAMIC_PARTITION_PRUNING_USE_STATS.key -> "false", - SQLConf.DYNAMIC_PARTITION_PRUNING_FALLBACK_FILTER_RATIO.key -> "0") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "true") { val df = sql( """ |SELECT /*+ BROADCAST(f)*/ @@ -1291,3 +1281,11 @@ class DynamicPartitionPruningSuite } } } + +class DynamicPartitionPruningSuiteAEOff extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = false +} + +class DynamicPartitionPruningSuiteAEOn extends DynamicPartitionPruningSuiteBase { + override val adaptiveExecutionOn: Boolean = true +} diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index 125cff0e6628a..b591705274110 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType @@ -24,13 +26,41 @@ import org.apache.spark.sql.types.StructType class ExplainSuite extends QueryTest with SharedSparkSession { import testImplicits._ + var originalValue: String = _ + protected override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + protected override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + + private def getNormalizedExplain(df: DataFrame, mode: ExplainMode): String = { + val output = new java.io.ByteArrayOutputStream() + Console.withOut(output) { + df.explain(mode.name) + } + output.toString.replaceAll("#\\d+", "#x") + } + /** * Get the explain from a DataFrame and run the specified action on it. */ - private def withNormalizedExplain(df: DataFrame, extended: Boolean)(f: String => Unit) = { + private def withNormalizedExplain(df: DataFrame, mode: ExplainMode)(f: String => Unit) = { + f(getNormalizedExplain(df, mode)) + } + + /** + * Get the explain by running the sql. The explain mode should be part of the + * sql text itself. 
+ */ + private def withNormalizedExplain(queryText: String)(f: String => Unit) = { val output = new java.io.ByteArrayOutputStream() Console.withOut(output) { - df.explain(extended = extended) + sql(queryText).show(false) } val normalizedOutput = output.toString.replaceAll("#\\d+", "#x") f(normalizedOutput) @@ -39,14 +69,19 @@ class ExplainSuite extends QueryTest with SharedSparkSession { /** * Runs the plan and makes sure the plans contains all of the keywords. */ - private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = { - withNormalizedExplain(df, extended = true) { normalizedOutput => + private def checkKeywordsExistsInExplain( + df: DataFrame, mode: ExplainMode, keywords: String*): Unit = { + withNormalizedExplain(df, mode) { normalizedOutput => for (key <- keywords) { assert(normalizedOutput.contains(key)) } } } + private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = { + checkKeywordsExistsInExplain(df, ExtendedMode, keywords: _*) + } + test("SPARK-23034 show rdd names in RDD scan nodes (Dataset)") { val rddWithName = spark.sparkContext.parallelize(Row(1, "abc") :: Nil).setName("testRdd") val df = spark.createDataFrame(rddWithName, StructType.fromDDL("c0 int, c1 string")) @@ -195,11 +230,117 @@ class ExplainSuite extends QueryTest with SharedSparkSession { test("SPARK-26659: explain of DataWritingCommandExec should not contain duplicate cmd.nodeName") { withTable("temptable") { val df = sql("create table temptable using parquet as select * from range(2)") - withNormalizedExplain(df, extended = false) { normalizedOutput => + withNormalizedExplain(df, SimpleMode) { normalizedOutput => assert("Create\\w*?TableAsSelectCommand".r.findAllMatchIn(normalizedOutput).length == 1) } } } + + test("explain formatted - check presence of subquery in case of DPP") { + withTable("df1", "df2") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true", + 
SQLConf.DYNAMIC_PARTITION_PRUNING_REUSE_BROADCAST_ONLY.key -> "false", + SQLConf.EXCHANGE_REUSE_ENABLED.key -> "false") { + withTable("df1", "df2") { + spark.range(1000).select(col("id"), col("id").as("k")) + .write + .partitionBy("k") + .format("parquet") + .mode("overwrite") + .saveAsTable("df1") + + spark.range(100) + .select(col("id"), col("id").as("k")) + .write + .partitionBy("k") + .format("parquet") + .mode("overwrite") + .saveAsTable("df2") + + val sqlText = + """ + |EXPLAIN FORMATTED SELECT df1.id, df2.k + |FROM df1 JOIN df2 ON df1.k = df2.k AND df2.id < 2 + |""".stripMargin + + val expected_pattern1 = + "Subquery:1 Hosting operator id = 1 Hosting Expression = k#xL IN subquery#x" + val expected_pattern2 = + "PartitionFilters: \\[isnotnull\\(k#xL\\), dynamicpruningexpression\\(k#xL " + + "IN subquery#x\\)\\]" + val expected_pattern3 = + "Location: InMemoryFileIndex \\[.*org.apache.spark.sql.ExplainSuite" + + "/df2/.*, ... 99 entries\\]" + val expected_pattern4 = + "Location: InMemoryFileIndex \\[.*org.apache.spark.sql.ExplainSuite" + + "/df1/.*, ... 
999 entries\\]" + withNormalizedExplain(sqlText) { normalizedOutput => + assert(expected_pattern1.r.findAllMatchIn(normalizedOutput).length == 1) + assert(expected_pattern2.r.findAllMatchIn(normalizedOutput).length == 1) + assert(expected_pattern3.r.findAllMatchIn(normalizedOutput).length == 2) + assert(expected_pattern4.r.findAllMatchIn(normalizedOutput).length == 1) + } + } + } + } + } + + test("Support ExplainMode in Dataset.explain") { + val df1 = Seq((1, 2), (2, 3)).toDF("k", "v1") + val df2 = Seq((2, 3), (1, 1)).toDF("k", "v2") + val testDf = df1.join(df2, "k").groupBy("k").agg(count("v1"), sum("v1"), avg("v2")) + + val simpleExplainOutput = getNormalizedExplain(testDf, SimpleMode) + assert(simpleExplainOutput.startsWith("== Physical Plan ==")) + Seq("== Parsed Logical Plan ==", + "== Analyzed Logical Plan ==", + "== Optimized Logical Plan ==").foreach { planType => + assert(!simpleExplainOutput.contains(planType)) + } + checkKeywordsExistsInExplain( + testDf, + ExtendedMode, + "== Parsed Logical Plan ==" :: + "== Analyzed Logical Plan ==" :: + "== Optimized Logical Plan ==" :: + "== Physical Plan ==" :: + Nil: _*) + checkKeywordsExistsInExplain( + testDf, + CostMode, + "Statistics(sizeInBytes=" :: + Nil: _*) + checkKeywordsExistsInExplain( + testDf, + CodegenMode, + "WholeStageCodegen subtrees" :: + "Generated code:" :: + Nil: _*) + checkKeywordsExistsInExplain( + testDf, + FormattedMode, + "* LocalTableScan (1)" :: + "(1) LocalTableScan [codegen id :" :: + Nil: _*) + } + + test("Dataset.toExplainString has mode as string") { + val df = spark.range(10).toDF + def assertExplainOutput(mode: ExplainMode): Unit = { + assert(df.queryExecution.explainString(mode).replaceAll("#\\d+", "#x").trim === + getNormalizedExplain(df, mode).trim) + } + assertExplainOutput(SimpleMode) + assertExplainOutput(ExtendedMode) + assertExplainOutput(CodegenMode) + assertExplainOutput(CostMode) + assertExplainOutput(FormattedMode) + + val errMsg = intercept[IllegalArgumentException] { 
+ ExplainMode.fromString("unknown") + }.getMessage + assert(errMsg.contains("Unknown explain mode: unknown")) + } } case class ExplainSingleData(id: Int) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala index 23848d90dc53d..c870958128483 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.io.{File, FilenameFilter, FileNotFoundException} +import java.io.{File, FileNotFoundException} import java.nio.file.{Files, StandardOpenOption} import java.util.Locale @@ -27,9 +27,13 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} -import org.apache.spark.sql.TestingUDT.{IntervalData, IntervalUDT, NullData, NullUDT} +import org.apache.spark.sql.TestingUDT.{IntervalUDT, NullData, NullUDT} +import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.catalyst.plans.logical.Filter +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.datasources.FilePartition +import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2ScanRelation, FileScan} import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ @@ -38,7 +42,9 @@ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ -class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { +class 
FileBasedDataSourceSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ override def beforeAll(): Unit = { @@ -175,18 +181,23 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val basePath = dir.getCanonicalPath - Seq("0").toDF("a").write.format(format).save(new Path(basePath, "first").toString) - Seq("1").toDF("a").write.format(format).save(new Path(basePath, "second").toString) + Seq("0").toDF("a").write.format(format).save(new Path(basePath, "second").toString) + Seq("1").toDF("a").write.format(format).save(new Path(basePath, "fourth").toString) + val firstPath = new Path(basePath, "first") val thirdPath = new Path(basePath, "third") val fs = thirdPath.getFileSystem(spark.sessionState.newHadoopConf()) - Seq("2").toDF("a").write.format(format).save(thirdPath.toString) - val files = fs.listStatus(thirdPath).filter(_.isFile).map(_.getPath) + Seq("2").toDF("a").write.format(format).save(firstPath.toString) + Seq("3").toDF("a").write.format(format).save(thirdPath.toString) + val files = Seq(firstPath, thirdPath).flatMap { p => + fs.listStatus(p).filter(_.isFile).map(_.getPath) + } val df = spark.read.format(format).load( new Path(basePath, "first").toString, new Path(basePath, "second").toString, - new Path(basePath, "third").toString) + new Path(basePath, "third").toString, + new Path(basePath, "fourth").toString) // Make sure all data files are deleted and can't be opened. 
files.foreach(f => fs.delete(f, false)) @@ -199,15 +210,21 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { } } - withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") { - testIgnoreMissingFiles() - } - - withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "false") { - val exception = intercept[SparkException] { - testIgnoreMissingFiles() + for { + ignore <- Seq("true", "false") + sources <- Seq("", format) + } { + withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> ignore, + SQLConf.USE_V1_SOURCE_LIST.key -> sources) { + if (ignore.toBoolean) { + testIgnoreMissingFiles() + } else { + val exception = intercept[SparkException] { + testIgnoreMissingFiles() + } + assert(exception.getMessage().contains("does not exist")) + } } - assert(exception.getMessage().contains("does not exist")) } } } @@ -481,14 +498,14 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { spark.range(1000).repartition(1).write.csv(path) val bytesReads = new mutable.ArrayBuffer[Long]() val bytesReadListener = new SparkListener() { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { bytesReads += taskEnd.taskMetrics.inputMetrics.bytesRead } } sparkContext.addSparkListener(bytesReadListener) try { spark.read.csv(path).limit(1).collect() - sparkContext.listenerBus.waitUntilEmpty(1000L) + sparkContext.listenerBus.waitUntilEmpty() assert(bytesReads.sum === 7860) } finally { sparkContext.removeSparkListener(bytesReadListener) @@ -657,6 +674,23 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { } } + test("Return correct results when data columns overlap with partition columns (nested data)") { + Seq("parquet", "orc", "json").foreach { format => + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "true") { + withTempPath { path => + val tablePath = new File(s"${path.getCanonicalPath}/c3=c/c1=a/c5=e") + + val inputDF = sql("SELECT 1 c1, 2 c2, 3 c3, 
named_struct('c4_1', 2, 'c4_2', 3) c4, 5 c5") + inputDF.write.format(format).save(tablePath.getCanonicalPath) + + val resultDF = spark.read.format(format).load(path.getCanonicalPath) + .select("c1", "c4.c4_1", "c5", "c3") + checkAnswer(resultDF, Row("a", 2, "e", "c")) + } + } + } + } + test("sizeInBytes should be the total size of all files") { Seq("orc", "").foreach { useV1SourceReaderList => withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> useV1SourceReaderList) { @@ -664,7 +698,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { dir.delete() spark.range(1000).write.orc(dir.toString) val df = spark.read.orc(dir.toString) - assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(getLocalDirSize(dir))) + assert(df.queryExecution.optimizedPlan.stats.sizeInBytes === BigInt(getLocalDirSize(dir))) } } } @@ -685,21 +719,21 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { val df2FromFile = spark.read.orc(workDirPath + "/data2") val joinedDF = df1FromFile.join(df2FromFile, Seq("count")) if (compressionFactor == 0.5) { - val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + val bJoinExec = collect(joinedDF.queryExecution.executedPlan) { case bJoin: BroadcastHashJoinExec => bJoin } assert(bJoinExec.nonEmpty) - val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + val smJoinExec = collect(joinedDF.queryExecution.executedPlan) { case smJoin: SortMergeJoinExec => smJoin } assert(smJoinExec.isEmpty) } else { // compressionFactor is 1.0 - val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + val bJoinExec = collect(joinedDF.queryExecution.executedPlan) { case bJoin: BroadcastHashJoinExec => bJoin } assert(bJoinExec.isEmpty) - val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + val smJoinExec = collect(joinedDF.queryExecution.executedPlan) { case smJoin: SortMergeJoinExec => smJoin } assert(smJoinExec.nonEmpty) @@ -709,6 +743,85 @@ class FileBasedDataSourceSuite extends 
QueryTest with SharedSparkSession { } } + test("File source v2: support partition pruning") { + withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { + allFileBasedDataSources.foreach { format => + withTempPath { dir => + Seq(("a", 1, 2), ("b", 1, 2), ("c", 2, 1)) + .toDF("value", "p1", "p2") + .write + .format(format) + .partitionBy("p1", "p2") + .option("header", true) + .save(dir.getCanonicalPath) + val df = spark + .read + .format(format) + .option("header", true) + .load(dir.getCanonicalPath) + .where("p1 = 1 and p2 = 2 and value != \"a\"") + + val filterCondition = df.queryExecution.optimizedPlan.collectFirst { + case f: Filter => f.condition + } + assert(filterCondition.isDefined) + // The partitions filters should be pushed down and no need to be reevaluated. + assert(filterCondition.get.collectFirst { + case a: AttributeReference if a.name == "p1" || a.name == "p2" => a + }.isEmpty) + + val fileScan = df.queryExecution.executedPlan collectFirst { + case BatchScanExec(_, f: FileScan) => f + } + assert(fileScan.nonEmpty) + assert(fileScan.get.partitionFilters.nonEmpty) + assert(fileScan.get.dataFilters.nonEmpty) + assert(fileScan.get.planInputPartitions().forall { partition => + partition.asInstanceOf[FilePartition].files.forall { file => + file.filePath.contains("p1=1") && file.filePath.contains("p2=2") + } + }) + checkAnswer(df, Row("b", 1, 2)) + } + } + } + } + + test("File source v2: support passing data filters to FileScan without partitionFilters") { + withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { + allFileBasedDataSources.foreach { format => + withTempPath { dir => + Seq(("a", 1, 2), ("b", 1, 2), ("c", 2, 1)) + .toDF("value", "p1", "p2") + .write + .format(format) + .partitionBy("p1", "p2") + .option("header", true) + .save(dir.getCanonicalPath) + val df = spark + .read + .format(format) + .option("header", true) + .load(dir.getCanonicalPath) + .where("value = 'a'") + + val filterCondition = df.queryExecution.optimizedPlan.collectFirst { + case f: 
Filter => f.condition + } + assert(filterCondition.isDefined) + + val fileScan = df.queryExecution.executedPlan collectFirst { + case BatchScanExec(_, f: FileScan) => f + } + assert(fileScan.nonEmpty) + assert(fileScan.get.partitionFilters.isEmpty) + assert(fileScan.get.dataFilters.nonEmpty) + checkAnswer(df, Row("a", 1, 2)) + } + } + } + } + test("File table location should include both values of option `path` and `paths`") { withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { withTempPaths(3) { paths => @@ -720,7 +833,7 @@ class FileBasedDataSourceSuite extends QueryTest with SharedSparkSession { .option("path", paths.head.getCanonicalPath) .parquet(paths(1).getCanonicalPath, paths(2).getCanonicalPath) df.queryExecution.optimizedPlan match { - case PhysicalOperation(_, _, DataSourceV2Relation(table: ParquetTable, _, _)) => + case PhysicalOperation(_, _, DataSourceV2ScanRelation(table: ParquetTable, _, _)) => assert(table.paths.toSet == paths.map(_.getCanonicalPath).toSet) case _ => throw new AnalysisException("Can not match ParquetTable in the query.") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala index 4edce3b0811e0..96a0eb3e32e9b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala @@ -88,28 +88,28 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { test("single explode") { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") checkAnswer( - df.select(explode('intList)), + df.select(explode($"intList")), Row(1) :: Row(2) :: Row(3) :: Nil) } test("single explode_outer") { val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") checkAnswer( - df.select(explode_outer('intList)), + df.select(explode_outer($"intList")), Row(1) :: Row(2) :: Row(3) :: Row(null) :: Nil) } test("single posexplode") { val df = 
Seq((1, Seq(1, 2, 3))).toDF("a", "intList") checkAnswer( - df.select(posexplode('intList)), + df.select(posexplode($"intList")), Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Nil) } test("single posexplode_outer") { val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") checkAnswer( - df.select(posexplode_outer('intList)), + df.select(posexplode_outer($"intList")), Row(0, 1) :: Row(1, 2) :: Row(2, 3) :: Row(null, null) :: Nil) } @@ -117,13 +117,13 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") checkAnswer( - df.select($"a", explode('intList)), + df.select($"a", explode($"intList")), Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: Nil) checkAnswer( - df.select($"*", explode('intList)), + df.select($"*", explode($"intList")), Row(1, Seq(1, 2, 3), 1) :: Row(1, Seq(1, 2, 3), 2) :: Row(1, Seq(1, 2, 3), 3) :: Nil) @@ -133,7 +133,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") checkAnswer( - df.select($"a", explode_outer('intList)), + df.select($"a", explode_outer($"intList")), Row(1, 1) :: Row(1, 2) :: Row(1, 3) :: @@ -141,7 +141,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { Nil) checkAnswer( - df.select($"*", explode_outer('intList)), + df.select($"*", explode_outer($"intList")), Row(1, Seq(1, 2, 3), 1) :: Row(1, Seq(1, 2, 3), 2) :: Row(1, Seq(1, 2, 3), 3) :: @@ -153,11 +153,11 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") checkAnswer( - df.select(explode('intList).as('int)).select('int), + df.select(explode($"intList").as("int")).select($"int"), Row(1) :: Row(2) :: Row(3) :: Nil) checkAnswer( - df.select(explode('intList).as('int)).select(sum('int)), + df.select(explode($"intList").as("int")).select(sum($"int")), Row(6) :: Nil) } @@ -165,11 +165,11 @@ class GeneratorFunctionSuite extends 
QueryTest with SharedSparkSession { val df = Seq((1, Seq(1, 2, 3)), (2, Seq())).toDF("a", "intList") checkAnswer( - df.select(explode_outer('intList).as('int)).select('int), + df.select(explode_outer($"intList").as("int")).select($"int"), Row(1) :: Row(2) :: Row(3) :: Row(null) :: Nil) checkAnswer( - df.select(explode('intList).as('int)).select(sum('int)), + df.select(explode($"intList").as("int")).select(sum($"int")), Row(6) :: Nil) } @@ -177,7 +177,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((1, Map("a" -> "b"))).toDF("a", "map") checkAnswer( - df.select(explode('map)), + df.select(explode($"map")), Row("a", "b")) } @@ -186,7 +186,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { (3, Map("c" -> "d"))).toDF("a", "map") checkAnswer( - df.select(explode_outer('map)), + df.select(explode_outer($"map")), Row("a", "b") :: Row(null, null) :: Row("c", "d") :: Nil) } @@ -194,7 +194,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((1, Map("a" -> "b"))).toDF("a", "map") checkAnswer( - df.select(explode('map).as("key1" :: "value1" :: Nil)).select("key1", "value1"), + df.select(explode($"map").as("key1" :: "value1" :: Nil)).select("key1", "value1"), Row("a", "b")) } @@ -202,13 +202,13 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession { val df = Seq((3, None), (1, Some(Map("a" -> "b")))).toDF("a", "map") checkAnswer( - df.select(explode_outer('map).as("key1" :: "value1" :: Nil)).select("key1", "value1"), + df.select(explode_outer($"map").as("key1" :: "value1" :: Nil)).select("key1", "value1"), Row("a", "b") :: Row(null, null) :: Nil) } test("self join explode") { val df = Seq((1, Seq(1, 2, 3))).toDF("a", "intList") - val exploded = df.select(explode('intList).as('i)) + val exploded = df.select(explode($"intList").as("i")) checkAnswer( exploded.join(exploded, exploded("i") === exploded("i")).agg(count("*")), @@ -277,7 +277,8 @@ class 
GeneratorFunctionSuite extends QueryTest with SharedSparkSession { test("inline_outer") { val df = Seq((1, "2"), (3, "4"), (5, "6")).toDF("col1", "col2") - val df2 = df.select(when('col1 === 1, null).otherwise(array(struct('col1, 'col2))).as("col1")) + val df2 = df.select( + when($"col1" === 1, null).otherwise(array(struct($"col1", $"col2"))).as("col1")) checkAnswer( df2.selectExpr("inline(col1)"), Row(3, "4") :: Row(5, "6") :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala index d62fe961117a9..51150a1b38b49 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/IntegratedUDFTestUtils.scala @@ -22,6 +22,8 @@ import java.nio.file.{Files, Paths} import scala.collection.JavaConverters._ import scala.util.Try +import org.scalatest.Assertions._ + import org.apache.spark.TestUtils import org.apache.spark.api.python.{PythonBroadcast, PythonEvalType, PythonFunction, PythonUtils} import org.apache.spark.broadcast.Broadcast @@ -103,7 +105,7 @@ object IntegratedUDFTestUtils extends SQLHelper { Seq( pythonExec, "-c", - "from pyspark.sql.utils import require_minimum_pandas_version;" + + "from pyspark.sql.pandas.utils import require_minimum_pandas_version;" + "require_minimum_pandas_version()"), None, "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! @@ -115,14 +117,14 @@ object IntegratedUDFTestUtils extends SQLHelper { Seq( pythonExec, "-c", - "from pyspark.sql.utils import require_minimum_pyarrow_version;" + + "from pyspark.sql.pandas.utils import require_minimum_pyarrow_version;" + "require_minimum_pyarrow_version()"), None, "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!! 
true }.getOrElse(false) - private lazy val pythonVer = if (isPythonAvailable) { + lazy val pythonVer: String = if (isPythonAvailable) { Process( Seq(pythonExec, "-c", "import sys; print('%d.%d' % sys.version_info[:2])"), None, @@ -131,6 +133,24 @@ object IntegratedUDFTestUtils extends SQLHelper { throw new RuntimeException(s"Python executable [$pythonExec] is unavailable.") } + lazy val pandasVer: String = if (isPandasAvailable) { + Process( + Seq(pythonExec, "-c", "import pandas; print(pandas.__version__)"), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!!.trim() + } else { + throw new RuntimeException("Pandas is unavailable.") + } + + lazy val pyarrowVer: String = if (isPyArrowAvailable) { + Process( + Seq(pythonExec, "-c", "import pyarrow; print(pyarrow.__version__)"), + None, + "PYTHONPATH" -> s"$pysparkPythonPath:$pythonPath").!!.trim() + } else { + throw new RuntimeException("PyArrow is unavailable.") + } + // Dynamically pickles and reads the Python instance into JVM side in order to mimic // Python native function within Python UDF. 
private lazy val pythonFunc: Array[Byte] = if (shouldTestPythonUDFs) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala index 6b154253e6e6c..f68c416941266 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinHintSuite.scala @@ -17,20 +17,18 @@ package org.apache.spark.sql -import scala.collection.mutable.ArrayBuffer - -import org.apache.log4j.{AppenderSkeleton, Level} -import org.apache.log4j.spi.LoggingEvent +import org.apache.log4j.Level import org.apache.spark.sql.catalyst.optimizer.EliminateResolvedHint import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class JoinHintSuite extends PlanTest with SharedSparkSession { +class JoinHintSuite extends PlanTest with SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ lazy val df = spark.range(10) @@ -38,14 +36,6 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { lazy val df2 = df.selectExpr("id as b1", "id as b2") lazy val df3 = df.selectExpr("id as c1", "id as c2") - class MockAppender extends AppenderSkeleton { - val loggingEvents = new ArrayBuffer[LoggingEvent]() - - override def append(loggingEvent: LoggingEvent): Unit = loggingEvents.append(loggingEvent) - override def close(): Unit = {} - override def requiresLayout(): Boolean = false - } - def msgNoHintRelationFound(relation: String, hint: String): String = s"Count not find relation '$relation' specified in hint '$hint'." 
@@ -59,7 +49,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { df: => DataFrame, expectedHints: Seq[JoinHint], warnings: Seq[String]): Unit = { - val logAppender = new MockAppender() + val logAppender = new LogAppender("join hints") withLogAppender(logAppender) { verifyJoinHint(df, expectedHints) } @@ -99,7 +89,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { test("multiple joins") { verifyJoinHint( - df1.join(df2.hint("broadcast").join(df3, 'b1 === 'c1).hint("broadcast"), 'a1 === 'c1), + df1.join(df2.hint("broadcast").join(df3, $"b1" === $"c1").hint("broadcast"), $"a1" === $"c1"), JoinHint( None, Some(HintInfo(strategy = Some(BROADCAST)))) :: @@ -108,7 +98,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { None) :: Nil ) verifyJoinHint( - df1.hint("broadcast").join(df2, 'a1 === 'b1).hint("broadcast").join(df3, 'a1 === 'c1), + df1.hint("broadcast").join(df2, $"a1" === $"b1").hint("broadcast").join(df3, $"a1" === $"c1"), JoinHint( Some(HintInfo(strategy = Some(BROADCAST))), None) :: @@ -180,8 +170,8 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { ) verifyJoinHint( - df1.join(df2, 'a1 === 'b1 && 'a1 > 5).hint("broadcast") - .join(df3, 'b1 === 'c1 && 'a1 < 10), + df1.join(df2, $"a1" === $"b1" && $"a1" > 5).hint("broadcast") + .join(df3, $"b1" === $"c1" && $"a1" < 10), JoinHint( Some(HintInfo(strategy = Some(BROADCAST))), None) :: @@ -189,9 +179,9 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { ) verifyJoinHint( - df1.join(df2, 'a1 === 'b1 && 'a1 > 5).hint("broadcast") - .join(df3, 'b1 === 'c1 && 'a1 < 10) - .join(df, 'b1 === 'id), + df1.join(df2, $"a1" === $"b1" && $"a1" > 5).hint("broadcast") + .join(df3, $"b1" === $"c1" && $"a1" < 10) + .join(df, $"b1" === $"id"), JoinHint.NONE :: JoinHint( Some(HintInfo(strategy = Some(BROADCAST))), @@ -222,7 +212,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { test("hint merge") { verifyJoinHintWithWarnings( - 
df.hint("broadcast").filter('id > 2).hint("broadcast").join(df, "id"), + df.hint("broadcast").filter($"id" > 2).hint("broadcast").join(df, "id"), JoinHint( Some(HintInfo(strategy = Some(BROADCAST))), None) :: Nil, @@ -236,7 +226,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { Nil ) verifyJoinHintWithWarnings( - df.hint("merge").filter('id > 2).hint("shuffle_hash").join(df, "id").hint("broadcast"), + df.hint("merge").filter($"id" > 2).hint("shuffle_hash").join(df, "id").hint("broadcast"), JoinHint( Some(HintInfo(strategy = Some(SHUFFLE_HASH))), None) :: Nil, @@ -312,13 +302,13 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { test("nested hint") { verifyJoinHint( - df.hint("broadcast").hint("broadcast").filter('id > 2).join(df, "id"), + df.hint("broadcast").hint("broadcast").filter($"id" > 2).join(df, "id"), JoinHint( Some(HintInfo(strategy = Some(BROADCAST))), None) :: Nil ) verifyJoinHint( - df.hint("shuffle_hash").hint("broadcast").hint("merge").filter('id > 2).join(df, "id"), + df.hint("shuffle_hash").hint("broadcast").hint("merge").filter($"id" > 2).join(df, "id"), JoinHint( Some(HintInfo(strategy = Some(SHUFFLE_MERGE))), None) :: Nil @@ -352,7 +342,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { private def assertBroadcastHashJoin(df: DataFrame, buildSide: BuildSide): Unit = { val executedPlan = df.queryExecution.executedPlan - val broadcastHashJoins = executedPlan.collect { + val broadcastHashJoins = collect(executedPlan) { case b: BroadcastHashJoinExec => b } assert(broadcastHashJoins.size == 1) @@ -361,7 +351,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { private def assertBroadcastNLJoin(df: DataFrame, buildSide: BuildSide): Unit = { val executedPlan = df.queryExecution.executedPlan - val broadcastNLJoins = executedPlan.collect { + val broadcastNLJoins = collect(executedPlan) { case b: BroadcastNestedLoopJoinExec => b } assert(broadcastNLJoins.size == 1) @@ -370,7 +360,7 @@ class 
JoinHintSuite extends PlanTest with SharedSparkSession { private def assertShuffleHashJoin(df: DataFrame, buildSide: BuildSide): Unit = { val executedPlan = df.queryExecution.executedPlan - val shuffleHashJoins = executedPlan.collect { + val shuffleHashJoins = collect(executedPlan) { case s: ShuffledHashJoinExec => s } assert(shuffleHashJoins.size == 1) @@ -379,7 +369,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { private def assertShuffleMergeJoin(df: DataFrame): Unit = { val executedPlan = df.queryExecution.executedPlan - val shuffleMergeJoins = executedPlan.collect { + val shuffleMergeJoins = collect(executedPlan) { case s: SortMergeJoinExec => s } assert(shuffleMergeJoins.size == 1) @@ -387,7 +377,7 @@ class JoinHintSuite extends PlanTest with SharedSparkSession { private def assertShuffleReplicateNLJoin(df: DataFrame): Unit = { val executedPlan = df.queryExecution.executedPlan - val shuffleReplicateNLJoins = executedPlan.collect { + val shuffleReplicateNLJoins = collect(executedPlan) { case c: CartesianProductExec => c } assert(shuffleReplicateNLJoins.size == 1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 72742644ff34e..f45bd950040ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -22,21 +22,41 @@ import java.util.Locale import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer +import org.mockito.Mockito._ + import org.apache.spark.TestUtils.{assertNotSpilled, assertSpilled} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Ascending, GenericRow, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.Filter -import 
org.apache.spark.sql.execution.{BinaryExecNode, FilterExec, SortExec} +import org.apache.spark.sql.execution.{BinaryExecNode, FilterExec, SortExec, SparkPlan} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.python.BatchEvalPythonExec import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType -class JoinSuite extends QueryTest with SharedSparkSession { +class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ + private def attachCleanupResourceChecker(plan: SparkPlan): Unit = { + // SPARK-21492: Check cleanupResources are finally triggered in SortExec node for every + // test case + plan.foreachUp { + case s: SortExec => + val sortExec = spy(s) + verify(sortExec, atLeastOnce).cleanupResources() + verify(sortExec.rowSorter, atLeastOnce).cleanupResources() + case _ => + } + } + + override protected def checkAnswer(df: => DataFrame, rows: Seq[Row]): Unit = { + attachCleanupResourceChecker(df.queryExecution.sparkPlan) + super.checkAnswer(df, rows) + } + setupTestData() def statisticSizeInByte(df: DataFrame): BigInt = { @@ -219,7 +239,9 @@ class JoinSuite extends QueryTest with SharedSparkSession { checkAnswer( bigDataX.join(bigDataY).where($"x.key" === $"y.key"), - testData.rdd.flatMap(row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) + testData.rdd.flatMap { row => + Seq.fill(16)(new GenericRow(Seq(row, row).flatMap(_.toSeq).toArray)) + }.collect().toSeq) } test("cartesian product join") { @@ -503,10 +525,10 @@ class JoinSuite extends QueryTest with SharedSparkSession { SQLConf.CROSS_JOINS_ENABLED.key -> "true") { assert(statisticSizeInByte(spark.table("testData2")) > - spark.conf.get(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) + spark.conf.get[Long](SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) 
assert(statisticSizeInByte(spark.table("testData")) < - spark.conf.get(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) + spark.conf.get[Long](SQLConf.AUTO_BROADCASTJOIN_THRESHOLD)) Seq( ("SELECT * FROM testData LEFT SEMI JOIN testData2 ON key = a", @@ -821,7 +843,7 @@ class JoinSuite extends QueryTest with SharedSparkSession { case j: SortMergeJoinExec => j } val executed = df.queryExecution.executedPlan - val executedJoins = executed.collect { + val executedJoins = collect(executed) { case j: SortMergeJoinExec => j } // This only applies to the above tested queries, in which a child SortMergeJoin always @@ -1005,12 +1027,12 @@ class JoinSuite extends QueryTest with SharedSparkSession { val right = Seq((1, 2), (3, 4)).toDF("c", "d") val df = left.join(right, pythonTestUDF(left("a")) === pythonTestUDF(right.col("c"))) - val joinNode = df.queryExecution.executedPlan.find(_.isInstanceOf[BroadcastHashJoinExec]) + val joinNode = find(df.queryExecution.executedPlan)(_.isInstanceOf[BroadcastHashJoinExec]) assert(joinNode.isDefined) // There are two PythonUDFs which use attribute from left and right of join, individually. // So two PythonUDFs should be evaluated before the join operator, at left and right side. - val pythonEvals = joinNode.get.collect { + val pythonEvals = collect(joinNode.get) { case p: BatchEvalPythonExec => p } assert(pythonEvals.size == 2) @@ -1034,9 +1056,30 @@ class JoinSuite extends QueryTest with SharedSparkSession { assert(filterInAnalysis.isDefined) // Filter predicate was pushdown as join condition. So there is no Filter exec operator. 
- val filterExec = df.queryExecution.executedPlan.find(_.isInstanceOf[FilterExec]) + val filterExec = find(df.queryExecution.executedPlan)(_.isInstanceOf[FilterExec]) assert(filterExec.isEmpty) checkAnswer(df, Row(1, 2, 1, 2) :: Nil) } + + test("SPARK-21492: cleanupResource without code generation") { + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.SHUFFLE_PARTITIONS.key -> "1", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val df1 = spark.range(0, 10, 1, 2) + val df2 = spark.range(10).select($"id".as("b1"), (- $"id").as("b2")) + val res = df1.join(df2, $"id" === $"b1" && $"id" === $"b2").select($"b1", $"b2", $"id") + checkAnswer(res, Row(0, 0, 0)) + } + } + + test("SPARK-29850: sort-merge-join an empty table should not memory leak") { + val df1 = spark.range(10).select($"id", $"id" % 3 as 'p) + .repartition($"id").groupBy($"id").agg(Map("p" -> "max")) + val df2 = spark.range(0) + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + assert(df2.join(df1, "id").collect().isEmpty) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 92a4acc130be5..fd1e9e309558e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.util.Locale @@ -38,6 +39,13 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { Row("alice", "5")) } + test("function get_json_object - support single quotes") { + val df: DataFrame = Seq(("""{'name': 'fang', 'age': 5}""")).toDF("a") + checkAnswer( + df.selectExpr("get_json_object(a, '$.name')", "get_json_object(a, '$.age')"), + Row("fang", "5")) + } + val tuples: Seq[(String, String)] = ("1", """{"f1": "value1", "f2": "value2", "f3": 3, 
"f5": 5.23}""") :: ("2", """{"f1": "value12", "f3": "value3", "f2": 2, "f4": 4.01}""") :: @@ -214,33 +222,24 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { Row("""{"_1":"26/08/2015 18:00"}""") :: Nil) } - test("to_json - key types of map don't matter") { - // interval type is invalid for converting to JSON. However, the keys of a map are treated - // as strings, so its type doesn't matter. - val df = Seq(Tuple1(Tuple1("interval -3 month 7 hours"))).toDF("a") - .select(struct(map($"a._1".cast(CalendarIntervalType), lit("a")).as("col1")).as("c")) + test("to_json - interval support") { + val baseDf = Seq(Tuple1(Tuple1("-3 month 7 hours"))).toDF("a") + val df = baseDf.select(struct($"a._1".cast(CalendarIntervalType).as("a")).as("c")) checkAnswer( df.select(to_json($"c")), - Row("""{"col1":{"interval -3 months 7 hours":"a"}}""") :: Nil) - } + Row("""{"a":"-3 months 7 hours"}""") :: Nil) - test("to_json unsupported type") { - val baseDf = Seq(Tuple1(Tuple1("interval -3 month 7 hours"))).toDF("a") - val df = baseDf.select(struct($"a._1".cast(CalendarIntervalType).as("a")).as("c")) - val e = intercept[AnalysisException]{ - // Unsupported type throws an exception - df.select(to_json($"c")).collect() - } - assert(e.getMessage.contains( - "Unable to convert column a of type interval to JSON.")) + val df1 = baseDf + .select(struct(map($"a._1".cast(CalendarIntervalType), lit("a")).as("col1")).as("c")) + checkAnswer( + df1.select(to_json($"c")), + Row("""{"col1":{"-3 months 7 hours":"a"}}""") :: Nil) - // interval type is invalid for converting to JSON. We can't use it as value type of a map. 
val df2 = baseDf .select(struct(map(lit("a"), $"a._1".cast(CalendarIntervalType)).as("col1")).as("c")) - val e2 = intercept[AnalysisException] { - df2.select(to_json($"c")).collect() - } - assert(e2.getMessage.contains("Unable to convert column col1 of type interval to JSON")) + checkAnswer( + df2.select(to_json($"c")), + Row("""{"col1":{"a":"-3 months 7 hours"}}""") :: Nil) } test("roundtrip in to_json and from_json - struct") { @@ -608,4 +607,50 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { checkAnswer(df, Row(Row(java.sql.Timestamp.valueOf("2018-11-06 18:00:00.0")))) } } + + test("special timestamp values") { + Seq("now", "today", "epoch", "tomorrow", "yesterday").foreach { specialValue => + val input = Seq(s"""{"t": "$specialValue"}""").toDS() + val readback = input.select(from_json($"value", lit("t timestamp"), + Map.empty[String, String].asJava)).collect() + assert(readback(0).getAs[Row](0).getAs[Timestamp](0).getTime >= 0) + } + } + + test("special date values") { + Seq("now", "today", "epoch", "tomorrow", "yesterday").foreach { specialValue => + val input = Seq(s"""{"d": "$specialValue"}""").toDS() + val readback = input.select(from_json($"value", lit("d date"), + Map.empty[String, String].asJava)).collect() + assert(readback(0).getAs[Row](0).getAs[Date](0).getTime >= 0) + } + } + + test("from_json - timestamp in micros") { + val df = Seq("""{"time": "1970-01-01T00:00:00.123456"}""").toDS() + val schema = new StructType().add("time", TimestampType) + val options = Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss.SSSSSS") + + checkAnswer( + df.select(from_json($"value", schema, options)), + Row(Row(java.sql.Timestamp.valueOf("1970-01-01 00:00:00.123456")))) + } + + test("to_json - timestamp in micros") { + val s = "2019-11-18 11:56:00.123456" + val df = Seq(java.sql.Timestamp.valueOf(s)).toDF("t").select( + to_json(struct($"t"), Map("timestampFormat" -> "yyyy-MM-dd HH:mm:ss.SSSSSS"))) + checkAnswer(df, Row(s"""{"t":"$s"}""")) + } + + 
test("json_tuple - do not truncate results") { + Seq(2000, 2800, 8000 - 1, 8000, 8000 + 1, 65535).foreach { len => + val str = Array.tabulate(len)(_ => "a").mkString + val json_tuple_result = Seq(s"""{"test":"$str"}""").toDF("json") + .withColumn("result", json_tuple('json, "test")) + .select('result) + .as[String].head.length + assert(json_tuple_result === len) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala index 6b90f20a94fa4..36db95ff8a31b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/LocalSparkSession.scala @@ -27,14 +27,14 @@ trait LocalSparkSession extends BeforeAndAfterEach with BeforeAndAfterAll { self @transient var spark: SparkSession = _ - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() InternalLoggerFactory.setDefaultFactory(Slf4JLoggerFactory.INSTANCE) SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } - override def afterEach() { + override def afterEach(): Unit = { try { LocalSparkSession.stop(spark) SparkSession.clearActiveSession() @@ -47,7 +47,7 @@ trait LocalSparkSession extends BeforeAndAfterEach with BeforeAndAfterAll { self } object LocalSparkSession { - def stop(spark: SparkSession) { + def stop(spark: SparkSession): Unit = { if (spark != null) { spark.stop() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala index 567bcdd1878a8..bd86c2ec075b0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/MathFunctionsSuite.scala @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets import org.apache.spark.sql.functions._ import org.apache.spark.sql.functions.{log => logarithm} +import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession private object MathFunctionsTestData { @@ -218,19 +219,21 @@ class MathFunctionsSuite extends QueryTest with SharedSparkSession { Seq(Row(5, 0, 0), Row(55, 60, 100), Row(555, 560, 600)) ) - val pi = "3.1415" - checkAnswer( - sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " + - s"round($pi, 0), round($pi, 1), round($pi, 2), round($pi, 3)"), - Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3), - BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142"))) - ) - checkAnswer( - sql(s"SELECT bround($pi, -3), bround($pi, -2), bround($pi, -1), " + - s"bround($pi, 0), bround($pi, 1), bround($pi, 2), bround($pi, 3)"), - Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3), - BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142"))) - ) + withSQLConf(SQLConf.LEGACY_ALLOW_NEGATIVE_SCALE_OF_DECIMAL_ENABLED.key -> "true") { + val pi = "3.1415" + checkAnswer( + sql(s"SELECT round($pi, -3), round($pi, -2), round($pi, -1), " + + s"round($pi, 0), round($pi, 1), round($pi, 2), round($pi, 3)"), + Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3), + BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142"))) + ) + checkAnswer( + sql(s"SELECT bround($pi, -3), bround($pi, -2), bround($pi, -1), " + + s"bround($pi, 0), bround($pi, 1), bround($pi, 2), bround($pi, 3)"), + Seq(Row(BigDecimal("0E3"), BigDecimal("0E2"), BigDecimal("0E1"), BigDecimal(3), + BigDecimal("3.1"), BigDecimal("3.14"), BigDecimal("3.142"))) + ) + } val bdPi: BigDecimal = BigDecimal(31415925L, 7) checkAnswer( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala index cad0821dbf5aa..5ab06b1ebebf6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/MiscFunctionsSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.{SPARK_REVISION, SPARK_VERSION_SHORT} import org.apache.spark.sql.test.SharedSparkSession class MiscFunctionsSuite extends QueryTest with SharedSparkSession { @@ -31,6 +32,12 @@ class MiscFunctionsSuite extends QueryTest with SharedSparkSession { s"java_method('$className', 'method1', a, b)"), Row("m1one", "m1one")) } + + test("version") { + checkAnswer( + Seq("").toDF("a").selectExpr("version()"), + Row(SPARK_VERSION_SHORT + " " + SPARK_REVISION)) + } } object ReflectClass { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index 3039a4ccb677c..4a21ae9242039 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -21,6 +21,9 @@ import java.util.{Locale, TimeZone} import scala.collection.JavaConverters._ +import org.junit.Assert +import org.scalatest.Assertions + import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.SQLExecution @@ -150,10 +153,7 @@ abstract class QueryTest extends PlanTest { assertEmptyMissingInput(analyzedDF) - QueryTest.checkAnswer(analyzedDF, expectedAnswer) match { - case Some(errorMessage) => fail(errorMessage) - case None => - } + QueryTest.checkAnswer(analyzedDF, expectedAnswer) } protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { @@ -235,18 +235,32 @@ abstract class QueryTest extends PlanTest { } } -object QueryTest { +object QueryTest extends Assertions { + /** + * Runs the plan and makes sure the answer matches the expected result. + * + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. + * @param checkToRDD whether to verify deserialization to an RDD. 
This runs the query twice. + */ + def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean = true): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer, checkToRDD) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + /** * Runs the plan and makes sure the answer matches the expected result. * If there was exception during the execution or the contents of the DataFrame does not - * match the expected result, an error message will be returned. Otherwise, a [[None]] will + * match the expected result, an error message will be returned. Otherwise, a None will * be returned. * - * @param df the [[DataFrame]] to be executed - * @param expectedAnswer the expected result in a [[Seq]] of [[Row]]s. + * @param df the DataFrame to be executed + * @param expectedAnswer the expected result in a Seq of Rows. * @param checkToRDD whether to verify deserialization to an RDD. This runs the query twice. */ - def checkAnswer( + def getErrorMessageInCheckAnswer( df: DataFrame, expectedAnswer: Seq[Row], checkToRDD: Boolean = true): Option[String] = { @@ -408,10 +422,10 @@ object QueryTest { } } - def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): String = { - checkAnswer(df, expectedAnswer.asScala) match { - case Some(errorMessage) => errorMessage - case None => null + def checkAnswer(df: DataFrame, expectedAnswer: java.util.List[Row]): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer.asScala) match { + case Some(errorMessage) => Assert.fail(errorMessage) + case None => } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index b8b157e275b61..11f9724e587f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -22,11 +22,19 @@ import java.net.{MalformedURLException, URL} import java.sql.{Date, Timestamp} import 
java.util.concurrent.atomic.AtomicBoolean +import scala.collection.parallel.immutable.ParVector + import org.apache.spark.{AccumulatorSuite, SparkException} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.sql.catalyst.expressions.GenericRow +import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Partial} +import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.catalyst.util.StringUtils -import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} +import org.apache.spark.sql.execution.HiveResult.hiveResultString +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.command.FunctionsCommand import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan @@ -36,8 +44,9 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSparkSession, TestSQLContext} import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval -class SQLQuerySuite extends QueryTest with SharedSparkSession { +class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ setupTestData() @@ -55,7 +64,8 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { test("show functions") { def getFunctions(pattern: String): Seq[Row] = { StringUtils.filterPattern( - spark.sessionState.catalog.listFunctions("default").map(_._1.funcName), pattern) + spark.sessionState.catalog.listFunctions("default").map(_._1.funcName) + ++ 
FunctionsCommand.virtualOperators, pattern) .map(Row(_)) } @@ -115,6 +125,81 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } } + test("using _FUNC_ instead of function names in examples") { + val exampleRe = "(>.*;)".r + val setStmtRe = "(?i)^(>\\s+set\\s+).+".r + val ignoreSet = Set( + // Examples for CaseWhen show simpler syntax: + // `CASE WHEN ... THEN ... WHEN ... THEN ... END` + "org.apache.spark.sql.catalyst.expressions.CaseWhen", + // _FUNC_ is replaced by `locate` but `locate(... IN ...)` is not supported + "org.apache.spark.sql.catalyst.expressions.StringLocate", + // _FUNC_ is replaced by `%` which causes a parsing error on `SELECT %(2, 1.8)` + "org.apache.spark.sql.catalyst.expressions.Remainder", + // Examples demonstrate alternative names, see SPARK-20749 + "org.apache.spark.sql.catalyst.expressions.Length") + spark.sessionState.functionRegistry.listFunction().foreach { funcId => + val info = spark.sessionState.catalog.lookupFunctionInfo(funcId) + val className = info.getClassName + withClue(s"Expression class '$className'") { + val exprExamples = info.getOriginalExamples + if (!exprExamples.isEmpty && !ignoreSet.contains(className)) { + assert(exampleRe.findAllIn(exprExamples).toIterable + .filter(setStmtRe.findFirstIn(_).isEmpty) // Ignore SET commands + .forall(_.contains("_FUNC_"))) + } + } + } + } + + test("check outputs of expression examples") { + def unindentAndTrim(s: String): String = { + s.replaceAll("\n\\s+", "\n").trim + } + val beginSqlStmtRe = " > ".r + val endSqlStmtRe = ";\n".r + def checkExampleSyntax(example: String): Unit = { + val beginStmtNum = beginSqlStmtRe.findAllIn(example).length + val endStmtNum = endSqlStmtRe.findAllIn(example).length + assert(beginStmtNum === endStmtNum, + "The number of ` > ` does not match to the number of `;`") + } + val exampleRe = """^(.+);\n(?s)(.+)$""".r + val ignoreSet = Set( + // One of examples shows getting the current timestamp + 
"org.apache.spark.sql.catalyst.expressions.UnixTimestamp", + // Random output without a seed + "org.apache.spark.sql.catalyst.expressions.Rand", + "org.apache.spark.sql.catalyst.expressions.Randn", + "org.apache.spark.sql.catalyst.expressions.Shuffle", + "org.apache.spark.sql.catalyst.expressions.Uuid", + // The example calls methods that return unstable results. + "org.apache.spark.sql.catalyst.expressions.CallMethodViaReflection") + + val parFuncs = new ParVector(spark.sessionState.functionRegistry.listFunction().toVector) + parFuncs.foreach { funcId => + // Examples can change settings. We clone the session to prevent tests clashing. + val clonedSpark = spark.cloneSession() + val info = clonedSpark.sessionState.catalog.lookupFunctionInfo(funcId) + val className = info.getClassName + if (!ignoreSet.contains(className)) { + withClue(s"Function '${info.getName}', Expression class '$className'") { + val example = info.getExamples + checkExampleSyntax(example) + example.split(" > ").toList.foreach(_ match { + case exampleRe(sql, output) => + val df = clonedSpark.sql(sql) + val actual = unindentAndTrim( + hiveResultString(df.queryExecution.executedPlan).mkString("\n")) + val expected = unindentAndTrim(output) + assert(actual === expected) + case _ => + }) + } + } + } + } + test("SPARK-6743: no columns from cache") { Seq( (83, 0, 38), @@ -699,8 +784,9 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { | SELECT * FROM testData UNION ALL | SELECT * FROM testData) y |WHERE x.key = y.key""".stripMargin), - testData.rdd.flatMap( - row => Seq.fill(16)(Row.merge(row, row))).collect().toSeq) + testData.rdd.flatMap { row => + Seq.fill(16)(new GenericRow(Seq(row, row).flatMap(_.toSeq).toArray)) + }.collect().toSeq) } test("cartesian product join") { @@ -1473,7 +1559,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { import org.apache.spark.unsafe.types.CalendarInterval val df = sql("select interval 3 years -3 month 7 week 123 microseconds") - 
checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7L * 1000 * 1000 * 3600 * 24 * 7 + 123 ))) + checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7 * 7, 123 ))) withTempPath(f => { // Currently we don't yet support saving out values of interval data type. val e = intercept[AnalysisException] { @@ -1481,35 +1567,21 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } e.message.contains("Cannot save interval data type into external storage") }) - - val e1 = intercept[AnalysisException] { - sql("select interval") - } - assert(e1.message.contains("at least one time unit should be given for interval literal")) - - // Currently we don't yet support nanosecond - val e2 = intercept[AnalysisException] { - sql("select interval 23 nanosecond") - } - assert(e2.message.contains("no viable alternative at input 'interval 23 nanosecond'")) } test("SPARK-8945: add and subtract expressions for interval type") { - import org.apache.spark.unsafe.types.CalendarInterval - import org.apache.spark.unsafe.types.CalendarInterval.MICROS_PER_WEEK - val df = sql("select interval 3 years -3 month 7 week 123 microseconds as i") - checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7L * MICROS_PER_WEEK + 123))) + checkAnswer(df, Row(new CalendarInterval(12 * 3 - 3, 7 * 7, 123))) - checkAnswer(df.select(df("i") + new CalendarInterval(2, 123)), - Row(new CalendarInterval(12 * 3 - 3 + 2, 7L * MICROS_PER_WEEK + 123 + 123))) + checkAnswer(df.select(df("i") + new CalendarInterval(2, 1, 123)), + Row(new CalendarInterval(12 * 3 - 3 + 2, 7 * 7 + 1, 123 + 123))) - checkAnswer(df.select(df("i") - new CalendarInterval(2, 123)), - Row(new CalendarInterval(12 * 3 - 3 - 2, 7L * MICROS_PER_WEEK + 123 - 123))) + checkAnswer(df.select(df("i") - new CalendarInterval(2, 1, 123)), + Row(new CalendarInterval(12 * 3 - 3 - 2, 7 * 7 - 1, 123 - 123))) // unary minus checkAnswer(df.select(-df("i")), - Row(new CalendarInterval(-(12 * 3 - 3), -(7L * MICROS_PER_WEEK + 123)))) + Row(new 
CalendarInterval(-(12 * 3 - 3), -7 * 7, -123))) } test("aggregation with codegen updates peak execution memory") { @@ -2609,14 +2681,14 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } // Make sure no spurious job starts are pending in the listener bus. - sparkContext.listenerBus.waitUntilEmpty(500) + sparkContext.listenerBus.waitUntilEmpty() sparkContext.addSparkListener(listener) try { // Execute the command. sql("show databases").head() // Make sure we have seen all events triggered by DataFrame.show() - sparkContext.listenerBus.waitUntilEmpty(500) + sparkContext.listenerBus.waitUntilEmpty() } finally { sparkContext.removeSparkListener(listener) } @@ -2765,6 +2837,44 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { checkAnswer(df, Row(1, 3, 4) :: Row(2, 3, 4) :: Row(3, 3, 4) :: Nil) } + test("Support filter clause for aggregate function with hash aggregate") { + Seq(("COUNT(a)", 3), ("COLLECT_LIST(a)", Seq(1, 2, 3))).foreach { funcToResult => + val query = s"SELECT ${funcToResult._1} FILTER (WHERE b > 1) FROM testData2" + val df = sql(query) + val physical = df.queryExecution.sparkPlan + val aggregateExpressions = physical.collect { + case agg: HashAggregateExec => agg.aggregateExpressions + case agg: ObjectHashAggregateExec => agg.aggregateExpressions + }.flatten + aggregateExpressions.foreach { expr => + if (expr.mode == Complete || expr.mode == Partial) { + assert(expr.filter.isDefined) + } else { + assert(expr.filter.isEmpty) + } + } + checkAnswer(df, Row(funcToResult._2)) + } + } + + test("Support filter clause for aggregate function uses SortAggregateExec") { + withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { + val df = sql("SELECT PERCENTILE(a, 1) FILTER (WHERE b > 1) FROM testData2") + val physical = df.queryExecution.sparkPlan + val aggregateExpressions = physical.collect { + case agg: SortAggregateExec => agg.aggregateExpressions + }.flatten + aggregateExpressions.foreach { expr => + if (expr.mode == 
Complete || expr.mode == Partial) { + assert(expr.filter.isDefined) + } else { + assert(expr.filter.isEmpty) + } + } + checkAnswer(df, Row(3)) + } + } + test("Non-deterministic aggregate functions should not be deduplicated") { val query = "SELECT a, first_value(b), first_value(b) + 1 FROM testData2 GROUP BY a" val df = sql(query) @@ -3149,6 +3259,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { checkAnswer(sql("select * from t1 where d > '1999-13'"), Row(result)) checkAnswer(sql("select to_timestamp('2000-01-01 01:10:00') > '1'"), Row(true)) } + sql("DROP VIEW t1") } test("SPARK-28156: self-join should not miss cached view") { @@ -3171,7 +3282,7 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { |on leftside.a = rightside.a """.stripMargin) - val inMemoryTableScan = queryDf.queryExecution.executedPlan.collect { + val inMemoryTableScan = collect(queryDf.queryExecution.executedPlan) { case i: InMemoryTableScanExec => i } assert(inMemoryTableScan.size == 2) @@ -3180,6 +3291,109 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession { } } + + test("SPARK-29000: arithmetic computation overflow when don't allow decimal precision loss ") { + withSQLConf(SQLConf.DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key -> "false") { + val df1 = sql("select case when 1=2 then 1 else 100.000000000000000000000000 end * 1") + checkAnswer(df1, Array(Row(100))) + val df2 = sql("select case when 1=2 then 1 else 100.000000000000000000000000 end * " + + "case when 1=2 then 2 else 1 end") + checkAnswer(df2, Array(Row(100))) + val df3 = sql("select case when 1=2 then 1 else 1.000000000000000000000001 end / 10") + checkAnswer(df3, Array(Row(new java.math.BigDecimal("0.100000000000000000000000100")))) + } + } + + test("SPARK-29239: Subquery should not cause NPE when eliminating subexpression") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.SUBQUERY_REUSE_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> 
"CODEGEN_ONLY", + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> ConvertToLocalRelation.ruleName) { + withTempView("t1", "t2") { + sql("create temporary view t1 as select * from values ('val1a', 10L) as t1(t1a, t1b)") + sql("create temporary view t2 as select * from values ('val3a', 110L) as t2(t2a, t2b)") + val df = sql("SELECT min, min from (SELECT (SELECT min(t2b) FROM t2) min " + + "FROM t1 WHERE t1a = 'val1c')") + assert(df.collect().size == 0) + } + } + } + + test("SPARK-29213: FilterExec should not throw NPE") { + withTempView("t1", "t2", "t3") { + sql("SELECT ''").as[String].map(identity).toDF("x").createOrReplaceTempView("t1") + sql("SELECT * FROM VALUES 0, CAST(NULL AS BIGINT)") + .as[java.lang.Long] + .map(identity) + .toDF("x") + .createOrReplaceTempView("t2") + sql("SELECT ''").as[String].map(identity).toDF("x").createOrReplaceTempView("t3") + sql( + """ + |SELECT t1.x + |FROM t1 + |LEFT JOIN ( + | SELECT x FROM ( + | SELECT x FROM t2 + | UNION ALL + | SELECT SUBSTR(x,5) x FROM t3 + | ) a + | WHERE LENGTH(x)>0 + |) t3 + |ON t1.x=t3.x + """.stripMargin).collect() + } + } + + test("SPARK-29682: Conflicting attributes in Expand are resolved") { + val numsDF = Seq(1, 2, 3).toDF("nums") + val cubeDF = numsDF.cube("nums").agg(max(lit(0)).as("agcol")) + + checkAnswer( + cubeDF.join(cubeDF, "nums"), + Row(1, 0, 0) :: Row(2, 0, 0) :: Row(3, 0, 0) :: Nil) + } + + test("SPARK-29860: Fix dataType mismatch issue for InSubquery") { + withTempView("ta", "tb", "tc", "td", "te", "tf") { + sql("CREATE TEMPORARY VIEW ta AS SELECT * FROM VALUES(CAST(1 AS DECIMAL(8, 0))) AS ta(id)") + sql("CREATE TEMPORARY VIEW tb AS SELECT * FROM VALUES(CAST(1 AS DECIMAL(7, 2))) AS tb(id)") + sql("CREATE TEMPORARY VIEW tc AS SELECT * FROM VALUES(CAST(1 AS DOUBLE)) AS tc(id)") + sql("CREATE TEMPORARY VIEW td AS SELECT * FROM VALUES(CAST(1 AS FLOAT)) AS td(id)") + sql("CREATE TEMPORARY VIEW te AS SELECT * FROM VALUES(CAST(1 AS BIGINT)) AS te(id)") + sql("CREATE TEMPORARY VIEW tf AS SELECT * FROM 
VALUES(CAST(1 AS DECIMAL(38, 38))) AS tf(id)") + val df1 = sql("SELECT id FROM ta WHERE id IN (SELECT id FROM tb)") + checkAnswer(df1, Row(new java.math.BigDecimal(1))) + val df2 = sql("SELECT id FROM ta WHERE id IN (SELECT id FROM tc)") + checkAnswer(df2, Row(new java.math.BigDecimal(1))) + val df3 = sql("SELECT id FROM ta WHERE id IN (SELECT id FROM td)") + checkAnswer(df3, Row(new java.math.BigDecimal(1))) + val df4 = sql("SELECT id FROM ta WHERE id IN (SELECT id FROM te)") + checkAnswer(df4, Row(new java.math.BigDecimal(1))) + val df5 = sql("SELECT id FROM ta WHERE id IN (SELECT id FROM tf)") + checkAnswer(df5, Array.empty[Row]) + } + } + + test("SPARK-30447: fix constant propagation inside NOT") { + withTempView("t") { + Seq[Integer](1, null).toDF("c").createOrReplaceTempView("t") + val df = sql("SELECT * FROM t WHERE NOT(c = 1 AND c + 1 = 1)") + + checkAnswer(df, Row(1)) + } + } + + test("SPARK-26218: Fix the corner case when casting float to Integer") { + withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + intercept[ArithmeticException]( + sql("SELECT CAST(CAST(2147483648 as FLOAT) as Integer)").collect() + ) + intercept[ArithmeticException]( + sql("SELECT CAST(CAST(2147483648 as DOUBLE) as Integer)").collect() + ) + } + } } case class Foo(bar: Option[String]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 28ca0edaef871..83285911b3948 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -19,19 +19,23 @@ package org.apache.spark.sql import java.io.File import java.util.{Locale, TimeZone} +import java.util.regex.Pattern +import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal +import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.sql.catalyst.planning.PhysicalOperation import 
org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.plans.logical.sql.{DescribeColumnStatement, DescribeTableStatement} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.{fileToString, stringToFile} import org.apache.spark.sql.execution.HiveResult.hiveResultString +import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.{DescribeColumnCommand, DescribeCommandBase} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType +import org.apache.spark.tags.ExtendedSQLTest /** * End-to-end test cases for SQL queries. @@ -60,9 +64,24 @@ import org.apache.spark.sql.types.StructType * }}} * * The format for input files is simple: - * 1. A list of SQL queries separated by semicolon. + * 1. A list of SQL queries separated by semicolons by default. If the semicolon cannot effectively + * separate the SQL queries in the test file(e.g. bracketed comments), please use + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END represent the beginning and end of a query, + * respectively. Code that is not surrounded by lines that begin with --QUERY-DELIMITER-START + * and --QUERY-DELIMITER-END is still separated by semicolons. * 2. Lines starting with -- are treated as comments and ignored. - * 3. Lines starting with --SET are used to run the file with the following set of configs. + * 3. Lines starting with --SET are used to specify the configs when running this testing file. You + * can set multiple configs in one --SET, using comma to separate them. Or you can use multiple + * --SET statements. + * 4. Lines starting with --IMPORT are used to load queries from another test file. + * 5. Lines starting with --CONFIG_DIM are used to specify config dimensions of this testing file. 
+ * The dimension name is decided by the string after --CONFIG_DIM. For example, --CONFIG_DIM1 + * belongs to dimension 1. One dimension can have multiple lines, each line representing one + * config set (one or more configs, separated by comma). Spark will run this testing file many + * times, each time picks one config set from each dimension, until all the combinations are + * tried. For example, if dimension 1 has 2 lines, dimension 2 has 3 lines, this testing file + * will be run 6 times (cartesian product). * * For example: * {{{ @@ -75,16 +94,16 @@ import org.apache.spark.sql.types.StructType * {{{ * -- some header information * - * -- !query 0 + * -- !query * select 1, -1 - * -- !query 0 schema + * -- !query schema * struct<...schema...> - * -- !query 0 output + * -- !query output * ... data row 1 ... * ... data row 2 ... * ... * - * -- !query 1 + * -- !query * ... * }}} * @@ -102,12 +121,12 @@ import org.apache.spark.sql.types.StructType * Therefore, UDF test cases should have single input and output files but executed by three * different types of UDFs. See 'udf/udf-inner-join.sql' as an example. */ +@ExtendedSQLTest class SQLQueryTestSuite extends QueryTest with SharedSparkSession { import IntegratedUDFTestUtils._ private val regenerateGoldenFiles: Boolean = System.getenv("SPARK_GENERATE_GOLDEN_FILES") == "1" - protected val isTestWithConfigSets: Boolean = true protected val baseResourcePath = { // We use a path based on Spark home for 2 reasons: @@ -131,23 +150,29 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { private val notIncludedMsg = "[not included in comparison]" private val clsName = this.getClass.getCanonicalName + protected val emptySchema = StructType(Seq.empty).catalogString + + protected override def sparkConf: SparkConf = super.sparkConf + // Fewer shuffle partitions to speed up testing. + .set(SQLConf.SHUFFLE_PARTITIONS, 4) + /** List of test cases to ignore, in lower cases. 
*/ protected def blackList: Set[String] = Set( "blacklist.sql" // Do NOT remove this one. It is here to test the blacklist functionality. ) // Create all the test cases. - listTestCases().foreach(createScalaTestCase) + listTestCases.foreach(createScalaTestCase) /** A single SQL query's output. */ protected case class QueryOutput(sql: String, schema: String, output: String) { - def toString(queryIndex: Int): String = { + override def toString: String = { // We are explicitly not using multi-line string due to stripMargin removing "|" in output. - s"-- !query $queryIndex\n" + + s"-- !query\n" + sql + "\n" + - s"-- !query $queryIndex schema\n" + + s"-- !query schema\n" + schema + "\n" + - s"-- !query $queryIndex output\n" + + s"-- !query output\n" + output } } @@ -165,6 +190,11 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { */ protected trait PgSQLTest + /** + * traits that indicate ANSI-related tests with the ANSI mode enabled. + */ + protected trait AnsiTest + protected trait UDFTest { val udf: TestUDF } @@ -191,6 +221,10 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { resultFile: String, udf: TestUDF) extends TestCase with UDFTest with PgSQLTest + /** An ANSI-related test case. */ + protected case class AnsiTestCase( + name: String, inputFile: String, resultFile: String) extends TestCase with AnsiTest + protected def createScalaTestCase(testCase: TestCase): Unit = { if (blackList.exists(t => testCase.name.toLowerCase(Locale.ROOT).contains(t.toLowerCase(Locale.ROOT)))) { @@ -217,55 +251,94 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { } } - // For better test coverage, runs the tests on mixed config sets: WHOLESTAGE_CODEGEN_ENABLED - // and CODEGEN_FACTORY_MODE. 
- private lazy val codegenConfigSets = Array( - ("true", "CODEGEN_ONLY"), - ("false", "CODEGEN_ONLY"), - ("false", "NO_CODEGEN") - ).map { case (wholeStageCodegenEnabled, codegenFactoryMode) => - Array(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> wholeStageCodegenEnabled, - SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactoryMode) - } - /** Run a test case. */ protected def runTest(testCase: TestCase): Unit = { + def splitWithSemicolon(seq: Seq[String]) = { + seq.mkString("\n").split("(?<=[^\\\\]);") + } val input = fileToString(new File(testCase.inputFile)) - val (comments, code) = input.split("\n").partition(_.trim.startsWith("--")) + val (comments, code) = input.split("\n").partition { line => + val newLine = line.trim + newLine.startsWith("--") && !newLine.startsWith("--QUERY-DELIMITER") + } + + // If `--IMPORT` found, load code from another test case file, then insert them + // into the head in this test. + val importedTestCaseName = comments.filter(_.startsWith("--IMPORT ")).map(_.substring(9)) + val importedCode = importedTestCaseName.flatMap { testCaseName => + listTestCases.find(_.name == testCaseName).map { testCase => + val input = fileToString(new File(testCase.inputFile)) + val (_, code) = input.split("\n").partition(_.trim.startsWith("--")) + code + } + }.flatten + + val allCode = importedCode ++ code + val tempQueries = if (allCode.exists(_.trim.startsWith("--QUERY-DELIMITER"))) { + // Although the loop is heavy, only used for bracketed comments test. 
+ val querys = new ArrayBuffer[String] + val otherCodes = new ArrayBuffer[String] + var tempStr = "" + var start = false + for (c <- allCode) { + if (c.trim.startsWith("--QUERY-DELIMITER-START")) { + start = true + querys ++= splitWithSemicolon(otherCodes.toSeq) + otherCodes.clear() + } else if (c.trim.startsWith("--QUERY-DELIMITER-END")) { + start = false + querys += s"\n${tempStr.stripSuffix(";")}" + tempStr = "" + } else if (start) { + tempStr += s"\n$c" + } else { + otherCodes += c + } + } + if (otherCodes.nonEmpty) { + querys ++= splitWithSemicolon(otherCodes.toSeq) + } + querys.toSeq + } else { + splitWithSemicolon(allCode).toSeq + } // List of SQL queries to run - // note: this is not a robust way to split queries using semicolon, but works for now. - val queries = code.mkString("\n").split("(?<=[^\\\\]);").map(_.trim).filter(_ != "").toSeq + val queries = tempQueries.map(_.trim).filter(_ != "").toSeq // Fix misplacement when comment is at the end of the query. .map(_.split("\n").filterNot(_.startsWith("--")).mkString("\n")).map(_.trim).filter(_ != "") - // When we are regenerating the golden files, we don't need to set any config as they - // all need to return the same result - if (regenerateGoldenFiles || !isTestWithConfigSets) { - runQueries(queries, testCase, None) + val settingLines = comments.filter(_.startsWith("--SET ")).map(_.substring(6)) + val settings = settingLines.flatMap(_.split(",").map { kv => + val (conf, value) = kv.span(_ != '=') + conf.trim -> value.substring(1).trim + }) + + if (regenerateGoldenFiles) { + runQueries(queries, testCase, settings) } else { - val configSets = { - val configLines = comments.filter(_.startsWith("--SET")).map(_.substring(5)) - val configs = configLines.map(_.split(",").map { confAndValue => - val (conf, value) = confAndValue.span(_ != '=') + // A config dimension has multiple config sets, and a config set has multiple configs. 
+ // - config dim: Seq[Seq[(String, String)]] + // - config set: Seq[(String, String)] + // - config: (String, String)) + // We need to do cartesian product for all the config dimensions, to get a list of + // config sets, and run the query once for each config set. + val configDimLines = comments.filter(_.startsWith("--CONFIG_DIM")).map(_.substring(12)) + val configDims = configDimLines.groupBy(_.takeWhile(_ != ' ')).mapValues { lines => + lines.map(_.dropWhile(_ != ' ').substring(1)).map(_.split(",").map { kv => + val (conf, value) = kv.span(_ != '=') conf.trim -> value.substring(1).trim - }) + }.toSeq).toSeq + } - if (configs.nonEmpty) { - codegenConfigSets.flatMap { codegenConfig => - configs.map { config => - config ++ codegenConfig - } - } - } else { - codegenConfigSets - } + val configSets = configDims.values.foldLeft(Seq(Seq[(String, String)]())) { (res, dim) => + dim.flatMap { configSet => res.map(_ ++ configSet) } } configSets.foreach { configSet => try { - runQueries(queries, testCase, Some(configSet)) + runQueries(queries, testCase, settings ++ configSet) } catch { case e: Throwable => val configs = configSet.map { @@ -281,7 +354,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { protected def runQueries( queries: Seq[String], testCase: TestCase, - configSet: Option[Seq[(String, String)]]): Unit = { + configSet: Seq[(String, String)]): Unit = { // Create a local SparkSession to have stronger isolation between different test cases. // This does not isolate catalog changes. val localSparkSession = spark.newSession() @@ -289,10 +362,6 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { testCase match { case udfTestCase: UDFTest => - // In Python UDF tests, the number of shuffle partitions matters considerably in - // the testing time because it requires to fork and communicate between external - // processes. 
- localSparkSession.conf.set(SQLConf.SHUFFLE_PARTITIONS.key, 4) registerTestUDF(udfTestCase.udf, localSparkSession) case _ => } @@ -304,26 +373,27 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { localSparkSession.udf.register("boolne", (b1: Boolean, b2: Boolean) => b1 != b2) // vol used by boolean.sql and case.sql. localSparkSession.udf.register("vol", (s: String) => s) - // PostgreSQL enabled cartesian product by default. - localSparkSession.conf.set(SQLConf.CROSS_JOINS_ENABLED.key, true) - localSparkSession.conf.set(SQLConf.ANSI_SQL_PARSER.key, true) - localSparkSession.conf.set(SQLConf.PREFER_INTEGRAL_DIVISION.key, true) + localSparkSession.conf.set(SQLConf.ANSI_ENABLED.key, true) + case _: AnsiTest => + localSparkSession.conf.set(SQLConf.ANSI_ENABLED.key, true) case _ => } + localSparkSession.conf.set(SQLConf.DATETIME_JAVA8API_ENABLED.key, true) - if (configSet.isDefined) { + if (configSet.nonEmpty) { // Execute the list of set operation in order to add the desired configs - val setOperations = configSet.get.map { case (key, value) => s"set $key=$value" } + val setOperations = configSet.map { case (key, value) => s"set $key=$value" } logInfo(s"Setting configs: ${setOperations.mkString(", ")}") setOperations.foreach(localSparkSession.sql) } + // Run the SQL queries preparing them for comparison. val outputs: Seq[QueryOutput] = queries.map { sql => - val (schema, output) = getNormalizedResult(localSparkSession, sql) + val (schema, output) = handleExceptions(getNormalizedResult(localSparkSession, sql)) // We might need to do some query canonicalization in the future. 
QueryOutput( sql = sql, - schema = schema.catalogString, + schema = schema, output = output.mkString("\n").replaceAll("\\s+$", "")) } @@ -332,7 +402,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { val goldenOutput = { s"-- Automatically generated by ${getClass.getSimpleName}\n" + s"-- Number of queries: ${outputs.size}\n\n\n" + - outputs.zipWithIndex.map{case (qr, i) => qr.toString(i)}.mkString("\n\n\n") + "\n" + outputs.zipWithIndex.map{case (qr, i) => qr.toString}.mkString("\n\n\n") + "\n" } val resultFile = new File(testCase.resultFile) val parent = resultFile.getParentFile @@ -345,11 +415,25 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { // This is a temporary workaround for SPARK-28894. The test names are truncated after // the last dot due to a bug in SBT. This makes easier to debug via Jenkins test result // report. See SPARK-28894. - withClue(s"${testCase.name}${System.lineSeparator()}") { + // See also SPARK-29127. It is difficult to see the version information in the failed test + // cases so the version information related to Python was also added. + val clue = testCase match { + case udfTestCase: UDFTest + if udfTestCase.udf.isInstanceOf[TestPythonUDF] && shouldTestPythonUDFs => + s"${testCase.name}${System.lineSeparator()}Python: $pythonVer${System.lineSeparator()}" + case udfTestCase: UDFTest + if udfTestCase.udf.isInstanceOf[TestScalarPandasUDF] && shouldTestScalarPandasUDFs => + s"${testCase.name}${System.lineSeparator()}" + + s"Python: $pythonVer Pandas: $pandasVer PyArrow: $pyarrowVer${System.lineSeparator()}" + case _ => + s"${testCase.name}${System.lineSeparator()}" + } + + withClue(clue) { // Read back the golden file. 
val expectedOutputs: Seq[QueryOutput] = { val goldenOutput = fileToString(new File(testCase.resultFile)) - val segments = goldenOutput.split("-- !query.+\n") + val segments = goldenOutput.split("-- !query.*\n") // each query has 3 segments, plus the header assert(segments.size == outputs.size * 3 + 1, @@ -377,53 +461,69 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { s"Schema did not match for query #$i\n${expected.sql}: $output") { output.schema } - assertResult(expected.output, s"Result did not match for query #$i\n${expected.sql}") { - output.output - } + assertResult(expected.output, s"Result did not match" + + s" for query #$i\n${expected.sql}") { output.output } } } } + /** + * This method handles exceptions occurred during query execution as they may need special care + * to become comparable to the expected output. + * + * @param result a function that returns a pair of schema and output + */ + protected def handleExceptions(result: => (String, Seq[String])): (String, Seq[String]) = { + try { + result + } catch { + case a: AnalysisException => + // Do not output the logical plan tree which contains expression IDs. + // Also implement a crude way of masking expression IDs in the error message + // with a generic pattern "###". + val msg = if (a.plan.nonEmpty) a.getSimpleMessage else a.getMessage + (emptySchema, Seq(a.getClass.getName, msg.replaceAll("#\\d+", "#x"))) + case s: SparkException if s.getCause != null => + // For a runtime exception, it is hard to match because its message contains + // information of stage, task ID, etc. + // To make result matching simpler, here we match the cause of the exception if it exists. + val cause = s.getCause + (emptySchema, Seq(cause.getClass.getName, cause.getMessage)) + case NonFatal(e) => + // If there is an exception, put the exception class followed by the message. 
+ (emptySchema, Seq(e.getClass.getName, e.getMessage)) + } + } + /** Executes a query and returns the result as (schema of the output, normalized output). */ - private def getNormalizedResult(session: SparkSession, sql: String): (StructType, Seq[String]) = { + private def getNormalizedResult(session: SparkSession, sql: String): (String, Seq[String]) = { // Returns true if the plan is supposed to be sorted. def isSorted(plan: LogicalPlan): Boolean = plan match { case _: Join | _: Aggregate | _: Generate | _: Sample | _: Distinct => false case _: DescribeCommandBase | _: DescribeColumnCommand - | _: DescribeTableStatement + | _: DescribeRelation | _: DescribeColumnStatement => true case PhysicalOperation(_, _, Sort(_, true, _)) => true case _ => plan.children.iterator.exists(isSorted) } - try { - val df = session.sql(sql) - val schema = df.schema - // Get answer, but also get rid of the #1234 expression ids that show up in explain plans - val answer = hiveResultString(df.queryExecution.executedPlan).map(replaceNotIncludedMsg) - - // If the output is not pre-sorted, sort it. - if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted) - - } catch { - case a: AnalysisException => - // Do not output the logical plan tree which contains expression IDs. - // Also implement a crude way of masking expression IDs in the error message - // with a generic pattern "###". - val msg = if (a.plan.nonEmpty) a.getSimpleMessage else a.getMessage - (StructType(Seq.empty), Seq(a.getClass.getName, msg.replaceAll("#\\d+", "#x"))) - case NonFatal(e) => - // If there is an exception, put the exception class followed by the message. 
- (StructType(Seq.empty), Seq(e.getClass.getName, e.getMessage)) + val df = session.sql(sql) + val schema = df.schema.catalogString + // Get answer, but also get rid of the #1234 expression ids that show up in explain plans + val answer = SQLExecution.withNewExecutionId(df.queryExecution, Some(sql)) { + hiveResultString(df.queryExecution.executedPlan).map(replaceNotIncludedMsg) } + + // If the output is not pre-sorted, sort it. + if (isSorted(df.queryExecution.analyzed)) (schema, answer) else (schema, answer.sorted) } protected def replaceNotIncludedMsg(line: String): String = { line.replaceAll("#\\d+", "#x") .replaceAll( - s"Location.*/sql/core/spark-warehouse/$clsName/", - s"Location ${notIncludedMsg}sql/core/spark-warehouse/") + s"Location.*$clsName/", + s"Location $notIncludedMsg/{warehouse_dir}/") .replaceAll("Created By.*", s"Created By $notIncludedMsg") .replaceAll("Created Time.*", s"Created Time $notIncludedMsg") .replaceAll("Last Access.*", s"Last Access $notIncludedMsg") @@ -431,14 +531,14 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { .replaceAll("\\*\\(\\d+\\) ", "*") // remove the WholeStageCodegen codegenStageIds } - protected def listTestCases(): Seq[TestCase] = { + protected lazy val listTestCases: Seq[TestCase] = { listFilesRecursively(new File(inputFilePath)).flatMap { file => val resultFile = file.getAbsolutePath.replace(inputFilePath, goldenFilePath) + ".out" val absPath = file.getAbsolutePath val testCaseName = absPath.stripPrefix(inputFilePath).stripPrefix(File.separator) if (file.getAbsolutePath.startsWith( - s"$inputFilePath${File.separator}udf${File.separator}pgSQL")) { + s"$inputFilePath${File.separator}udf${File.separator}postgreSQL")) { Seq(TestScalaUDF("udf"), TestPythonUDF("udf"), TestScalarPandasUDF("udf")).map { udf => UDFPgSQLTestCase( s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) @@ -448,8 +548,10 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession { UDFTestCase( 
s"$testCaseName - ${udf.prettyName}", absPath, resultFile, udf) } - } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}pgSQL")) { + } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}postgreSQL")) { PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil + } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}ansi")) { + AnsiTestCase(testCaseName, absPath, resultFile) :: Nil } else { RegularTestCase(testCaseName, absPath, resultFile) :: Nil } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala index 9a0c61b3304c5..099b559105fe8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.util.resourceToString */ class SSBQuerySuite extends BenchmarkQueryTest { - override def beforeAll { + override def beforeAll: Unit = { super.beforeAll sql( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala index 1d461a03fd1f6..31957a99e15af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SessionStateSuite.scala @@ -158,7 +158,7 @@ class SessionStateSuite extends SparkFunSuite { assert(forkedSession ne activeSession) assert(forkedSession.listenerManager ne activeSession.listenerManager) runCollectQueryOn(forkedSession) - activeSession.sparkContext.listenerBus.waitUntilEmpty(1000) + activeSession.sparkContext.listenerBus.waitUntilEmpty() assert(collectorA.commands.length == 1) // forked should callback to A assert(collectorA.commands(0) == "collect") @@ -166,14 +166,14 @@ class SessionStateSuite extends SparkFunSuite { // => changes to forked do not affect original 
forkedSession.listenerManager.register(collectorB) runCollectQueryOn(activeSession) - activeSession.sparkContext.listenerBus.waitUntilEmpty(1000) + activeSession.sparkContext.listenerBus.waitUntilEmpty() assert(collectorB.commands.isEmpty) // original should not callback to B assert(collectorA.commands.length == 2) // original should still callback to A assert(collectorA.commands(1) == "collect") // <= changes to original do not affect forked activeSession.listenerManager.register(collectorC) runCollectQueryOn(forkedSession) - activeSession.sparkContext.listenerBus.waitUntilEmpty(1000) + activeSession.sparkContext.listenerBus.waitUntilEmpty() assert(collectorC.commands.isEmpty) // forked should not callback to C assert(collectorA.commands.length == 3) // forked should still callback to A assert(collectorB.commands.length == 1) // forked should still callback to B diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala index 42307b1b9734e..b3b94f8be0d17 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ShowCreateTableSuite.scala @@ -148,20 +148,6 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { } } - test("view") { - withView("v1") { - sql("CREATE VIEW v1 AS SELECT 1 AS a") - checkCreateView("v1") - } - } - - test("view with output columns") { - withView("v1") { - sql("CREATE VIEW v1 (b) AS SELECT 1 AS a") - checkCreateView("v1") - } - } - test("temp view") { val viewName = "spark_28383" withTempView(viewName) { @@ -186,17 +172,22 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { withTable("t1") { val createTable = "CREATE TABLE `t1` (`a` STRUCT<`b`: STRING>)" sql(s"$createTable USING json") - val shownDDL = sql(s"SHOW CREATE TABLE t1") - .head() - .getString(0) - .split("\n") - .head + val shownDDL = getShowDDL("SHOW CREATE 
TABLE t1") assert(shownDDL == createTable) checkCreateTable("t1") } } + protected def getShowDDL(showCreateTableSql: String): String = { + val result = sql(showCreateTableSql) + .head() + .getString(0) + .split("\n") + .map(_.trim) + if (result.length > 1) result(0) + result(1) else result.head + } + protected def checkCreateTable(table: String): Unit = { checkCreateTableOrView(TableIdentifier(table, Some("default")), "TABLE") } @@ -220,7 +211,7 @@ abstract class ShowCreateTableSuite extends QueryTest with SQLTestUtils { } } - private def checkCatalogTables(expected: CatalogTable, actual: CatalogTable): Unit = { + protected def checkCatalogTables(expected: CatalogTable, actual: CatalogTable): Unit = { def normalize(table: CatalogTable): CatalogTable = { val nondeterministicProps = Set( "CreateTime", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala index 74341f93dd5ba..99ea95089d71c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SparkSessionExtensionSuite.scala @@ -16,16 +16,20 @@ */ package org.apache.spark.sql +import java.util.Locale + import org.apache.spark.{SparkFunSuite, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface} -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, UnresolvedHint} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector +import org.apache.spark.sql.internal.SQLConf +import 
org.apache.spark.sql.internal.SQLConf.COLUMN_BATCH_SIZE import org.apache.spark.sql.internal.StaticSQLConf.SPARK_SESSION_EXTENSIONS import org.apache.spark.sql.types.{DataType, Decimal, IntegerType, LongType, Metadata, StructType} import org.apache.spark.sql.vectorized.{ColumnarArray, ColumnarBatch, ColumnarMap, ColumnVector} @@ -122,12 +126,33 @@ class SparkSessionExtensionSuite extends SparkFunSuite { } } + case class MyHintRule(spark: SparkSession) extends Rule[LogicalPlan] { + val MY_HINT_NAME = Set("CONVERT_TO_EMPTY") + + override def apply(plan: LogicalPlan): LogicalPlan = + plan.resolveOperators { + case h: UnresolvedHint if MY_HINT_NAME.contains(h.name.toUpperCase(Locale.ROOT)) => + LocalRelation(h.output, data = Seq.empty, isStreaming = h.isStreaming) + } + } + + test("inject custom hint rule") { + withSession(Seq(_.injectPostHocResolutionRule(MyHintRule))) { session => + assert( + session.range(1).hint("CONVERT_TO_EMPTY").logicalPlan.isInstanceOf[LocalRelation], + "plan is expected to be a local relation" + ) + } + } + test("inject columnar") { val extensions = create { extensions => extensions.injectColumnar(session => MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } withSession(extensions) { session => + // The ApplyColumnarRulesAndInsertTransitions rule is not applied when enable AQE + session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false) assert(session.sessionState.columnarRules.contains( MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) import session.sqlContext.implicits._ @@ -150,6 +175,30 @@ class SparkSessionExtensionSuite extends SparkFunSuite { } } + test("reset column vectors") { + val session = SparkSession.builder() + .master("local[1]") + .config(COLUMN_BATCH_SIZE.key, 2) + .withExtensions { extensions => + extensions.injectColumnar(session => + MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())) } + .getOrCreate() + + try { + 
assert(session.sessionState.columnarRules.contains( + MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))) + import session.sqlContext.implicits._ + + val input = Seq((100L), (200L), (300L)) + val data = input.toDF("vals").repartition(1) + val df = data.selectExpr("vals + 1") + val result = df.collect() + assert(result sameElements input.map(x => Row(x + 2))) + } finally { + stop(session) + } + } + test("use custom class for extensions") { val session = SparkSession.builder() .master("local[1]") @@ -283,7 +332,20 @@ case class MyParser(spark: SparkSession, delegate: ParserInterface) extends Pars object MyExtensions { val myFunction = (FunctionIdentifier("myFunction"), - new ExpressionInfo("noClass", "myDb", "myFunction", "usage", "extended usage"), + new ExpressionInfo( + "noClass", + "myDb", + "myFunction", + "usage", + "extended usage", + " Examples:", + """ + note + """, + "3.0.0", + """ + deprecated + """), (_: Seq[Expression]) => Literal(5, IntegerType)) } @@ -680,7 +742,20 @@ case class MySparkStrategy2(spark: SparkSession) extends SparkStrategy { object MyExtensions2 { val myFunction = (FunctionIdentifier("myFunction2"), - new ExpressionInfo("noClass", "myDb", "myFunction2", "usage", "extended usage"), + new ExpressionInfo( + "noClass", + "myDb", + "myFunction2", + "usage", + "extended usage", + " Examples:", + """ + note + """, + "3.0.0", + """ + deprecated + """), (_: Seq[Expression]) => Literal(5, IntegerType)) } @@ -699,7 +774,20 @@ class MyExtensions2 extends (SparkSessionExtensions => Unit) { object MyExtensions2Duplicate { val myFunction = (FunctionIdentifier("myFunction2"), - new ExpressionInfo("noClass", "myDb", "myFunction2", "usage", "extended usage"), + new ExpressionInfo( + "noClass", + "myDb", + "myFunction2", + "usage", + "extended usage", + " Examples:", + """ + note + """, + "3.0.0", + """ + deprecated + """), (_: Seq[Expression]) => Literal(5, IntegerType)) } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala index 915f66526c3e6..fde8ddf491bd1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala @@ -27,6 +27,7 @@ import scala.util.Random import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.{CatalogColumnStat, CatalogStatistics, CatalogTable, HiveTableRelation} +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, HistogramSerializer, LogicalPlan} import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -238,10 +239,14 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils getTableFromCatalogCache(tableName) != null } - def getCatalogStatistics(tableName: String): CatalogStatistics = { + def getTableStats(tableName: String): CatalogStatistics = { getCatalogTable(tableName).stats.get } + def getPartitionStats(tableName: String, partSpec: TablePartitionSpec): CatalogStatistics = { + spark.sessionState.catalog.getPartition(TableIdentifier(tableName), partSpec).stats.get + } + def checkTableStats( tableName: String, hasSizeInBytes: Boolean, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 88b3e5ec61f8a..ec698818a0d85 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -129,18 +129,37 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { Row("AQIDBA==", bytes)) } - 
test("overlay function") { + test("string overlay function") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. - val df = Seq(("Spark SQL", "Spark的SQL")).toDF("a", "b") - checkAnswer(df.select(overlay($"a", "_", 6)), Row("Spark_SQL")) - checkAnswer(df.select(overlay($"a", "CORE", 7)), Row("Spark CORE")) - checkAnswer(df.select(overlay($"a", "ANSI ", 7, 0)), Row("Spark ANSI SQL")) - checkAnswer(df.select(overlay($"a", "tructured", 2, 4)), Row("Structured SQL")) - checkAnswer(df.select(overlay($"b", "_", 6)), Row("Spark_SQL")) + val df = Seq(("Spark SQL", "Spark的SQL", "_", "CORE", "ANSI ", "tructured", 6, 7, 0, 2, 4)). + toDF("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k") + checkAnswer(df.select(overlay($"a", $"c", $"g")), Row("Spark_SQL")) + checkAnswer(df.select(overlay($"a", $"d", $"h")), Row("Spark CORE")) + checkAnswer(df.select(overlay($"a", $"e", $"h", $"i")), Row("Spark ANSI SQL")) + checkAnswer(df.select(overlay($"a", $"f", $"j", $"k")), Row("Structured SQL")) + checkAnswer(df.select(overlay($"b", $"c", $"g")), Row("Spark_SQL")) // scalastyle:on } + test("binary overlay function") { + // non ascii characters are not allowed in the code, so we disable the scalastyle here. 
+ val df = Seq(( + Array[Byte](1, 2, 3, 4, 5, 6, 7, 8, 9), + Array[Byte](-1), + Array[Byte](-1, -1, -1, -1), + Array[Byte](-1, -1), + Array[Byte](-1, -1, -1, -1, -1), + 6, 7, 0, 2, 4)).toDF("a", "b", "c", "d", "e", "f", "g", "h", "i", "j") + checkAnswer(df.select(overlay($"a", $"b", $"f")), Row(Array[Byte](1, 2, 3, 4, 5, -1, 7, 8, 9))) + checkAnswer(df.select(overlay($"a", $"c", $"g")), + Row(Array[Byte](1, 2, 3, 4, 5, 6, -1, -1, -1, -1))) + checkAnswer(df.select(overlay($"a", $"d", $"g", $"h")), + Row(Array[Byte](1, 2, 3, 4, 5, 6, -1, -1, 7, 8, 9))) + checkAnswer(df.select(overlay($"a", $"e", $"i", $"j")), + Row(Array[Byte](1, -1, -1, -1, -1, -1, 6, 7, 8, 9))) + } + test("string / binary substring function") { // scalastyle:off // non ascii characters are not allowed in the code, so we disable the scalastyle here. @@ -266,7 +285,7 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession { test("string parse_url function") { - def testUrl(url: String, expected: Row) { + def testUrl(url: String, expected: Row): Unit = { checkAnswer(Seq[String]((url)).toDF("url").selectExpr( "parse_url(url, 'HOST')", "parse_url(url, 'PATH')", "parse_url(url, 'QUERY')", "parse_url(url, 'REF')", diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index a1d7792941ed9..ff8f94c68c5ee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -22,11 +22,12 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan, Sort} import org.apache.spark.sql.execution.{ColumnarToRowExec, ExecSubqueryExpression, FileSourceScanExec, InputAdapter, ReusedSubqueryExec, ScalarSubquery, SubqueryExec, WholeStageCodegenExec} +import 
org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.FileScanRDD import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class SubquerySuite extends QueryTest with SharedSparkSession { +class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ setupTestData() @@ -891,9 +892,9 @@ class SubquerySuite extends QueryTest with SharedSparkSession { val sqlText = """ - |SELECT * FROM t1 + |SELECT * FROM t1 a |WHERE - |NOT EXISTS (SELECT * FROM t1) + |NOT EXISTS (SELECT * FROM t1 b WHERE a.i = b.i) """.stripMargin val optimizedPlan = sql(sqlText).queryExecution.optimizedPlan val join = optimizedPlan.collectFirst { case j: Join => j }.get @@ -1080,9 +1081,8 @@ class SubquerySuite extends QueryTest with SharedSparkSession { | HAVING max(c2) > 0 | ORDER BY c1) """.stripMargin - // The rule to remove redundant sorts is not able to remove the inner sort under - // an Aggregate operator. We only remove the top level sort. 
- assert(getNumSortsInQuery(query6) == 1) + + assert(getNumSortsInQuery(query6) == 0) // Cases when sort is not removed from the plan // Limit on top of sort @@ -1272,12 +1272,29 @@ class SubquerySuite extends QueryTest with SharedSparkSession { } } + test("Cannot remove sort for floating-point order-sensitive aggregates from subquery") { + Seq("float", "double").foreach { typeName => + Seq("SUM", "AVG", "KURTOSIS", "SKEWNESS", "STDDEV_POP", "STDDEV_SAMP", + "VAR_POP", "VAR_SAMP").foreach { aggName => + val query = + s""" + |SELECT k, $aggName(v) FROM ( + | SELECT k, v + | FROM VALUES (1, $typeName(2.0)), (2, $typeName(1.0)) t(k, v) + | ORDER BY v) + |GROUP BY k + """.stripMargin + assert(getNumSortsInQuery(query) == 1) + } + } + } + test("SPARK-25482: Forbid pushdown to datasources of filters containing subqueries") { withTempView("t1", "t2") { sql("create temporary view t1(a int) using parquet") sql("create temporary view t2(b int) using parquet") val plan = sql("select * from t2 where b > (select max(a) from t1)") - val subqueries = plan.queryExecution.executedPlan.collect { + val subqueries = stripAQEPlan(plan.queryExecution.executedPlan).collect { case p => p.subqueries }.flatten assert(subqueries.length == 1) @@ -1292,7 +1309,7 @@ class SubquerySuite extends QueryTest with SharedSparkSession { val df = sql("SELECT * FROM a WHERE p <= (SELECT MIN(id) FROM b)") checkAnswer(df, Seq(Row(0, 0), Row(2, 0))) // need to execute the query before we can examine fs.inputRDDs() - assert(df.queryExecution.executedPlan match { + assert(stripAQEPlan(df.queryExecution.executedPlan) match { case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter( fs @ FileSourceScanExec(_, _, _, partitionFilters, _, _, _)))) => partitionFilters.exists(ExecSubqueryExpression.hasSubquery) && @@ -1342,7 +1359,9 @@ class SubquerySuite extends QueryTest with SharedSparkSession { test("SPARK-27279: Reuse Subquery") { Seq(true, false).foreach { reuse => - 
withSQLConf(SQLConf.SUBQUERY_REUSE_ENABLED.key -> reuse.toString) { + withSQLConf(SQLConf.SUBQUERY_REUSE_ENABLED.key -> reuse.toString, + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + // when enable AQE, the reusedExchange is inserted when executed. val df = sql( """ |SELECT (SELECT avg(key) FROM testData) + (SELECT avg(key) FROM testData) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index a668434a68aff..aacb625d7921f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.internal.SQLConf */ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSSchema { - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() for (tableName <- tableNames) { createTable(spark, tableName) @@ -82,13 +82,19 @@ class TPCDSQuerySuite extends BenchmarkQueryTest with TPCDSSchema { "q3", "q7", "q10", "q19", "q27", "q34", "q42", "q43", "q46", "q52", "q53", "q55", "q59", "q63", "q65", "q68", "q73", "q79", "q89", "q98", "ss_max") + // List up the known queries having too large code in a generated function. 
+ // A JIRA file for `modified-q3` is as follows; + // [SPARK-29128] Split predicate code in OR expressions + val blackListForMethodCodeSizeCheck = Set("modified-q3") + modifiedTPCDSQueries.foreach { name => val queryString = resourceToString(s"tpcds-modifiedQueries/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) - test(s"modified-$name") { + val testName = s"modified-$name" + test(testName) { // check the plans can be properly generated val plan = sql(queryString).queryExecution.executedPlan - checkGeneratedCode(plan) + checkGeneratedCode(plan, !blackListForMethodCodeSizeCheck.contains(testName)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala index b32d95d0b286c..ba99e18714b1d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.util.resourceToString */ class TPCHQuerySuite extends BenchmarkQueryTest { - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() sql( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 2a034bcdc3f00..cc3995516dcc2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -22,7 +22,7 @@ import java.math.BigDecimal import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.{QueryExecution, SimpleMode} import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.command.{CreateDataSourceTableAsSelectCommand, ExplainCommand} import 
org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand @@ -309,7 +309,7 @@ class UDFSuite extends QueryTest with SharedSparkSession { test("SPARK-19338 Provide identical names for UDFs in the EXPLAIN output") { def explainStr(df: DataFrame): String = { - val explain = ExplainCommand(df.queryExecution.logical, extended = false) + val explain = ExplainCommand(df.queryExecution.logical, SimpleMode) val sparkPlan = spark.sessionState.executePlan(explain).executedPlan sparkPlan.executeCollect().map(_.getString(0).trim).headOption.getOrElse("") } @@ -360,13 +360,13 @@ class UDFSuite extends QueryTest with SharedSparkSession { .withColumn("b", udf1($"a", lit(10))) df.cache() df.write.saveAsTable("t") - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(numTotalCachedHit == 1, "expected to be cached in saveAsTable") df.write.insertInto("t") - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(numTotalCachedHit == 2, "expected to be cached in insertInto") df.write.save(path.getCanonicalPath) - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(numTotalCachedHit == 3, "expected to be cached in save for native") } } @@ -443,12 +443,12 @@ class UDFSuite extends QueryTest with SharedSparkSession { test("SPARK-25044 Verify null input handling for primitive types - with udf(Any, DataType)") { val f = udf((x: Int) => x, IntegerType) checkAnswer( - Seq(new Integer(1), null).toDF("x").select(f($"x")), + Seq(Integer.valueOf(1), null).toDF("x").select(f($"x")), Row(1) :: Row(0) :: Nil) val f2 = udf((x: Double) => x, DoubleType) checkAnswer( - Seq(new java.lang.Double(1.1), null).toDF("x").select(f2($"x")), + Seq(java.lang.Double.valueOf(1.1), null).toDF("x").select(f2($"x")), Row(1.1) :: Row(0.0) :: Nil) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 2b2fedd3ca218..ffc2018d2132d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql +import java.util.Arrays + import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Cast, ExpressionEvalHelper, GenericInternalRow, Literal} @@ -277,4 +279,12 @@ class UserDefinedTypeSuite extends QueryTest with SharedSparkSession with Parque val udt = new TestUDT.MyDenseVectorUDT() assert(!Cast.canUpCast(udt, StringType)) } + + test("typeof user defined type") { + val schema = new StructType().add("a", new TestUDT.MyDenseVectorUDT()) + val data = Arrays.asList( + RowFactory.create(new TestUDT.MyDenseVector(Array(1.0, 3.0, 5.0, 7.0, 9.0)))) + checkAnswer(spark.createDataFrame(data, schema).selectExpr("typeof(a)"), + Seq(Row("array"))) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/AlterTableTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala similarity index 74% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/AlterTableTests.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala index 4b7ee384b4c10..96fe301b512ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/AlterTableTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/AlterTableTests.scala @@ -15,12 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership +import org.apache.spark.sql.connector.catalog.Table import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -49,7 +51,7 @@ trait AlterTableTests extends SharedSparkSession { } assert(exc.getMessage.contains(s"${catalogAndNamespace}table_name")) - assert(exc.getMessage.contains("Table or view not found")) + assert(exc.getMessage.contains("Table not found")) } } @@ -85,6 +87,21 @@ trait AlterTableTests extends SharedSparkSession { } } + test("AlterTable: add column with NOT NULL") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id int) USING $v2Format") + sql(s"ALTER TABLE $t ADD COLUMN data string NOT NULL") + + val table = getTableMetadata(t) + + assert(table.name === fullTableName(t)) + assert(table.schema === StructType(Seq( + StructField("id", IntegerType), + StructField("data", StringType, nullable = false)))) + } + } + test("AlterTable: add column with comment") { val t = s"${catalogAndNamespace}table_name" withTable(t) { @@ -100,6 +117,62 @@ trait AlterTableTests extends SharedSparkSession { } } + test("AlterTable: add column with interval type") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id int, point struct) USING $v2Format") + val e1 = + intercept[AnalysisException](sql(s"ALTER TABLE $t ADD COLUMN data interval")) + assert(e1.getMessage.contains("Cannot use interval type in the table schema.")) + val e2 = + intercept[AnalysisException](sql(s"ALTER TABLE $t ADD COLUMN point.z interval")) + assert(e2.getMessage.contains("Cannot use interval type in the table schema.")) + } + } + + test("AlterTable: add column with position") { + val t = 
s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (point struct) USING $v2Format") + + sql(s"ALTER TABLE $t ADD COLUMN a string FIRST") + assert(getTableMetadata(t).schema == new StructType() + .add("a", StringType) + .add("point", new StructType().add("x", IntegerType))) + + sql(s"ALTER TABLE $t ADD COLUMN b string AFTER point") + assert(getTableMetadata(t).schema == new StructType() + .add("a", StringType) + .add("point", new StructType().add("x", IntegerType)) + .add("b", StringType)) + + val e1 = intercept[AnalysisException]( + sql(s"ALTER TABLE $t ADD COLUMN c string AFTER non_exist")) + assert(e1.getMessage().contains("Couldn't find the reference column")) + + sql(s"ALTER TABLE $t ADD COLUMN point.y int FIRST") + assert(getTableMetadata(t).schema == new StructType() + .add("a", StringType) + .add("point", new StructType() + .add("y", IntegerType) + .add("x", IntegerType)) + .add("b", StringType)) + + sql(s"ALTER TABLE $t ADD COLUMN point.z int AFTER x") + assert(getTableMetadata(t).schema == new StructType() + .add("a", StringType) + .add("point", new StructType() + .add("y", IntegerType) + .add("x", IntegerType) + .add("z", IntegerType)) + .add("b", StringType)) + + val e2 = intercept[AnalysisException]( + sql(s"ALTER TABLE $t ADD COLUMN point.x2 int AFTER non_exist")) + assert(e2.getMessage().contains("Couldn't find the reference column")) + } + } + test("AlterTable: add multiple columns") { val t = s"${catalogAndNamespace}table_name" withTable(t) { @@ -239,6 +312,30 @@ trait AlterTableTests extends SharedSparkSession { } } + test("AlterTable: add column - new column should not exist") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql( + s"""CREATE TABLE $t ( + |id int, + |point struct, + |arr array>, + |mk map, string>, + |mv map> + |) + |USING $v2Format""".stripMargin) + + Seq("id", "point.x", "arr.element.x", "mk.key.x", "mv.value.x").foreach { field => + + val e = intercept[AnalysisException] { + 
sql(s"ALTER TABLE $t ADD COLUMNS $field double") + } + assert(e.getMessage.contains("add")) + assert(e.getMessage.contains(s"$field already exists")) + } + } + } + test("AlterTable: update column type int -> long") { val t = s"${catalogAndNamespace}table_name" withTable(t) { @@ -246,12 +343,42 @@ trait AlterTableTests extends SharedSparkSession { sql(s"ALTER TABLE $t ALTER COLUMN id TYPE bigint") val table = getTableMetadata(t) - assert(table.name === fullTableName(t)) assert(table.schema === new StructType().add("id", LongType)) } } + test("AlterTable: update column type to interval") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id int) USING $v2Format") + val e = intercept[AnalysisException](sql(s"ALTER TABLE $t ALTER COLUMN id TYPE interval")) + assert(e.getMessage.contains("id to interval type")) + } + } + + test("AlterTable: SET/DROP NOT NULL") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint NOT NULL) USING $v2Format") + sql(s"ALTER TABLE $t ALTER COLUMN id SET NOT NULL") + + val table = getTableMetadata(t) + assert(table.name === fullTableName(t)) + assert(table.schema === new StructType().add("id", LongType, nullable = false)) + + sql(s"ALTER TABLE $t ALTER COLUMN id DROP NOT NULL") + val table2 = getTableMetadata(t) + assert(table2.name === fullTableName(t)) + assert(table2.schema === new StructType().add("id", LongType)) + + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t ALTER COLUMN id SET NOT NULL") + } + assert(e.message.contains("Cannot change nullable column to non-nullable")) + } + } + test("AlterTable: update nested type float -> double") { val t = s"${catalogAndNamespace}table_name" withTable(t) { @@ -259,7 +386,6 @@ trait AlterTableTests extends SharedSparkSession { sql(s"ALTER TABLE $t ALTER COLUMN point.x TYPE double") val table = getTableMetadata(t) - assert(table.name === fullTableName(t)) assert(table.schema === new StructType() 
.add("id", IntegerType) @@ -279,7 +405,7 @@ trait AlterTableTests extends SharedSparkSession { } assert(exc.getMessage.contains("point")) - assert(exc.getMessage.contains("update a struct by adding, deleting, or updating its fields")) + assert(exc.getMessage.contains("update a struct by updating its fields")) val table = getTableMetadata(t) @@ -470,16 +596,58 @@ trait AlterTableTests extends SharedSparkSession { } } - test("AlterTable: update column type and comment") { + test("AlterTable: update column position") { val t = s"${catalogAndNamespace}table_name" withTable(t) { - sql(s"CREATE TABLE $t (id int) USING $v2Format") - sql(s"ALTER TABLE $t ALTER COLUMN id TYPE bigint COMMENT 'doc'") - - val table = getTableMetadata(t) - - assert(table.name === fullTableName(t)) - assert(table.schema === StructType(Seq(StructField("id", LongType).withComment("doc")))) + sql(s"CREATE TABLE $t (a int, b int, point struct) USING $v2Format") + + sql(s"ALTER TABLE $t ALTER COLUMN b FIRST") + assert(getTableMetadata(t).schema == new StructType() + .add("b", IntegerType) + .add("a", IntegerType) + .add("point", new StructType() + .add("x", IntegerType) + .add("y", IntegerType) + .add("z", IntegerType))) + + sql(s"ALTER TABLE $t ALTER COLUMN b AFTER point") + assert(getTableMetadata(t).schema == new StructType() + .add("a", IntegerType) + .add("point", new StructType() + .add("x", IntegerType) + .add("y", IntegerType) + .add("z", IntegerType)) + .add("b", IntegerType)) + + val e1 = intercept[AnalysisException]( + sql(s"ALTER TABLE $t ALTER COLUMN b AFTER non_exist")) + assert(e1.getMessage.contains("Couldn't resolve positional argument")) + + sql(s"ALTER TABLE $t ALTER COLUMN point.y FIRST") + assert(getTableMetadata(t).schema == new StructType() + .add("a", IntegerType) + .add("point", new StructType() + .add("y", IntegerType) + .add("x", IntegerType) + .add("z", IntegerType)) + .add("b", IntegerType)) + + sql(s"ALTER TABLE $t ALTER COLUMN point.y AFTER z") + 
assert(getTableMetadata(t).schema == new StructType() + .add("a", IntegerType) + .add("point", new StructType() + .add("x", IntegerType) + .add("z", IntegerType) + .add("y", IntegerType)) + .add("b", IntegerType)) + + val e2 = intercept[AnalysisException]( + sql(s"ALTER TABLE $t ALTER COLUMN point.y AFTER non_exist")) + assert(e2.getMessage.contains("Couldn't resolve positional argument")) + + // `AlterTable.resolved` checks column existence. + intercept[AnalysisException]( + sql(s"ALTER TABLE $t ALTER COLUMN a.y AFTER x")) } } @@ -692,6 +860,37 @@ trait AlterTableTests extends SharedSparkSession { } } + test("AlterTable: rename column - new name should not exist") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql( + s"""CREATE TABLE $t ( + |id int, + |user_id int, + |point struct, + |arr array>, + |mk map, string>, + |mv map> + |) + |USING $v2Format""".stripMargin) + + Seq( + "id" -> "user_id", + "point.x" -> "y", + "arr.element.x" -> "y", + "mk.key.x" -> "y", + "mv.value.x" -> "y").foreach { case (field, newName) => + + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t RENAME COLUMN $field TO $newName") + } + assert(e.getMessage.contains("rename")) + assert(e.getMessage.contains((field.split("\\.").init :+ newName).mkString("."))) + assert(e.getMessage.contains("already exists")) + } + } + } + test("AlterTable: drop column") { val t = s"${catalogAndNamespace}table_name" withTable(t) { @@ -811,7 +1010,20 @@ trait AlterTableTests extends SharedSparkSession { assert(table.name === fullTableName(t)) assert(table.properties === - Map("provider" -> v2Format, "location" -> "s3://bucket/path").asJava) + withDefaultOwnership(Map("provider" -> v2Format, "location" -> "s3://bucket/path")).asJava) + } + } + + test("AlterTable: set partition location") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (id int) USING $v2Format") + + val exc = intercept[AnalysisException] { + sql(s"ALTER TABLE $t 
PARTITION(ds='2017-06-10') SET LOCATION 's3://bucket/path'") + } + assert(exc.getMessage.contains( + "ALTER TABLE SET LOCATION does not support partition for v2 tables")) } } @@ -824,7 +1036,8 @@ trait AlterTableTests extends SharedSparkSession { val table = getTableMetadata(t) assert(table.name === fullTableName(t)) - assert(table.properties === Map("provider" -> v2Format, "test" -> "34").asJava) + assert(table.properties === + withDefaultOwnership(Map("provider" -> v2Format, "test" -> "34")).asJava) } } @@ -836,15 +1049,30 @@ trait AlterTableTests extends SharedSparkSession { val table = getTableMetadata(t) assert(table.name === fullTableName(t)) - assert(table.properties === Map("provider" -> v2Format, "test" -> "34").asJava) + assert(table.properties === + withDefaultOwnership(Map("provider" -> v2Format, "test" -> "34")).asJava) sql(s"ALTER TABLE $t UNSET TBLPROPERTIES ('test')") val updated = getTableMetadata(t) assert(updated.name === fullTableName(t)) - assert(updated.properties === Map("provider" -> v2Format).asJava) + assert(updated.properties === withDefaultOwnership(Map("provider" -> v2Format)).asJava) } } + test("AlterTable: replace columns") { + val t = s"${catalogAndNamespace}table_name" + withTable(t) { + sql(s"CREATE TABLE $t (col1 int, col2 int COMMENT 'c2') USING $v2Format") + sql(s"ALTER TABLE $t REPLACE COLUMNS (col2 string, col3 int COMMENT 'c3')") + + val table = getTableMetadata(t) + + assert(table.name === fullTableName(t)) + assert(table.schema === StructType(Seq( + StructField("col2", StringType), + StructField("col3", IntegerType).withComment("c3")))) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala similarity index 85% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSessionCatalogSuite.scala rename to 
sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala index fee6962501637..01caf8e2eb115 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSessionCatalogSuite.scala @@ -15,22 +15,19 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import java.util import org.scalatest.BeforeAndAfter -import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, SaveMode} -import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier, TableCatalog, TableChange} -import org.apache.spark.sql.catalog.v2.expressions.Transform -import org.apache.spark.sql.catalog.v2.utils.CatalogV2Util +import org.apache.spark.sql.{DataFrame, QueryTest, SaveMode} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, TableAlreadyExistsException} -import org.apache.spark.sql.connector.InMemoryTable -import org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog -import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode, V2_SESSION_CATALOG} -import org.apache.spark.sql.sources.v2.utils.TestV2SessionCatalogBase +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -39,8 +36,6 @@ class DataSourceV2DataFrameSessionCatalogSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) with SessionCatalogTest[InMemoryTable, 
InMemoryTableSessionCatalog] { - import testImplicits._ - override protected def doInsert(tableName: String, insert: DataFrame, mode: SaveMode): Unit = { val dfw = insert.write.format(v2Format) if (mode != null) { @@ -89,7 +84,7 @@ class DataSourceV2DataFrameSessionCatalogSuite val t1 = "prop_table" withTable(t1) { spark.range(20).write.format(v2Format).option("path", "abc").saveAsTable(t1) - val cat = spark.sessionState.catalogManager.v2SessionCatalog.get.asInstanceOf[TableCatalog] + val cat = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog] val tableInfo = cat.loadTable(Identifier.of(Array.empty, t1)) assert(tableInfo.properties().get("location") === "abc") assert(tableInfo.properties().get("provider") === v2Format) @@ -97,12 +92,6 @@ class DataSourceV2DataFrameSessionCatalogSuite } } -class InMemoryTableProvider extends TableProvider { - override def getTable(options: CaseInsensitiveStringMap): Table = { - throw new UnsupportedOperationException("D'oh!") - } -} - class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable] { override def newTable( name: String, @@ -112,6 +101,13 @@ class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable new InMemoryTable(name, schema, partitions, properties) } + override def loadTable(ident: Identifier): Table = { + val identToUse = Option(InMemoryTableSessionCatalog.customIdentifierResolution) + .map(_(ident)) + .getOrElse(ident) + super.loadTable(identToUse) + } + override def alterTable(ident: Identifier, changes: TableChange*): Table = { val fullIdent = fullIdentifier(ident) Option(tables.get(fullIdent)) match { @@ -136,7 +132,22 @@ class InMemoryTableSessionCatalog extends TestV2SessionCatalogBase[InMemoryTable } } -private[v2] trait SessionCatalogTest[T <: Table, Catalog <: TestV2SessionCatalogBase[T]] +object InMemoryTableSessionCatalog { + private var customIdentifierResolution: Identifier => Identifier = _ + + def withCustomIdentifierResolver( + 
resolver: Identifier => Identifier)( + f: => Unit): Unit = { + try { + customIdentifierResolution = resolver + f + } finally { + customIdentifierResolution = null + } + } +} + +private [connector] trait SessionCatalogTest[T <: Table, Catalog <: TestV2SessionCatalogBase[T]] extends QueryTest with SharedSparkSession with BeforeAndAfter { @@ -145,18 +156,18 @@ private[v2] trait SessionCatalogTest[T <: Table, Catalog <: TestV2SessionCatalog spark.sessionState.catalogManager.catalog(name) } - protected val v2Format: String = classOf[InMemoryTableProvider].getName + protected val v2Format: String = classOf[FakeV2Provider].getName protected val catalogClassName: String = classOf[InMemoryTableSessionCatalog].getName before { - spark.conf.set(V2_SESSION_CATALOG.key, catalogClassName) + spark.conf.set(V2_SESSION_CATALOG_IMPLEMENTATION.key, catalogClassName) } override def afterEach(): Unit = { super.afterEach() - catalog("session").asInstanceOf[Catalog].clearTables() - spark.conf.set(V2_SESSION_CATALOG.key, classOf[V2SessionCatalog].getName) + catalog(SESSION_CATALOG_NAME).asInstanceOf[Catalog].clearTables() + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) } protected def verifyTable(tableName: String, expected: DataFrame): Unit diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala similarity index 58% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala index abccb5cec6752..0a6897b829994 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2DataFrameSuite.scala @@ -15,13 +15,21 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector -import org.apache.spark.sql.{DataFrame, Row, SaveMode} -import org.apache.spark.sql.connector.InMemoryTableCatalog +import java.util.Collections + +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan} +import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.QueryExecutionListener class DataSourceV2DataFrameSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = false) { + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ import testImplicits._ before { @@ -76,13 +84,15 @@ class DataSourceV2DataFrameSuite withTable(t1) { sql(s"CREATE TABLE $t1 (id bigint, data string) USING foo") val df = Seq((1L, "a"), (2L, "b"), (3L, "c")).toDF("id", "data") - // Default saveMode is append, therefore this doesn't throw a table already exists exception - df.write.saveAsTable(t1) + // Default saveMode is ErrorIfExists + intercept[TableAlreadyExistsException] { + df.write.saveAsTable(t1) + } + assert(spark.table(t1).count() === 0) + + // appends are by name not by position + df.select('data, 'id).write.mode("append").saveAsTable(t1) checkAnswer(spark.table(t1), df) - - // also appends are by name not by position - df.select('data, 'id).write.saveAsTable(t1) - checkAnswer(spark.table(t1), df.union(df)) } } @@ -123,4 +133,57 @@ class DataSourceV2DataFrameSuite checkAnswer(spark.table(t1), Seq(Row("c", "d"))) } } + + testQuietly("SPARK-29778: saveAsTable: append mode takes write options") { + + var plan: LogicalPlan = null + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = 
{ + plan = qe.analyzed + } + override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + } + + try { + spark.listenerManager.register(listener) + + val t1 = "testcat.ns1.ns2.tbl" + + sql(s"CREATE TABLE $t1 (id bigint, data string) USING foo") + + val df = Seq((1L, "a"), (2L, "b"), (3L, "c")).toDF("id", "data") + df.write.option("other", "20").mode("append").saveAsTable(t1) + + sparkContext.listenerBus.waitUntilEmpty() + plan match { + case p: AppendData => + assert(p.writeOptions == Map("other" -> "20")) + case other => + fail(s"Expected to parse ${classOf[AppendData].getName} from query," + + s"got ${other.getClass.getName}: $plan") + } + + checkAnswer(spark.table(t1), df) + } finally { + spark.listenerManager.unregister(listener) + } + } + + test("Cannot write data with intervals to v2") { + withTable("testcat.table_name") { + val testCatalog = spark.sessionState.catalogManager.catalog("testcat").asTableCatalog + testCatalog.createTable( + Identifier.of(Array(), "table_name"), + new StructType().add("i", "interval"), + Array.empty, Collections.emptyMap[String, String]) + val df = sql("select interval 1 day as i") + val v2Writer = df.writeTo("testcat.table_name") + val e1 = intercept[AnalysisException](v2Writer.append()) + assert(e1.getMessage.contains(s"Cannot use interval type in the table schema.")) + val e2 = intercept[AnalysisException](v2Writer.overwrite(df("i"))) + assert(e2.getMessage.contains(s"Cannot use interval type in the table schema.")) + val e3 = intercept[AnalysisException](v2Writer.overwritePartitions()) + assert(e3.getMessage.contains(s"Cannot use interval type in the table schema.")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala similarity index 75% rename from 
sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSessionCatalogSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala index cfbafdb65c7c3..b6997445013e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSessionCatalogSuite.scala @@ -15,12 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import org.apache.spark.sql.{DataFrame, SaveMode} -import org.apache.spark.sql.catalog.v2.{Identifier, TableCatalog} -import org.apache.spark.sql.connector.InMemoryTable -import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode} +import org.apache.spark.sql.connector.catalog.{Identifier, Table, TableCatalog} class DataSourceV2SQLSessionCatalogSuite extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = true) @@ -46,9 +44,23 @@ class DataSourceV2SQLSessionCatalogSuite } override def getTableMetadata(tableName: String): Table = { - val v2Catalog = spark.sessionState.catalogManager.v2SessionCatalog.get + val v2Catalog = spark.sessionState.catalogManager.currentCatalog val nameParts = spark.sessionState.sqlParser.parseMultipartIdentifier(tableName) v2Catalog.asInstanceOf[TableCatalog] .loadTable(Identifier.of(Array.empty, nameParts.last)) } + + test("SPARK-30697: catalog.isView doesn't throw an error for specialized identifiers") { + val t1 = "tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format") + + def idResolver(id: Identifier): Identifier = Identifier.of(Array.empty, id.name()) + + InMemoryTableSessionCatalog.withCustomIdentifierResolver(idResolver) { + // The following should not throw AnalysisException. 
+ sql(s"DESCRIBE TABLE ignored.$t1") + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala new file mode 100644 index 0000000000000..eabcb81c50646 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -0,0 +1,2276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connector + +import scala.collection.JavaConverters._ + +import org.apache.spark.SparkException +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NamespaceAlreadyExistsException, NoSuchDatabaseException, NoSuchNamespaceException, NoSuchTableException, TableAlreadyExistsException} +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.connector.catalog._ +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME +import org.apache.spark.sql.connector.catalog.CatalogV2Util.withDefaultOwnership +import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION +import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.sources.SimpleScanSource +import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.Utils + +class DataSourceV2SQLSuite + extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = true) + with AlterTableTests { + + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + private val v2Source = classOf[FakeV2Provider].getName + override protected val v2Format = v2Source + override protected val catalogAndNamespace = "testcat.ns1.ns2." 
+ private val defaultUser: String = Utils.getCurrentUserName() + + private def catalog(name: String): CatalogPlugin = { + spark.sessionState.catalogManager.catalog(name) + } + + protected def doInsert(tableName: String, insert: DataFrame, mode: SaveMode): Unit = { + val tmpView = "tmp_view" + withTempView(tmpView) { + insert.createOrReplaceTempView(tmpView) + val overwrite = if (mode == SaveMode.Overwrite) "OVERWRITE" else "INTO" + sql(s"INSERT $overwrite TABLE $tableName SELECT * FROM $tmpView") + } + } + + override def verifyTable(tableName: String, expected: DataFrame): Unit = { + checkAnswer(spark.table(tableName), expected) + } + + override def getTableMetadata(tableName: String): Table = { + val nameParts = spark.sessionState.sqlParser.parseMultipartIdentifier(tableName) + val v2Catalog = catalog(nameParts.head).asTableCatalog + val namespace = nameParts.drop(1).init.toArray + v2Catalog.loadTable(Identifier.of(namespace, nameParts.last)) + } + + before { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + spark.conf.set( + "spark.sql.catalog.testcat_atomic", classOf[StagingInMemoryTableCatalog].getName) + spark.conf.set("spark.sql.catalog.testcat2", classOf[InMemoryTableCatalog].getName) + spark.conf.set( + V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName) + + val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") + df.createOrReplaceTempView("source") + val df2 = spark.createDataFrame(Seq((4L, "d"), (5L, "e"), (6L, "f"))).toDF("id", "data") + df2.createOrReplaceTempView("source2") + } + + after { + spark.sessionState.catalog.reset() + spark.sessionState.catalogManager.reset() + spark.sessionState.conf.clear() + } + + test("CreateTable: use v2 plan because catalog is set") { + spark.sql("CREATE TABLE testcat.table_name (id bigint NOT NULL, data string) USING foo") + + val testCatalog = catalog("testcat").asTableCatalog + val table = 
testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType, nullable = false) + .add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) + } + + test("DescribeTable using v2 catalog") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string)" + + " USING foo" + + " PARTITIONED BY (id)") + val descriptionDf = spark.sql("DESCRIBE TABLE testcat.table_name") + assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === + Seq( + ("col_name", StringType), + ("data_type", StringType), + ("comment", StringType))) + val description = descriptionDf.collect() + assert(description === Seq( + Row("id", "bigint", ""), + Row("data", "string", ""), + Row("", "", ""), + Row("# Partitioning", "", ""), + Row("Part 0", "id", ""))) + + val e = intercept[AnalysisException] { + sql("DESCRIBE TABLE testcat.table_name PARTITION (id = 1)") + } + assert(e.message.contains("DESCRIBE does not support partition for v2 tables")) + } + + test("DescribeTable with v2 catalog when table does not exist.") { + intercept[AnalysisException] { + spark.sql("DESCRIBE TABLE testcat.table_name") + } + } + + test("DescribeTable extended using v2 catalog") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string)" + + " USING foo" + + " PARTITIONED BY (id)" + + " TBLPROPERTIES ('bar'='baz')" + + " COMMENT 'this is a test table'" + + " LOCATION '/tmp/testcat/table_name'") + val descriptionDf = spark.sql("DESCRIBE TABLE EXTENDED testcat.table_name") + assert(descriptionDf.schema.map(field => (field.name, field.dataType)) + === Seq( + ("col_name", StringType), + ("data_type", StringType), + ("comment", StringType))) 
+ assert(descriptionDf.collect() + .map(_.toSeq) + .map(_.toArray.map(_.toString.trim)) === Array( + Array("id", "bigint", ""), + Array("data", "string", ""), + Array("", "", ""), + Array("# Partitioning", "", ""), + Array("Part 0", "id", ""), + Array("", "", ""), + Array("# Detailed Table Information", "", ""), + Array("Name", "testcat.table_name", ""), + Array("Comment", "this is a test table", ""), + Array("Location", "/tmp/testcat/table_name", ""), + Array("Provider", "foo", ""), + Array(TableCatalog.PROP_OWNER.capitalize, defaultUser, ""), + Array("Table Properties", "[bar=baz]", ""))) + + } + + test("CreateTable: use v2 plan and session catalog when provider is v2") { + spark.sql(s"CREATE TABLE table_name (id bigint, data string) USING $v2Source") + + val testCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "default.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> v2Source)).asJava) + assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) + } + + test("CreateTable: fail if table exists") { + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + val testCatalog = catalog("testcat").asTableCatalog + + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) + + // run a second create query that should fail + val exc = intercept[TableAlreadyExistsException] { + spark.sql("CREATE TABLE testcat.table_name (id 
bigint, data string, id2 bigint) USING bar") + } + + assert(exc.getMessage.contains("table_name")) + + // table should not have changed + val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table2.name == "testcat.table_name") + assert(table2.partitioning.isEmpty) + assert(table2.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table2.schema == new StructType().add("id", LongType).add("data", StringType)) + + // check that the table is still empty + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) + } + + test("CreateTable: if not exists") { + spark.sql( + "CREATE TABLE IF NOT EXISTS testcat.table_name (id bigint, data string) USING foo") + + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) + + spark.sql("CREATE TABLE IF NOT EXISTS testcat.table_name (id bigint, data string) USING bar") + + // table should not have changed + val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table2.name == "testcat.table_name") + assert(table2.partitioning.isEmpty) + assert(table2.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table2.schema == new StructType().add("id", LongType).add("data", StringType)) + + // check that the table is still empty + val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), Seq.empty) + } + + test("CreateTable: use default catalog for v2 sources when default catalog is set") { + 
spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat") + spark.sql(s"CREATE TABLE table_name (id bigint, data string) USING foo") + + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) + + // check that the table is empty + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) + } + + test("CreateTable/RepalceTable: invalid schema if has interval type") { + Seq("CREATE", "REPLACE").foreach { action => + val e1 = intercept[AnalysisException]( + sql(s"$action TABLE table_name (id int, value interval) USING $v2Format")) + assert(e1.getMessage.contains(s"Cannot use interval type in the table schema.")) + val e2 = intercept[AnalysisException]( + sql(s"$action TABLE table_name (id array) USING $v2Format")) + assert(e2.getMessage.contains(s"Cannot use interval type in the table schema.")) + } + } + + test("CTAS/RTAS: invalid schema if has interval type") { + Seq("CREATE", "REPLACE").foreach { action => + val e1 = intercept[AnalysisException]( + sql(s"$action TABLE table_name USING $v2Format as select interval 1 day")) + assert(e1.getMessage.contains(s"Cannot use interval type in the table schema.")) + val e2 = intercept[AnalysisException]( + sql(s"$action TABLE table_name USING $v2Format as select array(interval 1 day)")) + assert(e2.getMessage.contains(s"Cannot use interval type in the table schema.")) + } + } + + test("CreateTableAsSelect: use v2 plan because catalog is set") { + val basicCatalog = catalog("testcat").asTableCatalog + val atomicCatalog = catalog("testcat_atomic").asTableCatalog + val basicIdentifier = "testcat.table_name" + val 
atomicIdentifier = "testcat_atomic.table_name" + + Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { + case (catalog, identifier) => + spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") + + val table = catalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == identifier) + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + } + } + + test("ReplaceTableAsSelect: basic v2 implementation.") { + val basicCatalog = catalog("testcat").asTableCatalog + val atomicCatalog = catalog("testcat_atomic").asTableCatalog + val basicIdentifier = "testcat.table_name" + val atomicIdentifier = "testcat_atomic.table_name" + + Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { + case (catalog, identifier) => + spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") + val originalTable = catalog.loadTable(Identifier.of(Array(), "table_name")) + + spark.sql(s"REPLACE TABLE $identifier USING foo AS SELECT id FROM source") + val replacedTable = catalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(replacedTable != originalTable, "Table should have been replaced.") + assert(replacedTable.name == identifier) + assert(replacedTable.partitioning.isEmpty) + assert(replacedTable.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(replacedTable.schema == new StructType().add("id", LongType)) + + val rdd = spark.sparkContext.parallelize(replacedTable.asInstanceOf[InMemoryTable].rows) + checkAnswer( + spark.internalCreateDataFrame(rdd, replacedTable.schema), + spark.table("source").select("id")) + 
} + } + + test("ReplaceTableAsSelect: Non-atomic catalog drops the table if the write fails.") { + spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM source") + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) + + intercept[Exception] { + spark.sql("REPLACE TABLE testcat.table_name" + + s" USING foo OPTIONS (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}`=true)" + + s" AS SELECT id FROM source") + } + + assert(!testCatalog.tableExists(Identifier.of(Array(), "table_name")), + "Table should have been dropped as a result of the replace.") + } + + test("ReplaceTableAsSelect: Non-atomic catalog drops the table permanently if the" + + " subsequent table creation fails.") { + spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM source") + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) + + intercept[Exception] { + spark.sql("REPLACE TABLE testcat.table_name" + + s" USING foo" + + s" TBLPROPERTIES (`${InMemoryTableCatalog.SIMULATE_FAILED_CREATE_PROPERTY}`=true)" + + s" AS SELECT id FROM source") + } + + assert(!testCatalog.tableExists(Identifier.of(Array(), "table_name")), + "Table should have been dropped and failed to be created.") + } + + test("ReplaceTableAsSelect: Atomic catalog does not drop the table when replace fails.") { + spark.sql("CREATE TABLE testcat_atomic.table_name USING foo AS SELECT id, data FROM source") + val testCatalog = catalog("testcat_atomic").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + intercept[Exception] { + spark.sql("REPLACE TABLE testcat_atomic.table_name" + + s" USING foo OPTIONS (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}=true)" + + s" AS SELECT id FROM source") + } + + 
var maybeReplacedTable = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(maybeReplacedTable === table, "Table should not have changed.") + + intercept[Exception] { + spark.sql("REPLACE TABLE testcat_atomic.table_name" + + s" USING foo" + + s" TBLPROPERTIES (`${InMemoryTableCatalog.SIMULATE_FAILED_CREATE_PROPERTY}`=true)" + + s" AS SELECT id FROM source") + } + + maybeReplacedTable = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(maybeReplacedTable === table, "Table should not have changed.") + } + + test("ReplaceTable: Erases the table contents and changes the metadata.") { + spark.sql(s"CREATE TABLE testcat.table_name USING $v2Source AS SELECT id, data FROM source") + + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) + + spark.sql("REPLACE TABLE testcat.table_name (id bigint NOT NULL) USING foo") + val replaced = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(replaced.asInstanceOf[InMemoryTable].rows.isEmpty, + "Replaced table should have no rows after committing.") + assert(replaced.schema().fields.length === 1, + "Replaced table should have new schema.") + assert(replaced.schema().fields(0) === StructField("id", LongType, nullable = false), + "Replaced table should have new schema.") + } + + test("ReplaceTableAsSelect: CREATE OR REPLACE new table has same behavior as CTAS.") { + Seq("testcat", "testcat_atomic").foreach { catalogName => + spark.sql( + s""" + |CREATE TABLE $catalogName.created USING $v2Source + |AS SELECT id, data FROM source + """.stripMargin) + spark.sql( + s""" + |CREATE OR REPLACE TABLE $catalogName.replaced USING $v2Source + |AS SELECT id, data FROM source + """.stripMargin) + + val testCatalog = catalog(catalogName).asTableCatalog + val createdTable = testCatalog.loadTable(Identifier.of(Array(), "created")) + val replacedTable = 
testCatalog.loadTable(Identifier.of(Array(), "replaced")) + + assert(createdTable.asInstanceOf[InMemoryTable].rows === + replacedTable.asInstanceOf[InMemoryTable].rows) + assert(createdTable.schema === replacedTable.schema) + } + } + + test("ReplaceTableAsSelect: REPLACE TABLE throws exception if table does not exist.") { + Seq("testcat", "testcat_atomic").foreach { catalog => + spark.sql(s"CREATE TABLE $catalog.created USING $v2Source AS SELECT id, data FROM source") + intercept[CannotReplaceMissingTableException] { + spark.sql(s"REPLACE TABLE $catalog.replaced USING $v2Source AS SELECT id, data FROM source") + } + } + } + + test("ReplaceTableAsSelect: REPLACE TABLE throws exception if table is dropped before commit.") { + import InMemoryTableCatalog._ + spark.sql(s"CREATE TABLE testcat_atomic.created USING $v2Source AS SELECT id, data FROM source") + intercept[CannotReplaceMissingTableException] { + spark.sql(s"REPLACE TABLE testcat_atomic.replaced" + + s" USING $v2Source" + + s" TBLPROPERTIES (`$SIMULATE_DROP_BEFORE_REPLACE_PROPERTY`=true)" + + s" AS SELECT id, data FROM source") + } + } + + test("CreateTableAsSelect: use v2 plan and session catalog when provider is v2") { + spark.sql(s"CREATE TABLE table_name USING $v2Source AS SELECT id, data FROM source") + + val testCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "default.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> v2Source)).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + } + + test("CreateTableAsSelect: fail if table exists") { + spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM 
source") + + val testCatalog = catalog("testcat").asTableCatalog + + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + + // run a second CTAS query that should fail + val exc = intercept[TableAlreadyExistsException] { + spark.sql( + "CREATE TABLE testcat.table_name USING bar AS SELECT id, data, id as id2 FROM source2") + } + + assert(exc.getMessage.contains("table_name")) + + // table should not have changed + val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + assert(table2.name == "testcat.table_name") + assert(table2.partitioning.isEmpty) + assert(table2.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table2.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), spark.table("source")) + } + + test("CreateTableAsSelect: if not exists") { + spark.sql( + "CREATE TABLE IF NOT EXISTS testcat.table_name USING foo AS SELECT id, data FROM source") + + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = 
spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + + spark.sql( + "CREATE TABLE IF NOT EXISTS testcat.table_name USING foo AS SELECT id, data FROM source2") + + // check that the table contains data from just the first CTAS + val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), spark.table("source")) + } + + test("CreateTableAsSelect: use default catalog for v2 sources when default catalog is set") { + spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat") + + val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") + df.createOrReplaceTempView("source") + + // setting the default catalog breaks the reference to source because the default catalog is + // used and AsTableIdentifier no longer matches + spark.sql(s"CREATE TABLE table_name USING foo AS SELECT id, data FROM source") + + val testCatalog = catalog("testcat").asTableCatalog + val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == "testcat.table_name") + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType() + .add("id", LongType) + .add("data", StringType)) + + val rdd = sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) + } + + test("CreateTableAsSelect: v2 session catalog can load v1 source table") { + // unset this config to use the default v2 session catalog. 
+ spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + + val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") + df.createOrReplaceTempView("source") + + sql(s"CREATE TABLE table_name USING parquet AS SELECT id, data FROM source") + + checkAnswer(sql(s"TABLE default.table_name"), spark.table("source")) + // The fact that the following line doesn't throw an exception means, the session catalog + // can load the table. + val t = catalog(SESSION_CATALOG_NAME).asTableCatalog + .loadTable(Identifier.of(Array.empty, "table_name")) + assert(t.isInstanceOf[V1Table], "V1 table wasn't returned as an unresolved table") + } + + test("CreateTableAsSelect: nullable schema") { + val basicCatalog = catalog("testcat").asTableCatalog + val atomicCatalog = catalog("testcat_atomic").asTableCatalog + val basicIdentifier = "testcat.table_name" + val atomicIdentifier = "testcat_atomic.table_name" + + Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { + case (catalog, identifier) => + spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT 1 i") + + val table = catalog.loadTable(Identifier.of(Array(), "table_name")) + + assert(table.name == identifier) + assert(table.partitioning.isEmpty) + assert(table.properties == withDefaultOwnership(Map("provider" -> "foo")).asJava) + assert(table.schema == new StructType().add("i", "int")) + + val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Row(1)) + + sql(s"INSERT INTO $identifier SELECT CAST(null AS INT)") + val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) + checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), Seq(Row(1), Row(null))) + } + } + + test("DropTable: basic") { + val tableName = "testcat.ns1.ns2.tbl" + val ident = Identifier.of(Array("ns1", "ns2"), "tbl") + sql(s"CREATE TABLE $tableName USING foo AS SELECT id, data FROM source") 
+ assert(catalog("testcat").asTableCatalog.tableExists(ident) === true) + sql(s"DROP TABLE $tableName") + assert(catalog("testcat").asTableCatalog.tableExists(ident) === false) + } + + test("DropTable: table qualified with the session catalog name") { + val ident = Identifier.of(Array(), "tbl") + sql("CREATE TABLE tbl USING json AS SELECT 1 AS i") + assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) + sql("DROP TABLE spark_catalog.tbl") + assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === false) + } + + test("DropTable: if exists") { + intercept[NoSuchTableException] { + sql(s"DROP TABLE testcat.db.notbl") + } + sql(s"DROP TABLE IF EXISTS testcat.db.notbl") + } + + test("Relation: basic") { + val t1 = "testcat.ns1.ns2.tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") + checkAnswer(sql(s"TABLE $t1"), spark.table("source")) + checkAnswer(sql(s"SELECT * FROM $t1"), spark.table("source")) + } + } + + test("Relation: SparkSession.table()") { + val t1 = "testcat.ns1.ns2.tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") + checkAnswer(spark.table(s"$t1"), spark.table("source")) + } + } + + test("Relation: CTE") { + val t1 = "testcat.ns1.ns2.tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") + checkAnswer( + sql(s""" + |WITH cte AS (SELECT * FROM $t1) + |SELECT * FROM cte + """.stripMargin), + spark.table("source")) + } + } + + test("Relation: view text") { + val t1 = "testcat.ns1.ns2.tbl" + withTable(t1) { + withView("view1") { v1: String => + sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") + sql(s"CREATE VIEW $v1 AS SELECT * from $t1") + checkAnswer(sql(s"TABLE $v1"), spark.table("source")) + } + } + } + + test("Relation: join tables in 2 catalogs") { + val t1 = "testcat.ns1.ns2.tbl" + val t2 = "testcat2.v2tbl" + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM 
source") + sql(s"CREATE TABLE $t2 USING foo AS SELECT id, data FROM source2") + val df1 = spark.table("source") + val df2 = spark.table("source2") + val df_joined = df1.join(df2).where(df1("id") + 1 === df2("id")) + checkAnswer( + sql(s""" + |SELECT * + |FROM $t1 t1, $t2 t2 + |WHERE t1.id + 1 = t2.id + """.stripMargin), + df_joined) + } + } + + test("qualified column names for v2 tables") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + // Field list restored (it was stripped to a bare `struct`): the INSERT below + // supplies a two-field tuple and the SELECTs resolve `point.x`, so the column + // must be a struct with fields x and y. + sql(s"CREATE TABLE $t (id bigint, point struct<x: bigint, y: bigint>) USING foo") + sql(s"INSERT INTO $t VALUES (1, (10, 20))") + + checkAnswer( + sql(s"SELECT testcat.ns1.ns2.tbl.id, testcat.ns1.ns2.tbl.point.x FROM $t"), + Row(1, 10)) + checkAnswer(sql(s"SELECT ns1.ns2.tbl.id, ns1.ns2.tbl.point.x FROM $t"), Row(1, 10)) + checkAnswer(sql(s"SELECT ns2.tbl.id, ns2.tbl.point.x FROM $t"), Row(1, 10)) + checkAnswer(sql(s"SELECT tbl.id, tbl.point.x FROM $t"), Row(1, 10)) + + val ex = intercept[AnalysisException] { + sql(s"SELECT ns1.ns2.ns3.tbl.id from $t") + } + assert(ex.getMessage.contains("cannot resolve '`ns1.ns2.ns3.tbl.id`")) + } + } + + test("qualified column names for v1 tables") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + + withTable("t") { + sql("CREATE TABLE t USING json AS SELECT 1 AS i") + checkAnswer(sql("select default.t.i from spark_catalog.t"), Row(1)) + checkAnswer(sql("select t.i from spark_catalog.default.t"), Row(1)) + checkAnswer(sql("select default.t.i from spark_catalog.default.t"), Row(1)) + + // catalog name cannot be used for v1 tables.
+ val ex = intercept[AnalysisException] { + sql(s"select spark_catalog.default.t.i from spark_catalog.default.t") + } + assert(ex.getMessage.contains("cannot resolve '`spark_catalog.default.t.i`")) + } + } + + test("InsertInto: append - across catalog") { + val t1 = "testcat.ns1.ns2.tbl" + val t2 = "testcat2.db.tbl" + withTable(t1, t2) { + sql(s"CREATE TABLE $t1 USING foo AS SELECT * FROM source") + sql(s"CREATE TABLE $t2 (id bigint, data string) USING foo") + sql(s"INSERT INTO $t2 SELECT * FROM $t1") + checkAnswer(spark.table(t2), spark.table("source")) + } + } + + test("ShowTables: using v2 catalog") { + spark.sql("CREATE TABLE testcat.db.table_name (id bigint, data string) USING foo") + spark.sql("CREATE TABLE testcat.n1.n2.db.table_name (id bigint, data string) USING foo") + + runShowTablesSql("SHOW TABLES FROM testcat.db", Seq(Row("db", "table_name"))) + + runShowTablesSql( + "SHOW TABLES FROM testcat.n1.n2.db", + Seq(Row("n1.n2.db", "table_name"))) + } + + test("ShowTables: using v2 catalog with a pattern") { + spark.sql("CREATE TABLE testcat.db.table (id bigint, data string) USING foo") + spark.sql("CREATE TABLE testcat.db.table_name_1 (id bigint, data string) USING foo") + spark.sql("CREATE TABLE testcat.db.table_name_2 (id bigint, data string) USING foo") + spark.sql("CREATE TABLE testcat.db2.table_name_2 (id bigint, data string) USING foo") + + runShowTablesSql( + "SHOW TABLES FROM testcat.db", + Seq( + Row("db", "table"), + Row("db", "table_name_1"), + Row("db", "table_name_2"))) + + runShowTablesSql( + "SHOW TABLES FROM testcat.db LIKE '*name*'", + Seq(Row("db", "table_name_1"), Row("db", "table_name_2"))) + + runShowTablesSql( + "SHOW TABLES FROM testcat.db LIKE '*2'", + Seq(Row("db", "table_name_2"))) + } + + test("ShowTables: using v2 catalog, namespace doesn't exist") { + runShowTablesSql("SHOW TABLES FROM testcat.unknown", Seq()) + } + + test("ShowTables: using v1 catalog") { + runShowTablesSql( + "SHOW TABLES FROM default", + Seq(Row("", "source", 
true), Row("", "source2", true)), + expectV2Catalog = false) + } + + test("ShowTables: using v1 catalog, db doesn't exist ") { + // 'db' below resolves to a database name for v1 catalog because there is no catalog named + // 'db' and there is no default catalog set. + val exception = intercept[NoSuchDatabaseException] { + runShowTablesSql("SHOW TABLES FROM db", Seq(), expectV2Catalog = false) + } + + assert(exception.getMessage.contains("Database 'db' not found")) + } + + test("ShowTables: using v1 catalog, db name with multipartIdentifier ('a.b') is not allowed.") { + val exception = intercept[AnalysisException] { + runShowTablesSql("SHOW TABLES FROM a.b", Seq(), expectV2Catalog = false) + } + + assert(exception.getMessage.contains("The database name is not valid: a.b")) + } + + test("ShowTables: using v2 catalog with empty namespace") { + spark.sql("CREATE TABLE testcat.table (id bigint, data string) USING foo") + runShowTablesSql("SHOW TABLES FROM testcat", Seq(Row("", "table"))) + } + + test("ShowTables: namespace is not specified and default v2 catalog is set") { + spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat") + spark.sql("CREATE TABLE testcat.table (id bigint, data string) USING foo") + + // v2 catalog is used where default namespace is empty for TestInMemoryTableCatalog. + runShowTablesSql("SHOW TABLES", Seq(Row("", "table"))) + } + + test("ShowTables: namespace not specified and default v2 catalog not set - fallback to v1") { + runShowTablesSql( + "SHOW TABLES", + Seq(Row("", "source", true), Row("", "source2", true)), + expectV2Catalog = false) + + runShowTablesSql( + "SHOW TABLES LIKE '*2'", + Seq(Row("", "source2", true)), + expectV2Catalog = false) + } + + test("ShowTables: change current catalog and namespace with USE statements") { + sql("CREATE TABLE testcat.ns1.ns2.table (id bigint) USING foo") + + // Initially, the v2 session catalog (current catalog) is used. 
+ runShowTablesSql( + "SHOW TABLES", Seq(Row("", "source", true), Row("", "source2", true)), + expectV2Catalog = false) + + // Update the current catalog, and no table is matched since the current namespace is Array(). + sql("USE testcat") + runShowTablesSql("SHOW TABLES", Seq()) + + // Update the current namespace to match ns1.ns2.table. + sql("USE testcat.ns1.ns2") + runShowTablesSql("SHOW TABLES", Seq(Row("ns1.ns2", "table"))) + } + + private def runShowTablesSql( + sqlText: String, + expected: Seq[Row], + expectV2Catalog: Boolean = true): Unit = { + val schema = if (expectV2Catalog) { + new StructType() + .add("namespace", StringType, nullable = false) + .add("tableName", StringType, nullable = false) + } else { + new StructType() + .add("database", StringType, nullable = false) + .add("tableName", StringType, nullable = false) + .add("isTemporary", BooleanType, nullable = false) + } + + val df = spark.sql(sqlText) + assert(df.schema === schema) + assert(expected === df.collect()) + } + + test("SHOW TABLE EXTENDED not valid v1 database") { + def testV1CommandNamespace(sqlCommand: String, namespace: String): Unit = { + val e = intercept[AnalysisException] { + sql(sqlCommand) + } + assert(e.message.contains(s"The database name is not valid: ${namespace}")) + } + + val namespace = "testcat.ns1.ns2" + val table = "tbl" + withTable(s"$namespace.$table") { + sql(s"CREATE TABLE $namespace.$table (id bigint, data string) " + + s"USING foo PARTITIONED BY (id)") + + testV1CommandNamespace(s"SHOW TABLE EXTENDED FROM $namespace LIKE 'tb*'", + namespace) + testV1CommandNamespace(s"SHOW TABLE EXTENDED IN $namespace LIKE 'tb*'", + namespace) + testV1CommandNamespace("SHOW TABLE EXTENDED " + + s"FROM $namespace LIKE 'tb*' PARTITION(id=1)", + namespace) + testV1CommandNamespace("SHOW TABLE EXTENDED " + + s"IN $namespace LIKE 'tb*' PARTITION(id=1)", + namespace) + } + } + + test("SHOW TABLE EXTENDED valid v1") { + val expected = Seq(Row("", "source", true), Row("", "source2", 
true)) + val schema = new StructType() + .add("database", StringType, nullable = false) + .add("tableName", StringType, nullable = false) + .add("isTemporary", BooleanType, nullable = false) + .add("information", StringType, nullable = false) + + val df = sql("SHOW TABLE EXTENDED FROM default LIKE '*source*'") + val result = df.collect() + val resultWithoutInfo = result.map{ case Row(db, table, temp, _) => Row(db, table, temp)} + + assert(df.schema === schema) + assert(resultWithoutInfo === expected) + result.foreach{ case Row(_, _, _, info: String) => assert(info.nonEmpty)} + } + + test("CreateNameSpace: basic tests") { + // Session catalog is used. + withNamespace("ns") { + sql("CREATE NAMESPACE ns") + testShowNamespaces("SHOW NAMESPACES", Seq("default", "ns")) + } + + // V2 non-session catalog is used. + withNamespace("testcat.ns1.ns2") { + sql("CREATE NAMESPACE testcat.ns1.ns2") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq("ns1")) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1", Seq("ns1.ns2")) + } + + withNamespace("testcat.test") { + withTempDir { tmpDir => + val path = tmpDir.getCanonicalPath + sql(s"CREATE NAMESPACE testcat.test LOCATION '$path'") + val metadata = + catalog("testcat").asNamespaceCatalog.loadNamespaceMetadata(Array("test")).asScala + val catalogPath = metadata(SupportsNamespaces.PROP_LOCATION) + assert(catalogPath.equals(catalogPath)) + } + } + } + + test("CreateNameSpace: test handling of 'IF NOT EXIST'") { + withNamespace("testcat.ns1") { + sql("CREATE NAMESPACE IF NOT EXISTS testcat.ns1") + + // The 'ns1' namespace already exists, so this should fail. + val exception = intercept[NamespaceAlreadyExistsException] { + sql("CREATE NAMESPACE testcat.ns1") + } + assert(exception.getMessage.contains("Namespace 'ns1' already exists")) + + // The following will be no-op since the namespace already exists. 
+ sql("CREATE NAMESPACE IF NOT EXISTS testcat.ns1") + } + } + + test("CreateNameSpace: reserved properties") { + import SupportsNamespaces._ + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + val exception = intercept[ParseException] { + sql(s"CREATE NAMESPACE testcat.reservedTest WITH DBPROPERTIES('$key'='dummyVal')") + } + assert(exception.getMessage.contains(s"$key is a reserved namespace property")) + } + } + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + withNamespace("testcat.reservedTest") { + sql(s"CREATE NAMESPACE testcat.reservedTest WITH DBPROPERTIES('$key'='foo')") + assert(sql("DESC NAMESPACE EXTENDED testcat.reservedTest") + .toDF("k", "v") + .where("k='Properties'") + .isEmpty, s"$key is a reserved namespace property and ignored") + val meta = + catalog("testcat").asNamespaceCatalog.loadNamespaceMetadata(Array("reservedTest")) + assert(meta.get(key) == null || !meta.get(key).contains("foo"), + "reserved properties should not have side effects") + } + } + } + } + + test("create/replace/alter table - reserved properties") { + import TableCatalog._ + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { + CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => + Seq("CREATE", "REPLACE").foreach { action => + val e = intercept[ParseException] { + sql(s"$action TABLE testcat.reservedTest (key int) USING foo $clause ('$key'='bar')") + } + assert(e.getMessage.contains(s"$key is a reserved table property")) + } + } + + val e1 = intercept[ParseException] { + sql(s"ALTER TABLE testcat.reservedTest SET TBLPROPERTIES ('$key'='bar')") + } + assert(e1.getMessage.contains(s"$key is a reserved table property")) + + val e2 = intercept[ParseException] 
{ + sql(s"ALTER TABLE testcat.reservedTest UNSET TBLPROPERTIES ('$key')") + } + assert(e2.getMessage.contains(s"$key is a reserved table property")) + } + } + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { + CatalogV2Util.TABLE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + Seq("OPTIONS", "TBLPROPERTIES").foreach { clause => + withTable("testcat.reservedTest") { + Seq("CREATE", "REPLACE").foreach { action => + sql(s"$action TABLE testcat.reservedTest (key int) USING foo $clause ('$key'='bar')") + val tableCatalog = catalog("testcat").asTableCatalog + val identifier = Identifier.of(Array(), "reservedTest") + val originValue = tableCatalog.loadTable(identifier).properties().get(key) + assert(originValue != "bar", "reserved properties should not have side effects") + sql(s"ALTER TABLE testcat.reservedTest SET TBLPROPERTIES ('$key'='newValue')") + assert(tableCatalog.loadTable(identifier).properties().get(key) == originValue, + "reserved properties should not have side effects") + sql(s"ALTER TABLE testcat.reservedTest UNSET TBLPROPERTIES ('$key')") + assert(tableCatalog.loadTable(identifier).properties().get(key) == originValue, + "reserved properties should not have side effects") + } + } + } + } + } + } + + test("create/replace - path property") { + Seq("true", "false").foreach { conf => + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, conf)) { + withTable("testcat.reservedTest") { + Seq("CREATE", "REPLACE").foreach { action => + val e1 = intercept[ParseException] { + sql(s"$action TABLE testcat.reservedTest USING foo LOCATION 'foo' OPTIONS" + + s" ('path'='bar')") + } + assert(e1.getMessage.contains(s"Duplicated table paths found: 'foo' and 'bar'")) + + val e2 = intercept[ParseException] { + sql(s"$action TABLE testcat.reservedTest USING foo OPTIONS" + + s" ('path'='foo', 'PaTh'='bar')") + } + assert(e2.getMessage.contains(s"Duplicated table paths found: 'foo' and 'bar'")) + + sql(s"$action TABLE testcat.reservedTest 
USING foo LOCATION 'foo' TBLPROPERTIES" + + s" ('path'='bar', 'Path'='noop')") + val tableCatalog = catalog("testcat").asTableCatalog + val identifier = Identifier.of(Array(), "reservedTest") + assert(tableCatalog.loadTable(identifier).properties() + .get(TableCatalog.PROP_LOCATION) == "foo", + "path as a table property should not have side effects") + assert(tableCatalog.loadTable(identifier).properties().get("path") == "bar", + "path as a table property should not have side effects") + assert(tableCatalog.loadTable(identifier).properties().get("Path") == "noop", + "path as a table property should not have side effects") + } + } + } + } + } + + test("DropNamespace: basic tests") { + // Session catalog is used. + sql("CREATE NAMESPACE ns") + testShowNamespaces("SHOW NAMESPACES", Seq("default", "ns")) + sql("DROP NAMESPACE ns") + testShowNamespaces("SHOW NAMESPACES", Seq("default")) + + // V2 non-session catalog is used. + sql("CREATE NAMESPACE testcat.ns1") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq("ns1")) + sql("DROP NAMESPACE testcat.ns1") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq()) + } + + test("DropNamespace: drop non-empty namespace with a non-cascading mode") { + sql("CREATE TABLE testcat.ns1.table (id bigint) USING foo") + sql("CREATE TABLE testcat.ns1.ns2.table (id bigint) USING foo") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq("ns1")) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1", Seq("ns1.ns2")) + + def assertDropFails(): Unit = { + val e = intercept[SparkException] { + sql("DROP NAMESPACE testcat.ns1") + } + assert(e.getMessage.contains("Cannot drop a non-empty namespace: ns1")) + } + + // testcat.ns1.table is present, thus testcat.ns1 cannot be dropped. + assertDropFails() + sql("DROP TABLE testcat.ns1.table") + + // testcat.ns1.ns2.table is present, thus testcat.ns1 cannot be dropped. 
+ assertDropFails() + sql("DROP TABLE testcat.ns1.ns2.table") + + // testcat.ns1.ns2 namespace is present, thus testcat.ns1 cannot be dropped. + assertDropFails() + sql("DROP NAMESPACE testcat.ns1.ns2") + + // Now that testcat.ns1 is empty, it can be dropped. + sql("DROP NAMESPACE testcat.ns1") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq()) + } + + test("DropNamespace: drop non-empty namespace with a cascade mode") { + sql("CREATE TABLE testcat.ns1.table (id bigint) USING foo") + sql("CREATE TABLE testcat.ns1.ns2.table (id bigint) USING foo") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq("ns1")) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1", Seq("ns1.ns2")) + + sql("DROP NAMESPACE testcat.ns1 CASCADE") + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq()) + } + + test("DropNamespace: test handling of 'IF EXISTS'") { + sql("DROP NAMESPACE IF EXISTS testcat.unknown") + + val exception = intercept[NoSuchNamespaceException] { + sql("DROP NAMESPACE testcat.ns1") + } + assert(exception.getMessage.contains("Namespace 'ns1' not found")) + } + + test("DescribeNamespace using v2 catalog") { + withNamespace("testcat.ns1.ns2") { + sql("CREATE NAMESPACE IF NOT EXISTS testcat.ns1.ns2 COMMENT " + + "'test namespace' LOCATION '/tmp/ns_test'") + val descriptionDf = sql("DESCRIBE NAMESPACE testcat.ns1.ns2") + assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === + Seq( + ("name", StringType), + ("value", StringType) + )) + val description = descriptionDf.collect() + assert(description === Seq( + Row("Namespace Name", "ns2"), + Row(SupportsNamespaces.PROP_COMMENT.capitalize, "test namespace"), + Row(SupportsNamespaces.PROP_LOCATION.capitalize, "/tmp/ns_test"), + Row(SupportsNamespaces.PROP_OWNER.capitalize, defaultUser)) + ) + } + } + + test("AlterNamespaceSetProperties using v2 catalog") { + withNamespace("testcat.ns1.ns2") { + sql("CREATE NAMESPACE IF NOT EXISTS testcat.ns1.ns2 COMMENT " + + "'test namespace' LOCATION 
'/tmp/ns_test' WITH PROPERTIES ('a'='a','b'='b','c'='c')") + sql("ALTER NAMESPACE testcat.ns1.ns2 SET PROPERTIES ('a'='b','b'='a')") + val descriptionDf = sql("DESCRIBE NAMESPACE EXTENDED testcat.ns1.ns2") + assert(descriptionDf.collect() === Seq( + Row("Namespace Name", "ns2"), + Row(SupportsNamespaces.PROP_COMMENT.capitalize, "test namespace"), + Row(SupportsNamespaces.PROP_LOCATION.capitalize, "/tmp/ns_test"), + Row(SupportsNamespaces.PROP_OWNER.capitalize, defaultUser), + Row("Properties", "((a,b),(b,a),(c,c))")) + ) + } + } + + test("AlterNamespaceSetProperties: reserved properties") { + import SupportsNamespaces._ + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "false")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + withNamespace("testcat.reservedTest") { + sql("CREATE NAMESPACE testcat.reservedTest") + val exception = intercept[ParseException] { + sql(s"ALTER NAMESPACE testcat.reservedTest SET PROPERTIES ('$key'='dummyVal')") + } + assert(exception.getMessage.contains(s"$key is a reserved namespace property")) + } + } + } + withSQLConf((SQLConf.LEGACY_PROPERTY_NON_RESERVED.key, "true")) { + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filterNot(_ == PROP_COMMENT).foreach { key => + withNamespace("testcat.reservedTest") { + sql(s"CREATE NAMESPACE testcat.reservedTest") + sql(s"ALTER NAMESPACE testcat.reservedTest SET PROPERTIES ('$key'='foo')") + assert(sql("DESC NAMESPACE EXTENDED testcat.reservedTest") + .toDF("k", "v") + .where("k='Properties'") + .isEmpty, s"$key is a reserved namespace property and ignored") + val meta = + catalog("testcat").asNamespaceCatalog.loadNamespaceMetadata(Array("reservedTest")) + assert(meta.get(key) == null || !meta.get(key).contains("foo"), + "reserved properties should not have side effects") + } + } + } + } + + test("AlterNamespaceSetLocation using v2 catalog") { + withNamespace("testcat.ns1.ns2") { + sql("CREATE NAMESPACE IF NOT EXISTS testcat.ns1.ns2 COMMENT " + + 
"'test namespace' LOCATION '/tmp/ns_test_1'") + sql("ALTER NAMESPACE testcat.ns1.ns2 SET LOCATION '/tmp/ns_test_2'") + val descriptionDf = sql("DESCRIBE NAMESPACE EXTENDED testcat.ns1.ns2") + assert(descriptionDf.collect() === Seq( + Row("Namespace Name", "ns2"), + Row(SupportsNamespaces.PROP_COMMENT.capitalize, "test namespace"), + Row(SupportsNamespaces.PROP_LOCATION.capitalize, "/tmp/ns_test_2"), + Row(SupportsNamespaces.PROP_OWNER.capitalize, defaultUser)) + ) + } + } + + test("ShowNamespaces: show root namespaces with default v2 catalog") { + spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat") + + testShowNamespaces("SHOW NAMESPACES", Seq()) + + spark.sql("CREATE TABLE testcat.ns1.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns1.ns1_1.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns2.table (id bigint) USING foo") + + testShowNamespaces("SHOW NAMESPACES", Seq("ns1", "ns2")) + testShowNamespaces("SHOW NAMESPACES LIKE '*1*'", Seq("ns1")) + } + + test("ShowNamespaces: show namespaces with v2 catalog") { + spark.sql("CREATE TABLE testcat.ns1.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns1.ns1_1.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns1.ns1_2.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns2.table (id bigint) USING foo") + spark.sql("CREATE TABLE testcat.ns2.ns2_1.table (id bigint) USING foo") + + // Look up only with catalog name, which should list root namespaces. + testShowNamespaces("SHOW NAMESPACES IN testcat", Seq("ns1", "ns2")) + + // Look up sub-namespaces. + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1", Seq("ns1.ns1_1", "ns1.ns1_2")) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1 LIKE '*2*'", Seq("ns1.ns1_2")) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns2", Seq("ns2.ns2_1")) + + // Try to look up namespaces that do not exist. 
+ testShowNamespaces("SHOW NAMESPACES IN testcat.ns3", Seq()) + testShowNamespaces("SHOW NAMESPACES IN testcat.ns1.ns3", Seq()) + } + + test("ShowNamespaces: default v2 catalog is not set") { + spark.sql("CREATE TABLE testcat.ns.table (id bigint) USING foo") + + // The current catalog is resolved to a v2 session catalog. + testShowNamespaces("SHOW NAMESPACES", Seq("default")) + } + + test("ShowNamespaces: default v2 catalog doesn't support namespace") { + spark.conf.set( + "spark.sql.catalog.testcat_no_namspace", + classOf[BasicInMemoryTableCatalog].getName) + spark.conf.set(SQLConf.DEFAULT_CATALOG.key, "testcat_no_namspace") + + val exception = intercept[AnalysisException] { + sql("SHOW NAMESPACES") + } + + assert(exception.getMessage.contains("does not support namespaces")) + } + + test("ShowNamespaces: v2 catalog doesn't support namespace") { + spark.conf.set( + "spark.sql.catalog.testcat_no_namspace", + classOf[BasicInMemoryTableCatalog].getName) + + val exception = intercept[AnalysisException] { + sql("SHOW NAMESPACES in testcat_no_namspace") + } + + assert(exception.getMessage.contains("does not support namespaces")) + } + + test("ShowNamespaces: session catalog is used and namespace doesn't exist") { + val exception = intercept[AnalysisException] { + sql("SHOW NAMESPACES in dummy") + } + + assert(exception.getMessage.contains("Namespace 'dummy' not found")) + } + + test("ShowNamespaces: change catalog and namespace with USE statements") { + sql("CREATE TABLE testcat.ns1.ns2.table (id bigint) USING foo") + + // Initially, the current catalog is a v2 session catalog. + testShowNamespaces("SHOW NAMESPACES", Seq("default")) + + // Update the current catalog to 'testcat'. + sql("USE testcat") + testShowNamespaces("SHOW NAMESPACES", Seq("ns1")) + + // Update the current namespace to 'ns1'. + sql("USE ns1") + // 'SHOW NAMESPACES' is not affected by the current namespace and lists root namespaces. 
+ testShowNamespaces("SHOW NAMESPACES", Seq("ns1")) + } + + private def testShowNamespaces( + sqlText: String, + expected: Seq[String]): Unit = { + val schema = new StructType().add("namespace", StringType, nullable = false) + + val df = spark.sql(sqlText) + assert(df.schema === schema) + assert(df.collect().map(_.getAs[String](0)).sorted === expected.sorted) + } + + test("Use: basic tests with USE statements") { + val catalogManager = spark.sessionState.catalogManager + + // Validate the initial current catalog and namespace. + assert(catalogManager.currentCatalog.name() == SESSION_CATALOG_NAME) + assert(catalogManager.currentNamespace === Array("default")) + + // The following implicitly creates namespaces. + sql("CREATE TABLE testcat.ns1.ns1_1.table (id bigint) USING foo") + sql("CREATE TABLE testcat2.ns2.ns2_2.table (id bigint) USING foo") + sql("CREATE TABLE testcat2.ns3.ns3_3.table (id bigint) USING foo") + sql("CREATE TABLE testcat2.testcat.table (id bigint) USING foo") + + // Catalog is resolved to 'testcat'. + sql("USE testcat.ns1.ns1_1") + assert(catalogManager.currentCatalog.name() == "testcat") + assert(catalogManager.currentNamespace === Array("ns1", "ns1_1")) + + // Catalog is resolved to 'testcat2'. + sql("USE testcat2.ns2.ns2_2") + assert(catalogManager.currentCatalog.name() == "testcat2") + assert(catalogManager.currentNamespace === Array("ns2", "ns2_2")) + + // Only the namespace is changed. + sql("USE ns3.ns3_3") + assert(catalogManager.currentCatalog.name() == "testcat2") + assert(catalogManager.currentNamespace === Array("ns3", "ns3_3")) + + // Only the namespace is changed (explicit). + sql("USE NAMESPACE testcat") + assert(catalogManager.currentCatalog.name() == "testcat2") + assert(catalogManager.currentNamespace === Array("testcat")) + + // Catalog is resolved to `testcat`. 
+ sql("USE testcat") + assert(catalogManager.currentCatalog.name() == "testcat") + assert(catalogManager.currentNamespace === Array()) + } + + test("Use: set v2 catalog as a current catalog") { + val catalogManager = spark.sessionState.catalogManager + assert(catalogManager.currentCatalog.name() == SESSION_CATALOG_NAME) + + sql("USE testcat") + assert(catalogManager.currentCatalog.name() == "testcat") + } + + test("Use: v2 session catalog is used and namespace does not exist") { + val exception = intercept[NoSuchDatabaseException] { + sql("USE ns1") + } + assert(exception.getMessage.contains("Database 'ns1' not found")) + } + + test("Use: v2 catalog is used and namespace does not exist") { + // Namespaces are not required to exist for v2 catalogs. + sql("USE testcat.ns1.ns2") + val catalogManager = spark.sessionState.catalogManager + assert(catalogManager.currentNamespace === Array("ns1", "ns2")) + } + + test("ShowCurrentNamespace: basic tests") { + def testShowCurrentNamespace(expectedCatalogName: String, expectedNamespace: String): Unit = { + val schema = new StructType() + .add("catalog", StringType, nullable = false) + .add("namespace", StringType, nullable = false) + val df = sql("SHOW CURRENT NAMESPACE") + val rows = df.collect + + assert(df.schema === schema) + assert(rows.length == 1) + assert(rows(0).getAs[String](0) === expectedCatalogName) + assert(rows(0).getAs[String](1) === expectedNamespace) + } + + // Initially, the v2 session catalog is set as a current catalog. 
+ testShowCurrentNamespace("spark_catalog", "default") + + sql("USE testcat") + testShowCurrentNamespace("testcat", "") + sql("USE testcat.ns1.ns2") + testShowCurrentNamespace("testcat", "ns1.ns2") + } + + test("tableCreation: partition column case insensitive resolution") { + val testCatalog = catalog("testcat").asTableCatalog + val sessionCatalog = catalog(SESSION_CATALOG_NAME).asTableCatalog + + def checkPartitioning(cat: TableCatalog, partition: String): Unit = { + val table = cat.loadTable(Identifier.of(Array.empty, "tbl")) + val partitions = table.partitioning().map(_.references()) + assert(partitions.length === 1) + val fieldNames = partitions.flatMap(_.map(_.fieldNames())) + assert(fieldNames === Array(Array(partition))) + } + + sql(s"CREATE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") + checkPartitioning(sessionCatalog, "a") + sql(s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") + checkPartitioning(testCatalog, "a") + sql(s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") + checkPartitioning(sessionCatalog, "b") + sql(s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") + checkPartitioning(testCatalog, "b") + } + + test("tableCreation: partition column case sensitive resolution") { + def checkFailure(statement: String): Unit = { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + val e = intercept[AnalysisException] { + sql(statement) + } + assert(e.getMessage.contains("Couldn't find column")) + } + } + + checkFailure(s"CREATE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") + checkFailure(s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") + checkFailure( + s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") + checkFailure( + s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") + } + + test("tableCreation: 
duplicate column names in the table definition") { + val errorMsg = "Found duplicate column(s) in the table definition of t" + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + assertAnalysisError( + s"CREATE TABLE t ($c0 INT, $c1 INT) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE t ($c0 INT, $c1 INT) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", + errorMsg + ) + } + } + } + + test("tableCreation: duplicate nested column names in the table definition") { + val errorMsg = "Found duplicate column(s) in the table definition of t" + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + assertAnalysisError( + s"CREATE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE TABLE testcat.t (d struct<$c0: INT, $c1: INT>) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE testcat.t (d struct<$c0: INT, $c1: INT>) USING $v2Source", + errorMsg + ) + } + } + } + + test("tableCreation: bucket column names not in table definition") { + val errorMsg = "Couldn't find column c in" + assertAnalysisError( + s"CREATE TABLE tbl (a int, b string) USING $v2Source CLUSTERED BY (c) INTO 4 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source CLUSTERED BY (c) INTO 4 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source " + + "CLUSTERED BY 
(c) INTO 4 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source " + + "CLUSTERED BY (c) INTO 4 BUCKETS", + errorMsg + ) + } + + test("tableCreation: bucket column name containing dot") { + withTable("t") { + sql( + """ + |CREATE TABLE testcat.t (id int, `a.b` string) USING foo + |CLUSTERED BY (`a.b`) INTO 4 BUCKETS + |OPTIONS ('allow-unsupported-transforms'=true) + """.stripMargin) + + val testCatalog = catalog("testcat").asTableCatalog.asInstanceOf[InMemoryTableCatalog] + val table = testCatalog.loadTable(Identifier.of(Array.empty, "t")) + val partitioning = table.partitioning() + assert(partitioning.length == 1 && partitioning.head.name() == "bucket") + val references = partitioning.head.references() + assert(references.length == 1) + assert(references.head.fieldNames().toSeq == Seq("a.b")) + } + } + + test("tableCreation: column repeated in partition columns") { + val errorMsg = "Found duplicate column(s) in the partitioning" + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + assertAnalysisError( + s"CREATE TABLE t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", + errorMsg + ) + assertAnalysisError( + s"CREATE TABLE testcat.t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE testcat.t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", + errorMsg + ) + } + } + } + + test("tableCreation: column repeated in bucket columns") { + val errorMsg = "Found duplicate column(s) in the bucket definition" + Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + assertAnalysisError( + s"CREATE 
TABLE t ($c0 INT) USING $v2Source " + + s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE TABLE testcat.t ($c0 INT) USING $v2Source " + + s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE t ($c0 INT) USING $v2Source " + + s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", + errorMsg + ) + assertAnalysisError( + s"CREATE OR REPLACE TABLE testcat.t ($c0 INT) USING $v2Source " + + s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", + errorMsg + ) + } + } + } + + test("REFRESH TABLE: v2 table") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + + val testCatalog = catalog("testcat").asTableCatalog.asInstanceOf[InMemoryTableCatalog] + val identifier = Identifier.of(Array("ns1", "ns2"), "tbl") + + assert(!testCatalog.isTableInvalidated(identifier)) + sql(s"REFRESH TABLE $t") + assert(testCatalog.isTableInvalidated(identifier)) + } + } + + test("REPLACE TABLE: v1 table") { + val e = intercept[AnalysisException] { + sql(s"CREATE OR REPLACE TABLE tbl (a int) USING ${classOf[SimpleScanSource].getName}") + } + assert(e.message.contains("REPLACE TABLE is only supported with v2 tables")) + } + + test("DeleteFrom: basic - delete all") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + sql(s"DELETE FROM $t") + checkAnswer(spark.table(t), Seq()) + } + } + + test("DeleteFrom: basic - delete with where clause") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + sql(s"DELETE FROM $t WHERE id = 2") + checkAnswer(spark.table(t), Seq( + Row(3, "c", 3))) + } + } + + test("DeleteFrom: delete from aliased target table") 
{ + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + sql(s"DELETE FROM $t AS tbl WHERE tbl.id = 2") + checkAnswer(spark.table(t), Seq( + Row(3, "c", 3))) + } + } + + test("DeleteFrom: normalize attribute names") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + sql(s"DELETE FROM $t AS tbl WHERE tbl.ID = 2") + checkAnswer(spark.table(t), Seq( + Row(3, "c", 3))) + } + } + + test("DeleteFrom: fail if has subquery") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") + sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") + val exc = intercept[AnalysisException] { + sql(s"DELETE FROM $t WHERE id IN (SELECT id FROM $t)") + } + + assert(spark.table(t).count === 3) + assert(exc.getMessage.contains("Delete by condition with subquery is not supported")) + } + } + + test("DeleteFrom: DELETE is only supported with v2 tables") { + // unset this config to use the default v2 session catalog. 
+ spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + val v1Table = "tbl" + withTable(v1Table) { + sql(s"CREATE TABLE $v1Table" + + s" USING ${classOf[SimpleScanSource].getName} OPTIONS (from=0,to=1)") + val exc = intercept[AnalysisException] { + sql(s"DELETE FROM $v1Table WHERE i = 2") + } + + assert(exc.getMessage.contains("DELETE is only supported with v2 tables")) + } + } + + test("UPDATE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql( + s""" + |CREATE TABLE $t (id bigint, name string, age int, p int) + |USING foo + |PARTITIONED BY (id, p) + """.stripMargin) + + // UPDATE non-existing table + assertAnalysisError( + "UPDATE dummy SET name='abc'", + "Table or view not found") + + // UPDATE non-existing column + assertAnalysisError( + s"UPDATE $t SET dummy='abc'", + "cannot resolve") + assertAnalysisError( + s"UPDATE $t SET name='abc' WHERE dummy=1", + "cannot resolve") + + // UPDATE is not implemented yet. + val e = intercept[UnsupportedOperationException] { + sql(s"UPDATE $t SET name='Robert', age=32 WHERE p=1") + } + assert(e.getMessage.contains("UPDATE TABLE is not supported temporarily")) + } + } + + test("MERGE INTO TABLE") { + val target = "testcat.ns1.ns2.target" + val source = "testcat.ns1.ns2.source" + withTable(target, source) { + sql( + s""" + |CREATE TABLE $target (id bigint, name string, age int, p int) + |USING foo + |PARTITIONED BY (id, p) + """.stripMargin) + sql( + s""" + |CREATE TABLE $source (id bigint, name string, age int, p int) + |USING foo + |PARTITIONED BY (id, p) + """.stripMargin) + + // MERGE INTO non-existing table + assertAnalysisError( + s""" + |MERGE INTO testcat.ns1.ns2.dummy AS target + |USING testcat.ns1.ns2.source AS source + |ON target.id = source.id + |WHEN MATCHED AND (target.age < 10) THEN DELETE + |WHEN MATCHED AND (target.age > 10) THEN UPDATE SET * + |WHEN NOT MATCHED AND (target.col2='insert') + |THEN INSERT * + """.stripMargin, + "Table or view not found") + + // USING non-existing table + 
assertAnalysisError( + s""" + |MERGE INTO testcat.ns1.ns2.target AS target + |USING testcat.ns1.ns2.dummy AS source + |ON target.id = source.id + |WHEN MATCHED AND (target.age < 10) THEN DELETE + |WHEN MATCHED AND (target.age > 10) THEN UPDATE SET * + |WHEN NOT MATCHED AND (target.col2='insert') + |THEN INSERT * + """.stripMargin, + "Table or view not found") + + // UPDATE non-existing column + assertAnalysisError( + s""" + |MERGE INTO testcat.ns1.ns2.target AS target + |USING testcat.ns1.ns2.source AS source + |ON target.id = source.id + |WHEN MATCHED AND (target.age < 10) THEN DELETE + |WHEN MATCHED AND (target.age > 10) THEN UPDATE SET target.dummy = source.age + |WHEN NOT MATCHED AND (target.col2='insert') + |THEN INSERT * + """.stripMargin, + "cannot resolve") + + // UPDATE using non-existing column + assertAnalysisError( + s""" + |MERGE INTO testcat.ns1.ns2.target AS target + |USING testcat.ns1.ns2.source AS source + |ON target.id = source.id + |WHEN MATCHED AND (target.age < 10) THEN DELETE + |WHEN MATCHED AND (target.age > 10) THEN UPDATE SET target.age = source.dummy + |WHEN NOT MATCHED AND (target.col2='insert') + |THEN INSERT * + """.stripMargin, + "cannot resolve") + + // MERGE INTO is not implemented yet. 
+ val e = intercept[UnsupportedOperationException] { + sql( + s""" + |MERGE INTO testcat.ns1.ns2.target AS target + |USING testcat.ns1.ns2.source AS source + |ON target.id = source.id + |WHEN MATCHED AND (target.p < 0) THEN DELETE + |WHEN MATCHED AND (target.p > 0) THEN UPDATE SET * + |WHEN NOT MATCHED THEN INSERT * + """.stripMargin) + } + assert(e.getMessage.contains("MERGE INTO TABLE is not supported temporarily")) + } + } + + test("AlterTable: rename table basic test") { + withTable("testcat.ns1.new") { + sql(s"CREATE TABLE testcat.ns1.ns2.old USING foo AS SELECT id, data FROM source") + checkAnswer(sql("SHOW TABLES FROM testcat.ns1.ns2"), Seq(Row("ns1.ns2", "old"))) + + sql(s"ALTER TABLE testcat.ns1.ns2.old RENAME TO ns1.new") + checkAnswer(sql("SHOW TABLES FROM testcat.ns1.ns2"), Seq.empty) + checkAnswer(sql("SHOW TABLES FROM testcat.ns1"), Seq(Row("ns1", "new"))) + } + } + + test("AlterTable: renaming views are not supported") { + val e = intercept[AnalysisException] { + sql(s"ALTER VIEW testcat.ns.tbl RENAME TO ns.view") + } + assert(e.getMessage.contains("Renaming view is not supported in v2 catalogs")) + } + + test("ANALYZE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + testV1Command("ANALYZE TABLE", s"$t COMPUTE STATISTICS") + testV1Command("ANALYZE TABLE", s"$t COMPUTE STATISTICS FOR ALL COLUMNS") + } + } + + test("MSCK REPAIR TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + testV1Command("MSCK REPAIR TABLE", t) + } + } + + test("TRUNCATE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql( + s""" + |CREATE TABLE $t (id bigint, data string) + |USING foo + |PARTITIONED BY (id) + """.stripMargin) + + testV1Command("TRUNCATE TABLE", t) + testV1Command("TRUNCATE TABLE", s"$t PARTITION(id='1')") + } + } + + test("SHOW PARTITIONS") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql( 
+ s""" + |CREATE TABLE $t (id bigint, data string) + |USING foo + |PARTITIONED BY (id) + """.stripMargin) + + testV1Command("SHOW PARTITIONS", t) + testV1Command("SHOW PARTITIONS", s"$t PARTITION(id='1')") + } + } + + test("LOAD DATA INTO TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql( + s""" + |CREATE TABLE $t (id bigint, data string) + |USING foo + |PARTITIONED BY (id) + """.stripMargin) + + testV1Command("LOAD DATA", s"INPATH 'filepath' INTO TABLE $t") + testV1Command("LOAD DATA", s"LOCAL INPATH 'filepath' INTO TABLE $t") + testV1Command("LOAD DATA", s"LOCAL INPATH 'filepath' OVERWRITE INTO TABLE $t") + testV1Command("LOAD DATA", + s"LOCAL INPATH 'filepath' OVERWRITE INTO TABLE $t PARTITION(id=1)") + } + } + + test("SHOW CREATE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + testV1Command("SHOW CREATE TABLE", t) + } + } + + test("CACHE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + + testV1Command("CACHE TABLE", t) + + val e = intercept[AnalysisException] { + sql(s"CACHE LAZY TABLE $t") + } + assert(e.message.contains("CACHE TABLE is only supported with v1 tables")) + } + } + + test("UNCACHE TABLE") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + + testV1Command("UNCACHE TABLE", t) + testV1Command("UNCACHE TABLE", s"IF EXISTS $t") + } + } + + test("SHOW COLUMNS") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + + testV1Command("SHOW COLUMNS", s"FROM $t") + testV1Command("SHOW COLUMNS", s"IN $t") + + val e3 = intercept[AnalysisException] { + sql(s"SHOW COLUMNS FROM tbl IN testcat.ns1.ns2") + } + assert(e3.message.contains("Namespace name should have " + + "only one part if specified: testcat.ns1.ns2")) + } + } + + test("ALTER TABLE RECOVER PARTITIONS") 
{ + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t RECOVER PARTITIONS") + } + assert(e.message.contains("ALTER TABLE RECOVER PARTITIONS is only supported with v1 tables")) + } + } + + test("ALTER TABLE ADD PARTITION") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo PARTITIONED BY (id)") + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t ADD PARTITION (id=1) LOCATION 'loc'") + } + assert(e.message.contains("ALTER TABLE ADD PARTITION is only supported with v1 tables")) + } + } + + test("ALTER TABLE RENAME PARTITION") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo PARTITIONED BY (id)") + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t PARTITION (id=1) RENAME TO PARTITION (id=2)") + } + assert(e.message.contains("ALTER TABLE RENAME PARTITION is only supported with v1 tables")) + } + } + + test("ALTER TABLE DROP PARTITIONS") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo PARTITIONED BY (id)") + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t DROP PARTITION (id=1)") + } + assert(e.message.contains("ALTER TABLE DROP PARTITION is only supported with v1 tables")) + } + } + + test("ALTER TABLE SerDe properties") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo PARTITIONED BY (id)") + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $t SET SERDEPROPERTIES ('columns'='foo,bar', 'field.delim' = ',')") + } + assert(e.message.contains("ALTER TABLE SerDe Properties is only supported with v1 tables")) + } + } + + test("ALTER VIEW AS QUERY") { + val v = "testcat.ns1.ns2.v" + val e = intercept[AnalysisException] { + sql(s"ALTER VIEW $v 
AS SELECT 1") + } + assert(e.message.contains("ALTER VIEW QUERY is only supported with v1 tables")) + } + + test("CREATE VIEW") { + val v = "testcat.ns1.ns2.v" + val e = intercept[AnalysisException] { + sql(s"CREATE VIEW $v AS SELECT * FROM tab1") + } + assert(e.message.contains("CREATE VIEW is only supported with v1 tables")) + } + + test("SHOW TBLPROPERTIES: v2 table") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + val user = "andrew" + val status = "new" + val provider = "foo" + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING $provider " + + s"TBLPROPERTIES ('user'='$user', 'status'='$status')") + + val properties = sql(s"SHOW TBLPROPERTIES $t").orderBy("key") + + val schema = new StructType() + .add("key", StringType, nullable = false) + .add("value", StringType, nullable = false) + + val expected = Seq( + Row(TableCatalog.PROP_OWNER, defaultUser), + Row("provider", provider), + Row("status", status), + Row("user", user)) + + assert(properties.schema === schema) + assert(expected === properties.collect()) + } + } + + test("SHOW TBLPROPERTIES(key): v2 table") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + val user = "andrew" + val status = "new" + val provider = "foo" + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING $provider " + + s"TBLPROPERTIES ('user'='$user', 'status'='$status')") + + val properties = sql(s"SHOW TBLPROPERTIES $t ('status')") + + val expected = Seq(Row("status", status)) + + assert(expected === properties.collect()) + } + } + + test("SHOW TBLPROPERTIES(key): v2 table, key not found") { + val t = "testcat.ns1.ns2.tbl" + withTable(t) { + val nonExistingKey = "nonExistingKey" + spark.sql(s"CREATE TABLE $t (id bigint, data string) USING foo " + + s"TBLPROPERTIES ('user'='andrew', 'status'='new')") + + val properties = sql(s"SHOW TBLPROPERTIES $t ('$nonExistingKey')") + + val expected = Seq(Row(nonExistingKey, s"Table $t does not have property: $nonExistingKey")) + + assert(expected === properties.collect()) + } 
+ } + + test("DESCRIBE FUNCTION: only support session catalog") { + val e = intercept[AnalysisException] { + sql("DESCRIBE FUNCTION testcat.ns1.ns2.fun") + } + assert(e.message.contains("DESCRIBE FUNCTION is only supported in v1 catalog")) + + val e1 = intercept[AnalysisException] { + sql("DESCRIBE FUNCTION default.ns1.ns2.fun") + } + assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + } + + test("SHOW FUNCTIONS not valid v1 namespace") { + val function = "testcat.ns1.ns2.fun" + + val e = intercept[AnalysisException] { + sql(s"SHOW FUNCTIONS LIKE $function") + } + assert(e.message.contains("SHOW FUNCTIONS is only supported in v1 catalog")) + } + + test("DROP FUNCTION: only support session catalog") { + val e = intercept[AnalysisException] { + sql("DROP FUNCTION testcat.ns1.ns2.fun") + } + assert(e.message.contains("DROP FUNCTION is only supported in v1 catalog")) + + val e1 = intercept[AnalysisException] { + sql("DESCRIBE FUNCTION default.ns1.ns2.fun") + } + assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + } + + test("CREATE FUNCTION: only support session catalog") { + val e = intercept[AnalysisException] { + sql("CREATE FUNCTION testcat.ns1.ns2.fun as 'f'") + } + assert(e.message.contains("CREATE FUNCTION is only supported in v1 catalog")) + + val e1 = intercept[AnalysisException] { + sql("CREATE FUNCTION default.ns1.ns2.fun as 'f'") + } + assert(e1.message.contains("Unsupported function name 'default.ns1.ns2.fun'")) + } + + test("global temp view should not be masked by v2 catalog") { + val globalTempDB = spark.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + spark.conf.set(s"spark.sql.catalog.$globalTempDB", classOf[InMemoryTableCatalog].getName) + + try { + sql("create global temp view v as select 1") + sql(s"alter view $globalTempDB.v rename to v2") + checkAnswer(spark.table(s"$globalTempDB.v2"), Row(1)) + sql(s"drop view $globalTempDB.v2") + } finally { + 
spark.sharedState.globalTempViewManager.clear() + } + } + + test("SPARK-30104: global temp db is used as a table name under v2 catalog") { + val globalTempDB = spark.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + val t = s"testcat.$globalTempDB" + withTable(t) { + sql(s"CREATE TABLE $t (id bigint, data string) USING foo") + sql("USE testcat") + // The following should not throw AnalysisException, but should use `testcat.$globalTempDB`. + sql(s"DESCRIBE TABLE $globalTempDB") + } + } + + test("SPARK-30104: v2 catalog named global_temp will be masked") { + val globalTempDB = spark.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + spark.conf.set(s"spark.sql.catalog.$globalTempDB", classOf[InMemoryTableCatalog].getName) + + val e = intercept[AnalysisException] { + // Since the following multi-part name starts with `globalTempDB`, it is resolved to + // the session catalog, not the `global_temp` v2 catalog. + sql(s"CREATE TABLE $globalTempDB.ns1.ns2.tbl (id bigint, data string) USING json") + } + assert(e.message.contains("global_temp.ns1.ns2.tbl is not a valid TableIdentifier")) + } + + test("table name same as catalog can be used") { + withTable("testcat.testcat") { + sql(s"CREATE TABLE testcat.testcat (id bigint, data string) USING foo") + sql("USE testcat") + // The following should not throw AnalysisException. + sql(s"DESCRIBE TABLE testcat") + } + } + + test("SPARK-30001: session catalog name can be specified in SQL statements") { + // unset this config to use the default v2 session catalog. 
+ spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + + withTable("t") { + sql("CREATE TABLE t USING json AS SELECT 1 AS i") + checkAnswer(sql("select * from t"), Row(1)) + checkAnswer(sql("select * from spark_catalog.t"), Row(1)) + checkAnswer(sql("select * from spark_catalog.default.t"), Row(1)) + } + } + + test("SPARK-30259: session catalog can be specified in CREATE TABLE AS SELECT command") { + withTable("tbl") { + val ident = Identifier.of(Array(), "tbl") + sql("CREATE TABLE spark_catalog.tbl USING json AS SELECT 1 AS i") + assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) + } + } + + test("SPARK-30259: session catalog can be specified in CREATE TABLE command") { + withTable("tbl") { + val ident = Identifier.of(Array(), "tbl") + sql("CREATE TABLE spark_catalog.tbl (col string) USING json") + assert(catalog("spark_catalog").asTableCatalog.tableExists(ident) === true) + } + } + + test("SPARK-30094: current namespace is used during table resolution") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + + withTable("spark_catalog.t", "testcat.ns.t") { + sql("CREATE TABLE t USING parquet AS SELECT 1") + sql("CREATE TABLE testcat.ns.t USING parquet AS SELECT 2") + + checkAnswer(sql("SELECT * FROM t"), Row(1)) + + sql("USE testcat.ns") + checkAnswer(sql("SELECT * FROM t"), Row(2)) + } + } + + test("SPARK-30284: CREATE VIEW should track the current catalog and namespace") { + // unset this config to use the default v2 session catalog. 
+ spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + val sessionCatalogName = CatalogManager.SESSION_CATALOG_NAME + + sql("USE testcat.ns1.ns2") + sql("CREATE TABLE t USING foo AS SELECT 1 col") + checkAnswer(spark.table("t"), Row(1)) + + withTempView("t") { + spark.range(10).createTempView("t") + withView(s"$sessionCatalogName.v") { + val e = intercept[AnalysisException] { + sql(s"CREATE VIEW $sessionCatalogName.v AS SELECT * FROM t") + } + assert(e.message.contains("referencing a temporary view")) + } + } + + withTempView("t") { + withView(s"$sessionCatalogName.v") { + sql(s"CREATE VIEW $sessionCatalogName.v AS SELECT t1.col FROM t t1 JOIN ns1.ns2.t t2") + sql(s"USE $sessionCatalogName") + // The view should read data from table `testcat.ns1.ns2.t` not the temp view. + spark.range(10).createTempView("t") + checkAnswer(spark.table("v"), Row(1)) + } + } + } + + test("COMMENT ON NAMESPACE") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // Session catalog is used. + sql("CREATE NAMESPACE ns") + checkNamespaceComment("ns", "minor revision") + checkNamespaceComment("ns", null) + checkNamespaceComment("ns", "NULL") + intercept[AnalysisException](sql("COMMENT ON NAMESPACE abc IS NULL")) + + // V2 non-session catalog is used. 
+ sql("CREATE NAMESPACE testcat.ns1") + checkNamespaceComment("testcat.ns1", "minor revision") + checkNamespaceComment("testcat.ns1", null) + checkNamespaceComment("testcat.ns1", "NULL") + intercept[AnalysisException](sql("COMMENT ON NAMESPACE testcat.abc IS NULL")) + } + + private def checkNamespaceComment(namespace: String, comment: String): Unit = { + sql(s"COMMENT ON NAMESPACE $namespace IS " + + Option(comment).map("'" + _ + "'").getOrElse("NULL")) + val expectedComment = Option(comment).getOrElse("") + assert(sql(s"DESC NAMESPACE extended $namespace").toDF("k", "v") + .where(s"k='${SupportsNamespaces.PROP_COMMENT.capitalize}'") + .head().getString(1) === expectedComment) + } + + test("COMMENT ON TABLE") { + // unset this config to use the default v2 session catalog. + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + // Session catalog is used. + withTable("t") { + sql("CREATE TABLE t(k int) USING json") + checkTableComment("t", "minor revision") + checkTableComment("t", null) + checkTableComment("t", "NULL") + } + intercept[AnalysisException](sql("COMMENT ON TABLE abc IS NULL")) + + // V2 non-session catalog is used. 
+ withTable("testcat.ns1.ns2.t") { + sql("CREATE TABLE testcat.ns1.ns2.t(k int) USING foo") + checkTableComment("testcat.ns1.ns2.t", "minor revision") + checkTableComment("testcat.ns1.ns2.t", null) + checkTableComment("testcat.ns1.ns2.t", "NULL") + } + intercept[AnalysisException](sql("COMMENT ON TABLE testcat.abc IS NULL")) + + val globalTempDB = spark.sessionState.conf.getConf(StaticSQLConf.GLOBAL_TEMP_DATABASE) + spark.conf.set(s"spark.sql.catalog.$globalTempDB", classOf[InMemoryTableCatalog].getName) + withTempView("v") { + sql("create global temp view v as select 1") + val e = intercept[AnalysisException](sql("COMMENT ON TABLE global_temp.v IS NULL")) + assert(e.getMessage.contains("global_temp.v is a temp view not table.")) + } + } + + private def checkTableComment(tableName: String, comment: String): Unit = { + sql(s"COMMENT ON TABLE $tableName IS " + Option(comment).map("'" + _ + "'").getOrElse("NULL")) + val expectedComment = Option(comment).getOrElse("") + assert(sql(s"DESC extended $tableName").toDF("k", "v", "c") + .where(s"k='${TableCatalog.PROP_COMMENT.capitalize}'") + .head().getString(1) === expectedComment) + } + + private def testV1Command(sqlCommand: String, sqlParams: String): Unit = { + val e = intercept[AnalysisException] { + sql(s"$sqlCommand $sqlParams") + } + assert(e.message.contains(s"$sqlCommand is only supported with v1 tables")) + } + + private def assertAnalysisError(sqlStatement: String, expectedError: String): Unit = { + val errMsg = intercept[AnalysisException] { + sql(sqlStatement) + }.getMessage + assert(errMsg.contains(expectedError)) + } +} + + +/** Used as a V2 DataSource for V2SessionCatalog DDL */ +class FakeV2Provider extends SimpleTableProvider { + override def getTable(options: CaseInsensitiveStringMap): Table = { + throw new UnsupportedOperationException("Unnecessary for DDL tests") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala similarity index 89% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala index 8f7dbe8d13c39..2d8761f872da7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2Suite.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import java.io.File import java.util @@ -23,25 +23,28 @@ import java.util.OptionalLong import scala.collection.JavaConverters._ -import test.org.apache.spark.sql.sources.v2._ +import test.org.apache.spark.sql.connector._ import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2Relation} +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.read._ +import org.apache.spark.sql.connector.read.partitioning.{ClusteredDistribution, Distribution, Partitioning} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV2Relation, DataSourceV2ScanRelation} import org.apache.spark.sql.execution.exchange.{Exchange, ShuffleExchangeExec} import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector import org.apache.spark.sql.functions._ import org.apache.spark.sql.sources.{Filter, GreaterThan} -import org.apache.spark.sql.sources.v2.TableCapability._ -import 
org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.partitioning.{ClusteredDistribution, Distribution, Partitioning} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.sql.vectorized.ColumnarBatch -class DataSourceV2Suite extends QueryTest with SharedSparkSession { +class DataSourceV2Suite extends QueryTest with SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ private def getBatch(query: DataFrame): AdvancedBatch = { @@ -163,25 +166,25 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { val groupByColA = df.groupBy('i).agg(sum('j)) checkAnswer(groupByColA, Seq(Row(1, 8), Row(2, 6), Row(3, 6), Row(4, 4))) - assert(groupByColA.queryExecution.executedPlan.collectFirst { + assert(collectFirst(groupByColA.queryExecution.executedPlan) { case e: ShuffleExchangeExec => e }.isEmpty) val groupByColAB = df.groupBy('i, 'j).agg(count("*")) checkAnswer(groupByColAB, Seq(Row(1, 4, 2), Row(2, 6, 1), Row(3, 6, 1), Row(4, 2, 2))) - assert(groupByColAB.queryExecution.executedPlan.collectFirst { + assert(collectFirst(groupByColAB.queryExecution.executedPlan) { case e: ShuffleExchangeExec => e }.isEmpty) val groupByColB = df.groupBy('j).agg(sum('i)) checkAnswer(groupByColB, Seq(Row(2, 8), Row(4, 2), Row(6, 5))) - assert(groupByColB.queryExecution.executedPlan.collectFirst { + assert(collectFirst(groupByColB.queryExecution.executedPlan) { case e: ShuffleExchangeExec => e }.isDefined) val groupByAPlusB = df.groupBy('i + 'j).agg(count("*")) checkAnswer(groupByAPlusB, Seq(Row(5, 2), Row(6, 2), Row(8, 1), Row(9, 1))) - assert(groupByAPlusB.queryExecution.executedPlan.collectFirst { + assert(collectFirst(groupByAPlusB.queryExecution.executedPlan) { case e: ShuffleExchangeExec => e }.isDefined) } @@ -194,7 +197,7 @@ class DataSourceV2Suite extends QueryTest with 
SharedSparkSession { withClue(cls.getName) { val df = spark.read.format(cls.getName).load() val logical = df.queryExecution.optimizedPlan.collect { - case d: DataSourceV2Relation => d + case d: DataSourceV2ScanRelation => d }.head val statics = logical.computeStats() @@ -224,8 +227,12 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { spark.read.format(cls.getName).option("path", path).load(), spark.range(10).select('id, -'id)) - // default save mode is append - spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) + // default save mode is ErrorIfExists + intercept[AnalysisException] { + spark.range(10).select('id as 'i, -'id as 'j).write.format(cls.getName) + .option("path", path).save() + } + spark.range(10).select('id as 'i, -'id as 'j).write.mode("append").format(cls.getName) .option("path", path).save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), @@ -280,7 +287,7 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { val numPartition = 6 spark.range(0, 10, 1, numPartition).select('id as 'i, -'id as 'j).write.format(cls.getName) - .option("path", path).save() + .mode("append").option("path", path).save() checkAnswer( spark.read.format(cls.getName).option("path", path).load(), spark.range(10).select('id, -'id)) @@ -327,7 +334,7 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { test("SPARK-23315: get output from canonicalized data source v2 related plans") { def checkCanonicalizedOutput( df: DataFrame, logicalNumOutput: Int, physicalNumOutput: Int): Unit = { - val logical = df.queryExecution.optimizedPlan.collect { + val logical = df.queryExecution.logical.collect { case d: DataSourceV2Relation => d }.head assert(logical.canonicalized.output.length == logicalNumOutput) @@ -351,7 +358,7 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { .read .option(optionName, false) .format(classOf[DataSourceV2WithSessionConfig].getName).load() - val 
options = df.queryExecution.optimizedPlan.collectFirst { + val options = df.queryExecution.logical.collectFirst { case d: DataSourceV2Relation => d.options }.get assert(options.get(optionName) === "false") @@ -367,7 +374,7 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { val format = classOf[SimpleWritableDataSource].getName val df = Seq((1L, 2L)).toDF("i", "j") - df.write.format(format).option("path", optionPath).save() + df.write.format(format).mode("append").option("path", optionPath).save() assert(!new File(sessionPath).exists) checkAnswer(spark.read.format(format).option("path", optionPath).load(), df) } @@ -380,7 +387,7 @@ class DataSourceV2Suite extends QueryTest with SharedSparkSession { val t2 = spark.read.format(classOf[SimpleDataSourceV2].getName).load() Seq(2, 3).toDF("a").createTempView("t1") val df = t2.where("i < (select max(a) from t1)").select('i) - val subqueries = df.queryExecution.executedPlan.collect { + val subqueries = stripAQEPlan(df.queryExecution.executedPlan).collect { case p => p.subqueries }.flatten assert(subqueries.length == 1) @@ -412,7 +419,7 @@ object SimpleReaderFactory extends PartitionReaderFactory { abstract class SimpleBatchTable extends Table with SupportsRead { - override def schema(): StructType = new StructType().add("i", "int").add("j", "int") + override def schema(): StructType = TestingV2Source.schema override def name(): String = this.getClass.toString @@ -426,12 +433,31 @@ abstract class SimpleScanBuilder extends ScanBuilder override def toBatch: Batch = this - override def readSchema(): StructType = new StructType().add("i", "int").add("j", "int") + override def readSchema(): StructType = TestingV2Source.schema override def createReaderFactory(): PartitionReaderFactory = SimpleReaderFactory } -class SimpleSinglePartitionSource extends TableProvider { +trait TestingV2Source extends TableProvider { + override def inferSchema(options: CaseInsensitiveStringMap): StructType = { + 
TestingV2Source.schema + } + + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String]): Table = { + getTable(new CaseInsensitiveStringMap(properties)) + } + + def getTable(options: CaseInsensitiveStringMap): Table +} + +object TestingV2Source { + val schema = new StructType().add("i", "int").add("j", "int") +} + +class SimpleSinglePartitionSource extends TestingV2Source { class MyScanBuilder extends SimpleScanBuilder { override def planInputPartitions(): Array[InputPartition] = { @@ -446,9 +472,10 @@ class SimpleSinglePartitionSource extends TableProvider { } } + // This class is used by pyspark tests. If this class is modified/moved, make sure pyspark // tests still pass. -class SimpleDataSourceV2 extends TableProvider { +class SimpleDataSourceV2 extends TestingV2Source { class MyScanBuilder extends SimpleScanBuilder { override def planInputPartitions(): Array[InputPartition] = { @@ -463,7 +490,7 @@ class SimpleDataSourceV2 extends TableProvider { } } -class AdvancedDataSourceV2 extends TableProvider { +class AdvancedDataSourceV2 extends TestingV2Source { override def getTable(options: CaseInsensitiveStringMap): Table = new SimpleBatchTable { override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { @@ -475,7 +502,7 @@ class AdvancedDataSourceV2 extends TableProvider { class AdvancedScanBuilder extends ScanBuilder with Scan with SupportsPushDownFilters with SupportsPushDownRequiredColumns { - var requiredSchema = new StructType().add("i", "int").add("j", "int") + var requiredSchema = TestingV2Source.schema var filters = Array.empty[Filter] override def pruneColumns(requiredSchema: StructType): Unit = { @@ -561,11 +588,16 @@ class SchemaRequiredDataSource extends TableProvider { override def readSchema(): StructType = schema } - override def getTable(options: CaseInsensitiveStringMap): Table = { + override def supportsExternalMetadata(): Boolean = true + + override def 
inferSchema(options: CaseInsensitiveStringMap): StructType = { throw new IllegalArgumentException("requires a user-supplied schema") } - override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { + override def getTable( + schema: StructType, + partitioning: Array[Transform], + properties: util.Map[String, String]): Table = { val userGivenSchema = schema new SimpleBatchTable { override def schema(): StructType = userGivenSchema @@ -577,7 +609,7 @@ class SchemaRequiredDataSource extends TableProvider { } } -class ColumnarDataSourceV2 extends TableProvider { +class ColumnarDataSourceV2 extends TestingV2Source { class MyScanBuilder extends SimpleScanBuilder { @@ -642,7 +674,7 @@ object ColumnarReaderFactory extends PartitionReaderFactory { } } -class PartitionAwareDataSource extends TableProvider { +class PartitionAwareDataSource extends TestingV2Source { class MyScanBuilder extends SimpleScanBuilder with SupportsReportPartitioning{ @@ -710,7 +742,7 @@ class SimpleWriteOnlyDataSource extends SimpleWritableDataSource { } } -class ReportStatisticsDataSource extends TableProvider { +class ReportStatisticsDataSource extends SimpleWritableDataSource { class MyScanBuilder extends SimpleScanBuilder with SupportsReportStatistics { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala similarity index 95% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala index 0b1e3b5fb076d..01fcced5b12a8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2UtilsSuite.scala @@ -15,9 +15,10 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.connector.catalog.SessionConfigSupport import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils import org.apache.spark.sql.internal.SQLConf diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/FileDataSourceV2FallBackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala similarity index 93% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/FileDataSourceV2FallBackSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala index 26f941244f5cc..b0da2eb697f36 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/FileDataSourceV2FallBackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/FileDataSourceV2FallBackSuite.scala @@ -14,21 +14,23 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer +import org.apache.spark.SparkConf import org.apache.spark.sql.{AnalysisException, QueryTest} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.read.ScanBuilder +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.{FileSourceScanExec, QueryExecution} import org.apache.spark.sql.execution.datasources.{FileFormat, InsertIntoHadoopFsRelationCommand} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2 import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.ScanBuilder -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.{CaseInsensitiveStringMap, QueryExecutionListener} @@ -73,7 +75,7 @@ class DummyWriteOnlyFileTable extends Table with SupportsWrite { override def schema(): StructType = StructType(Nil) - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = throw new AnalysisException("Dummy file writer") override def capabilities(): java.util.Set[TableCapability] = @@ -85,6 +87,8 @@ class FileDataSourceV2FallBackSuite extends QueryTest with SharedSparkSession { private val dummyReadOnlyFileSourceV2 = classOf[DummyReadOnlyFileDataSourceV2].getName private val dummyWriteOnlyFileSourceV2 = classOf[DummyWriteOnlyFileDataSourceV2].getName + override protected 
def sparkConf: SparkConf = super.sparkConf.set(SQLConf.USE_V1_SOURCE_LIST, "") + test("Fall back to v1 when writing to file with read only FileDataSourceV2") { val df = spark.range(10).toDF() withTempPath { file => @@ -172,7 +176,7 @@ class FileDataSourceV2FallBackSuite extends QueryTest with SharedSparkSession { withTempPath { path => val inputData = spark.range(10) inputData.write.format(format).save(path.getCanonicalPath) - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(commands.length == 1) assert(commands.head._1 == "save") assert(commands.head._2.isInstanceOf[InsertIntoHadoopFsRelationCommand]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/InsertIntoTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala similarity index 99% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/InsertIntoTests.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala index 5b5382e5ca931..0fd6cf1b6746c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/InsertIntoTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import org.scalatest.BeforeAndAfter @@ -175,7 +175,7 @@ abstract class InsertIntoTests( } } -private[v2] trait InsertIntoSQLOnlyTests +trait InsertIntoSQLOnlyTests extends QueryTest with SharedSparkSession with BeforeAndAfter { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/SimpleWritableDataSource.scala similarity index 87% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/SimpleWritableDataSource.scala index c9d2f1eef24bb..f9306ba28e7f2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/SimpleWritableDataSource.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import java.io.{BufferedReader, InputStreamReader, IOException} import java.util @@ -27,9 +27,11 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.sources.v2.TableCapability._ -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.connector.catalog.{SessionConfigSupport, SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory, ScanBuilder} +import org.apache.spark.sql.connector.write._ +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap import 
org.apache.spark.util.SerializableConfiguration @@ -39,7 +41,7 @@ import org.apache.spark.util.SerializableConfiguration * Each task writes data to `target/_temporary/uniqueId/$jobId-$partitionId-$attemptNumber`. * Each job moves files from `target/_temporary/uniqueId/` to `target`. */ -class SimpleWritableDataSource extends TableProvider with SessionConfigSupport { +class SimpleWritableDataSource extends SimpleTableProvider with SessionConfigSupport { private val tableSchema = new StructType().add("i", "long").add("j", "long") @@ -69,15 +71,11 @@ class SimpleWritableDataSource extends TableProvider with SessionConfigSupport { override def readSchema(): StructType = tableSchema } - class MyWriteBuilder(path: String) extends WriteBuilder with SupportsTruncate { - private var queryId: String = _ + class MyWriteBuilder(path: String, info: LogicalWriteInfo) + extends WriteBuilder with SupportsTruncate { + private val queryId: String = info.queryId() private var needTruncate = false - override def withQueryId(queryId: String): WriteBuilder = { - this.queryId = queryId - this - } - override def truncate(): WriteBuilder = { this.needTruncate = true this @@ -98,7 +96,7 @@ class SimpleWritableDataSource extends TableProvider with SessionConfigSupport { } class MyBatchWrite(queryId: String, path: String, conf: Configuration) extends BatchWrite { - override def createBatchWriterFactory(): DataWriterFactory = { + override def createBatchWriterFactory(info: PhysicalWriteInfo): DataWriterFactory = { SimpleCounter.resetCounter new CSVDataWriterFactory(path, queryId, new SerializableConfiguration(conf)) } @@ -142,8 +140,8 @@ class SimpleWritableDataSource extends TableProvider with SessionConfigSupport { new MyScanBuilder(new Path(path).toUri.toString, conf) } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = { - new MyWriteBuilder(path) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + new MyWriteBuilder(path, info) } 
override def capabilities(): util.Set[TableCapability] = @@ -190,7 +188,7 @@ class CSVReaderFactory(conf: SerializableConfiguration) } } -private[v2] object SimpleCounter { +private[connector] object SimpleCounter { private var count: Int = 0 def increaseCounter: Unit = { @@ -239,4 +237,6 @@ class CSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[InternalRow] fs.delete(file, false) } } + + override def close(): Unit = {} } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala new file mode 100644 index 0000000000000..7bff955b18360 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/SupportsCatalogOptionsSuite.scala @@ -0,0 +1,288 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connector + +import scala.language.implicitConversions +import scala.util.Try + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.sql.{DataFrame, QueryTest, SaveMode} +import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException +import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, OverwriteByExpression} +import org.apache.spark.sql.connector.catalog.{Identifier, SupportsCatalogOptions, TableCatalog} +import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{LongType, StructType} +import org.apache.spark.sql.util.{CaseInsensitiveStringMap, QueryExecutionListener} + +class SupportsCatalogOptionsSuite extends QueryTest with SharedSparkSession with BeforeAndAfter { + + import testImplicits._ + + private val catalogName = "testcat" + private val format = classOf[CatalogSupportingInMemoryTableProvider].getName + + private def catalog(name: String): TableCatalog = { + spark.sessionState.catalogManager.catalog(name).asInstanceOf[TableCatalog] + } + + private implicit def stringToIdentifier(value: String): Identifier = { + Identifier.of(Array.empty, value) + } + + before { + spark.conf.set( + V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName) + spark.conf.set( + s"spark.sql.catalog.$catalogName", classOf[InMemoryTableCatalog].getName) + } + + override def afterEach(): Unit = { + super.afterEach() + Try(catalog(SESSION_CATALOG_NAME).asInstanceOf[InMemoryTableSessionCatalog].clearTables()) + 
catalog(catalogName).listTables(Array.empty).foreach( + catalog(catalogName).dropTable(_)) + spark.conf.unset(V2_SESSION_CATALOG_IMPLEMENTATION.key) + spark.conf.unset(s"spark.sql.catalog.$catalogName") + } + + private def testCreateAndRead( + saveMode: SaveMode, + withCatalogOption: Option[String], + partitionBy: Seq[String]): Unit = { + val df = spark.range(10).withColumn("part", 'id % 5) + val dfw = df.write.format(format).mode(saveMode).option("name", "t1") + withCatalogOption.foreach(cName => dfw.option("catalog", cName)) + dfw.partitionBy(partitionBy: _*).save() + + val table = catalog(withCatalogOption.getOrElse(SESSION_CATALOG_NAME)).loadTable("t1") + val namespace = withCatalogOption.getOrElse("default") + assert(table.name() === s"$namespace.t1", "Table identifier was wrong") + assert(table.partitioning().length === partitionBy.length, "Partitioning did not match") + if (partitionBy.nonEmpty) { + table.partitioning.head match { + case IdentityTransform(FieldReference(field)) => + assert(field === Seq(partitionBy.head), "Partitioning column did not match") + case otherTransform => + fail(s"Unexpected partitioning ${otherTransform.describe()} received") + } + } + assert(table.partitioning().map(_.references().head.fieldNames().head) === partitionBy, + "Partitioning was incorrect") + assert(table.schema() === df.schema.asNullable, "Schema did not match") + + checkAnswer(load("t1", withCatalogOption), df.toDF()) + } + + test(s"save works with ErrorIfExists - no table, no partitioning, session catalog") { + testCreateAndRead(SaveMode.ErrorIfExists, None, Nil) + } + + test(s"save works with ErrorIfExists - no table, with partitioning, session catalog") { + testCreateAndRead(SaveMode.ErrorIfExists, None, Seq("part")) + } + + test(s"save works with Ignore - no table, no partitioning, testcat catalog") { + testCreateAndRead(SaveMode.Ignore, Some(catalogName), Nil) + } + + test(s"save works with Ignore - no table, with partitioning, testcat catalog") { + 
testCreateAndRead(SaveMode.Ignore, Some(catalogName), Seq("part")) + } + + test("save fails with ErrorIfExists if table exists - session catalog") { + sql(s"create table t1 (id bigint) using $format") + val df = spark.range(10) + intercept[TableAlreadyExistsException] { + val dfw = df.write.format(format).option("name", "t1") + dfw.save() + } + } + + test("save fails with ErrorIfExists if table exists - testcat catalog") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + val df = spark.range(10) + intercept[TableAlreadyExistsException] { + val dfw = df.write.format(format).option("name", "t1").option("catalog", catalogName) + dfw.save() + } + } + + test("Ignore mode if table exists - session catalog") { + sql(s"create table t1 (id bigint) using $format") + val df = spark.range(10).withColumn("part", 'id % 5) + val dfw = df.write.format(format).mode(SaveMode.Ignore).option("name", "t1") + dfw.save() + + val table = catalog(SESSION_CATALOG_NAME).loadTable("t1") + assert(table.partitioning().isEmpty, "Partitioning should be empty") + assert(table.schema() === new StructType().add("id", LongType), "Schema did not match") + assert(load("t1", None).count() === 0) + } + + test("Ignore mode if table exists - testcat catalog") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + val df = spark.range(10).withColumn("part", 'id % 5) + val dfw = df.write.format(format).mode(SaveMode.Ignore).option("name", "t1") + dfw.option("catalog", catalogName).save() + + val table = catalog(catalogName).loadTable("t1") + assert(table.partitioning().isEmpty, "Partitioning should be empty") + assert(table.schema() === new StructType().add("id", LongType), "Schema did not match") + assert(load("t1", Some(catalogName)).count() === 0) + } + + test("append and overwrite modes - session catalog") { + sql(s"create table t1 (id bigint) using $format") + val df = spark.range(10) + df.write.format(format).option("name", "t1").mode(SaveMode.Append).save() + + 
checkAnswer(load("t1", None), df.toDF()) + + val df2 = spark.range(10, 20) + df2.write.format(format).option("name", "t1").mode(SaveMode.Overwrite).save() + + checkAnswer(load("t1", None), df2.toDF()) + } + + test("append and overwrite modes - testcat catalog") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + val df = spark.range(10) + df.write.format(format).option("name", "t1").option("catalog", catalogName) + .mode(SaveMode.Append).save() + + checkAnswer(load("t1", Some(catalogName)), df.toDF()) + + val df2 = spark.range(10, 20) + df2.write.format(format).option("name", "t1").option("catalog", catalogName) + .mode(SaveMode.Overwrite).save() + + checkAnswer(load("t1", Some(catalogName)), df2.toDF()) + } + + test("fail on user specified schema when reading - session catalog") { + sql(s"create table t1 (id bigint) using $format") + val e = intercept[IllegalArgumentException] { + spark.read.format(format).option("name", "t1").schema("id bigint").load() + } + assert(e.getMessage.contains("not support user specified schema")) + } + + test("fail on user specified schema when reading - testcat catalog") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + val e = intercept[IllegalArgumentException] { + spark.read.format(format).option("name", "t1").option("catalog", catalogName) + .schema("id bigint").load() + } + assert(e.getMessage.contains("not support user specified schema")) + } + + test("DataFrameReader creates v2Relation with identifiers") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + val df = load("t1", Some(catalogName)) + checkV2Identifiers(df.logicalPlan) + } + + test("DataFrameWriter creates v2Relation with identifiers") { + sql(s"create table $catalogName.t1 (id bigint) using $format") + + var plan: LogicalPlan = null + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = { + plan = qe.analyzed + } + override def 
onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} + } + + spark.listenerManager.register(listener) + + try { + // Test append + save("t1", SaveMode.Append, Some(catalogName)) + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[AppendData]) + val appendRelation = plan.asInstanceOf[AppendData].table + checkV2Identifiers(appendRelation) + + // Test overwrite + save("t1", SaveMode.Overwrite, Some(catalogName)) + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[OverwriteByExpression]) + val overwriteRelation = plan.asInstanceOf[OverwriteByExpression].table + checkV2Identifiers(overwriteRelation) + + // Test insert + spark.range(10).write.format(format).insertInto(s"$catalogName.t1") + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[AppendData]) + val insertRelation = plan.asInstanceOf[AppendData].table + checkV2Identifiers(insertRelation) + + // Test saveAsTable append + spark.range(10).write.format(format).mode(SaveMode.Append).saveAsTable(s"$catalogName.t1") + sparkContext.listenerBus.waitUntilEmpty() + assert(plan.isInstanceOf[AppendData]) + val saveAsTableRelation = plan.asInstanceOf[AppendData].table + checkV2Identifiers(saveAsTableRelation) + } finally { + spark.listenerManager.unregister(listener) + } + } + + private def checkV2Identifiers( + plan: LogicalPlan, + identifier: String = "t1", + catalogPlugin: TableCatalog = catalog(catalogName)): Unit = { + assert(plan.isInstanceOf[DataSourceV2Relation]) + val v2 = plan.asInstanceOf[DataSourceV2Relation] + assert(v2.identifier.exists(_.name() == identifier)) + assert(v2.catalog.exists(_ == catalogPlugin)) + } + + private def load(name: String, catalogOpt: Option[String]): DataFrame = { + val dfr = spark.read.format(format).option("name", name) + catalogOpt.foreach(cName => dfr.option("catalog", cName)) + dfr.load() + } + + private def save(name: String, mode: SaveMode, catalogOpt: Option[String]): Unit = { + val df = 
spark.range(10).write.format(format).option("name", name) + catalogOpt.foreach(cName => df.option("catalog", cName)) + df.mode(mode).save() + } +} + +class CatalogSupportingInMemoryTableProvider + extends FakeV2Provider + with SupportsCatalogOptions { + + override def extractIdentifier(options: CaseInsensitiveStringMap): Identifier = { + val name = options.get("name") + assert(name != null, "The name should be provided for this table") + Identifier.of(Array.empty, name) + } + + override def extractCatalog(options: CaseInsensitiveStringMap): String = { + options.get("catalog") + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/TableCapabilityCheckSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala similarity index 79% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/TableCapabilityCheckSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala index ab47836001704..23e4c293cbc28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/TableCapabilityCheckSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/TableCapabilityCheckSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import java.util @@ -24,21 +24,23 @@ import scala.collection.JavaConverters._ import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.analysis.{AnalysisSuite, NamedRelation} import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Literal} -import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LeafNode, OverwriteByExpression, OverwritePartitionsDynamic, Union} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.connector.catalog.{CatalogPlugin, Identifier, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.TableCapability._ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, TableCapabilityCheck} import org.apache.spark.sql.execution.streaming.{Offset, Source, StreamingRelation, StreamingRelationV2} import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.sources.v2.TableCapability.{BATCH_WRITE, CONTINUOUS_READ, MICRO_BATCH_READ, OVERWRITE_BY_FILTER, OVERWRITE_DYNAMIC, TRUNCATE} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{LongType, StringType, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { + private val emptyMap = CaseInsensitiveStringMap.empty private def createStreamingRelation(table: Table, v1Relation: Option[StreamingRelation]) = { StreamingRelationV2( - TestTableProvider, + new FakeV2Provider, "fake", table, CaseInsensitiveStringMap.empty(), @@ -52,9 +54,9 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { test("batch scan: check missing capabilities") { val e = intercept[AnalysisException] { - 
TableCapabilityCheck.apply(DataSourceV2Relation.create( - CapabilityTable(), - CaseInsensitiveStringMap.empty)) + TableCapabilityCheck.apply( + DataSourceV2Relation.create(CapabilityTable(), None, None, emptyMap) + ) } assert(e.message.contains("does not support batch scan")) } @@ -87,7 +89,8 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { test("AppendData: check missing capabilities") { val plan = AppendData.byName( - DataSourceV2Relation.create(CapabilityTable(), CaseInsensitiveStringMap.empty), TestRelation) + DataSourceV2Relation.create(CapabilityTable(), None, None, emptyMap), + TestRelation) val exc = intercept[AnalysisException]{ TableCapabilityCheck.apply(plan) @@ -97,21 +100,25 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { } test("AppendData: check correct capabilities") { - val plan = AppendData.byName( - DataSourceV2Relation.create(CapabilityTable(BATCH_WRITE), CaseInsensitiveStringMap.empty), - TestRelation) + Seq(BATCH_WRITE, V1_BATCH_WRITE).foreach { write => + val plan = AppendData.byName( + DataSourceV2Relation.create(CapabilityTable(write), None, None, emptyMap), + TestRelation) - TableCapabilityCheck.apply(plan) + TableCapabilityCheck.apply(plan) + } } test("Truncate: check missing capabilities") { Seq(CapabilityTable(), CapabilityTable(BATCH_WRITE), + CapabilityTable(V1_BATCH_WRITE), CapabilityTable(TRUNCATE), CapabilityTable(OVERWRITE_BY_FILTER)).foreach { table => val plan = OverwriteByExpression.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation, + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation, Literal(true)) val exc = intercept[AnalysisException]{ @@ -124,10 +131,13 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { test("Truncate: check correct capabilities") { Seq(CapabilityTable(BATCH_WRITE, TRUNCATE), - CapabilityTable(BATCH_WRITE, OVERWRITE_BY_FILTER)).foreach { table 
=> + CapabilityTable(V1_BATCH_WRITE, TRUNCATE), + CapabilityTable(BATCH_WRITE, OVERWRITE_BY_FILTER), + CapabilityTable(V1_BATCH_WRITE, OVERWRITE_BY_FILTER)).foreach { table => val plan = OverwriteByExpression.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation, + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation, Literal(true)) TableCapabilityCheck.apply(plan) @@ -136,11 +146,13 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { test("OverwriteByExpression: check missing capabilities") { Seq(CapabilityTable(), + CapabilityTable(V1_BATCH_WRITE), CapabilityTable(BATCH_WRITE), CapabilityTable(OVERWRITE_BY_FILTER)).foreach { table => val plan = OverwriteByExpression.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation, + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation, EqualTo(AttributeReference("x", LongType)(), Literal(5))) val exc = intercept[AnalysisException]{ @@ -152,12 +164,15 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { } test("OverwriteByExpression: check correct capabilities") { - val table = CapabilityTable(BATCH_WRITE, OVERWRITE_BY_FILTER) - val plan = OverwriteByExpression.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation, - EqualTo(AttributeReference("x", LongType)(), Literal(5))) + Seq(BATCH_WRITE, V1_BATCH_WRITE).foreach { write => + val table = CapabilityTable(write, OVERWRITE_BY_FILTER) + val plan = OverwriteByExpression.byName( + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation, + EqualTo(AttributeReference("x", LongType)(), Literal(5))) - TableCapabilityCheck.apply(plan) + TableCapabilityCheck.apply(plan) + } } test("OverwritePartitionsDynamic: check missing capabilities") { @@ -166,7 +181,8 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { 
CapabilityTable(OVERWRITE_DYNAMIC)).foreach { table => val plan = OverwritePartitionsDynamic.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation) + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation) val exc = intercept[AnalysisException] { TableCapabilityCheck.apply(plan) @@ -179,7 +195,8 @@ class TableCapabilityCheckSuite extends AnalysisSuite with SharedSparkSession { test("OverwritePartitionsDynamic: check correct capabilities") { val table = CapabilityTable(BATCH_WRITE, OVERWRITE_DYNAMIC) val plan = OverwritePartitionsDynamic.byName( - DataSourceV2Relation.create(table, CaseInsensitiveStringMap.empty), TestRelation) + DataSourceV2Relation.create(table, None, None, emptyMap), + TestRelation) TableCapabilityCheck.apply(plan) } @@ -194,12 +211,6 @@ private case object TestRelation extends LeafNode with NamedRelation { override def output: Seq[AttributeReference] = TableCapabilityCheckSuite.schema.toAttributes } -private object TestTableProvider extends TableProvider { - override def getTable(options: CaseInsensitiveStringMap): Table = { - throw new UnsupportedOperationException - } -} - private case class CapabilityTable(_capabilities: TableCapability*) extends Table { override def name(): String = "capability_test_table" override def schema(): StructType = TableCapabilityCheckSuite.schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/utils/TestV2SessionCatalogBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala similarity index 86% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/utils/TestV2SessionCatalogBase.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala index 28ce6a94b253a..3f6ac0b7f8d3c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/utils/TestV2SessionCatalogBase.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/connector/TestV2SessionCatalogBase.scala @@ -15,17 +15,15 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.utils +package org.apache.spark.sql.connector import java.util import java.util.concurrent.ConcurrentHashMap import scala.collection.JavaConverters._ -import org.apache.spark.sql.catalog.v2.Identifier -import org.apache.spark.sql.catalog.v2.expressions.Transform -import org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog -import org.apache.spark.sql.sources.v2.Table +import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, Table} +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.types.StructType /** @@ -33,7 +31,7 @@ import org.apache.spark.sql.types.StructType * for testing DDL as well as write operations (through df.write.saveAsTable, df.write.insertInto * and SQL). */ -private[v2] trait TestV2SessionCatalogBase[T <: Table] extends V2SessionCatalog { +private[connector] trait TestV2SessionCatalogBase[T <: Table] extends DelegatingCatalogExtension { protected val tables: util.Map[Identifier, T] = new ConcurrentHashMap[Identifier, T]() @@ -76,6 +74,11 @@ private[v2] trait TestV2SessionCatalogBase[T <: Table] extends V2SessionCatalog t } + override def dropTable(ident: Identifier): Boolean = { + tables.remove(fullIdentifier(ident)) + super.dropTable(ident) + } + def clearTables(): Unit = { assert(!tables.isEmpty, "Tables were empty, maybe didn't use the session catalog code path?") tables.keySet().asScala.foreach(super.dropTable) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala new file mode 100644 index 0000000000000..74f2ca14234d2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1ReadFallbackSuite.scala @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connector + +import java.util + +import scala.collection.JavaConverters._ + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, QueryTest, Row, SparkSession, SQLContext} +import org.apache.spark.sql.connector.catalog.{Identifier, SupportsRead, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, SupportsPushDownFilters, SupportsPushDownRequiredColumns, V1Scan} +import org.apache.spark.sql.execution.RowDataSourceScanExec +import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.sources.{BaseRelation, Filter, GreaterThan, TableScan} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +abstract class V1ReadFallbackSuite extends QueryTest with SharedSparkSession { + protected def baseTableScan(): DataFrame + + test("full scan") { + val df = baseTableScan() + val v1Scan = df.queryExecution.executedPlan.collect { + case s: RowDataSourceScanExec => s + } + assert(v1Scan.length == 1) + checkAnswer(df, 
Seq(Row(1, 10), Row(2, 20), Row(3, 30))) + } + + test("column pruning") { + val df = baseTableScan().select("i") + val v1Scan = df.queryExecution.executedPlan.collect { + case s: RowDataSourceScanExec => s + } + assert(v1Scan.length == 1) + assert(v1Scan.head.output.map(_.name) == Seq("i")) + checkAnswer(df, Seq(Row(1), Row(2), Row(3))) + } + + test("filter push down") { + val df = baseTableScan().filter("i > 1 and j < 30") + val v1Scan = df.queryExecution.executedPlan.collect { + case s: RowDataSourceScanExec => s + } + assert(v1Scan.length == 1) + // `j < 30` can't be pushed. + assert(v1Scan.head.handledFilters.size == 1) + checkAnswer(df, Seq(Row(2, 20))) + } + + test("filter push down + column pruning") { + val df = baseTableScan().filter("i > 1").select("i") + val v1Scan = df.queryExecution.executedPlan.collect { + case s: RowDataSourceScanExec => s + } + assert(v1Scan.length == 1) + assert(v1Scan.head.output.map(_.name) == Seq("i")) + assert(v1Scan.head.handledFilters.size == 1) + checkAnswer(df, Seq(Row(2), Row(3))) + } +} + +class V1ReadFallbackWithDataFrameReaderSuite extends V1ReadFallbackSuite { + override protected def baseTableScan(): DataFrame = { + spark.read.format(classOf[V1ReadFallbackTableProvider].getName).load() + } +} + +class V1ReadFallbackWithCatalogSuite extends V1ReadFallbackSuite { + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set("spark.sql.catalog.read_fallback", classOf[V1ReadFallbackCatalog].getName) + sql("CREATE TABLE read_fallback.tbl(i int, j int) USING foo") + } + + override def afterAll(): Unit = { + spark.conf.unset("spark.sql.catalog.read_fallback") + super.afterAll() + } + + override protected def baseTableScan(): DataFrame = { + spark.table("read_fallback.tbl") + } +} + +class V1ReadFallbackCatalog extends BasicInMemoryTableCatalog { + override def createTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = { + // To simplify 
the test implementation, only support fixed schema. + if (schema != V1ReadFallbackCatalog.schema || partitions.nonEmpty) { + throw new UnsupportedOperationException + } + val table = new TableWithV1ReadFallback(ident.toString) + tables.put(ident, table) + table + } +} + +object V1ReadFallbackCatalog { + val schema = new StructType().add("i", "int").add("j", "int") +} + +class V1ReadFallbackTableProvider extends SimpleTableProvider { + override def getTable(options: CaseInsensitiveStringMap): Table = { + new TableWithV1ReadFallback("v1-read-fallback") + } +} + +class TableWithV1ReadFallback(override val name: String) extends Table with SupportsRead { + + override def schema(): StructType = V1ReadFallbackCatalog.schema + + override def capabilities(): util.Set[TableCapability] = { + Set(TableCapability.BATCH_READ).asJava + } + + override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { + new V1ReadFallbackScanBuilder + } + + private class V1ReadFallbackScanBuilder extends ScanBuilder + with SupportsPushDownRequiredColumns with SupportsPushDownFilters { + + private var requiredSchema: StructType = schema() + override def pruneColumns(requiredSchema: StructType): Unit = { + this.requiredSchema = requiredSchema + } + + private var filters: Array[Filter] = Array.empty + override def pushFilters(filters: Array[Filter]): Array[Filter] = { + val (supported, unsupported) = filters.partition { + case GreaterThan("i", _: Int) => true + case _ => false + } + this.filters = supported + unsupported + } + override def pushedFilters(): Array[Filter] = filters + + override def build(): Scan = new V1ReadFallbackScan(requiredSchema, filters) + } + + private class V1ReadFallbackScan( + requiredSchema: StructType, + filters: Array[Filter]) extends V1Scan { + override def readSchema(): StructType = requiredSchema + + override def toV1TableScan[T <: BaseRelation with TableScan](context: SQLContext): T = { + new V1TableScan(context, requiredSchema, 
filters).asInstanceOf[T] + } + } +} + +class V1TableScan( + context: SQLContext, + requiredSchema: StructType, + filters: Array[Filter]) extends BaseRelation with TableScan { + override def sqlContext: SQLContext = context + override def schema: StructType = requiredSchema + override def buildScan(): RDD[Row] = { + val lowerBound = if (filters.isEmpty) { + 0 + } else { + filters.collect { case GreaterThan("i", v: Int) => v }.max + } + val data = Seq(Row(1, 10), Row(2, 20), Row(3, 30)).filter(_.getInt(0) > lowerBound) + val result = if (requiredSchema.length == 2) { + data + } else if (requiredSchema.map(_.name) == Seq("i")) { + data.map(row => Row(row.getInt(0))) + } else if (requiredSchema.map(_.name) == Seq("j")) { + data.map(row => Row(row.getInt(1))) + } else { + throw new UnsupportedOperationException + } + + SparkSession.active.sparkContext.makeRDD(result) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/V1WriteFallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala similarity index 54% rename from sql/core/src/test/scala/org/apache/spark/sql/sources/v2/V1WriteFallbackSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala index 9002775bce211..10ed2048dbf61 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/V1WriteFallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V1WriteFallbackSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2 +package org.apache.spark.sql.connector import java.util @@ -24,15 +24,15 @@ import scala.collection.mutable import org.scalatest.BeforeAndAfter -import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, SaveMode, SparkSession} -import org.apache.spark.sql.catalog.v2.expressions.{FieldReference, IdentityTransform, Transform} -import org.apache.spark.sql.connector.InMemoryTable -import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode} -import org.apache.spark.sql.sources.{DataSourceRegister, Filter, InsertableRelation} -import org.apache.spark.sql.sources.v2.utils.TestV2SessionCatalogBase -import org.apache.spark.sql.sources.v2.writer.{SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder} +import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability} +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform, Transform} +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, LogicalWriteInfoImpl, SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder} +import org.apache.spark.sql.execution.datasources.DataSourceUtils +import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.sources._ import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap class V1WriteFallbackSuite extends QueryTest with SharedSparkSession with BeforeAndAfter { @@ -54,7 +54,11 @@ class V1WriteFallbackSuite extends QueryTest with SharedSparkSession with Before test("append fallback") { val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") df.write.mode("append").option("name", 
"t1").format(v2Format).save() + checkAnswer(InMemoryV1Provider.getTableData(spark, "t1"), df) + assert(InMemoryV1Provider.tables("t1").schema === df.schema.asNullable) + assert(InMemoryV1Provider.tables("t1").partitioning.isEmpty) + df.write.mode("append").option("name", "t1").format(v2Format).save() checkAnswer(InMemoryV1Provider.getTableData(spark, "t1"), df.union(df)) } @@ -67,6 +71,59 @@ class V1WriteFallbackSuite extends QueryTest with SharedSparkSession with Before df2.write.mode("overwrite").option("name", "t1").format(v2Format).save() checkAnswer(InMemoryV1Provider.getTableData(spark, "t1"), df2) } + + SaveMode.values().foreach { mode => + test(s"save: new table creations with partitioning for table - mode: $mode") { + val format = classOf[InMemoryV1Provider].getName + val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + df.write.mode(mode).option("name", "t1").format(format).partitionBy("a").save() + + checkAnswer(InMemoryV1Provider.getTableData(spark, "t1"), df) + assert(InMemoryV1Provider.tables("t1").schema === df.schema.asNullable) + assert(InMemoryV1Provider.tables("t1").partitioning.sameElements( + Array(IdentityTransform(FieldReference(Seq("a")))))) + } + } + + test("save: default mode is ErrorIfExists") { + val format = classOf[InMemoryV1Provider].getName + val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + + df.write.option("name", "t1").format(format).partitionBy("a").save() + // default is ErrorIfExists, and since a table already exists we throw an exception + val e = intercept[AnalysisException] { + df.write.option("name", "t1").format(format).partitionBy("a").save() + } + assert(e.getMessage.contains("already exists")) + } + + test("save: Ignore mode") { + val format = classOf[InMemoryV1Provider].getName + val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + + df.write.option("name", "t1").format(format).partitionBy("a").save() + // no-op + df.write.option("name", 
"t1").format(format).mode("ignore").partitionBy("a").save() + + checkAnswer(InMemoryV1Provider.getTableData(spark, "t1"), df) + } + + test("save: tables can perform schema and partitioning checks if they already exist") { + val format = classOf[InMemoryV1Provider].getName + val df = Seq((1, "x"), (2, "y"), (3, "z")).toDF("a", "b") + + df.write.option("name", "t1").format(format).partitionBy("a").save() + val e2 = intercept[IllegalArgumentException] { + df.write.mode("append").option("name", "t1").format(format).partitionBy("b").save() + } + assert(e2.getMessage.contains("partitioning")) + + val e3 = intercept[IllegalArgumentException] { + Seq((1, "x")).toDF("c", "d").write.mode("append").option("name", "t1").format(format) + .save() + } + assert(e3.getMessage.contains("schema")) + } } class V1WriteFallbackSessionCatalogSuite @@ -116,26 +173,85 @@ private object InMemoryV1Provider { } } -class InMemoryV1Provider extends TableProvider with DataSourceRegister { +class InMemoryV1Provider + extends SimpleTableProvider + with DataSourceRegister + with CreatableRelationProvider { override def getTable(options: CaseInsensitiveStringMap): Table = { - InMemoryV1Provider.tables.getOrElseUpdate(options.get("name"), { + + InMemoryV1Provider.tables.getOrElse(options.get("name"), { new InMemoryTableWithV1Fallback( "InMemoryTableWithV1Fallback", - new StructType().add("a", IntegerType).add("b", StringType), - Array(IdentityTransform(FieldReference(Seq("a")))), + new StructType(), + Array.empty, options.asCaseSensitiveMap() ) }) } override def shortName(): String = "in-memory" + + override def createRelation( + sqlContext: SQLContext, + mode: SaveMode, + parameters: Map[String, String], + data: DataFrame): BaseRelation = { + val _sqlContext = sqlContext + + val partitioning = parameters.get(DataSourceUtils.PARTITIONING_COLUMNS_KEY).map { value => + DataSourceUtils.decodePartitioningColumns(value).map { partitioningColumn => + IdentityTransform(FieldReference(partitioningColumn)) + 
} + }.getOrElse(Nil) + + val tableName = parameters("name") + val tableOpt = InMemoryV1Provider.tables.get(tableName) + val table = tableOpt.getOrElse(new InMemoryTableWithV1Fallback( + "InMemoryTableWithV1Fallback", + data.schema.asNullable, + partitioning.toArray, + Map.empty[String, String].asJava + )) + if (tableOpt.isEmpty) { + InMemoryV1Provider.tables.put(tableName, table) + } else { + if (data.schema.asNullable != table.schema) { + throw new IllegalArgumentException("Wrong schema provided") + } + if (!partitioning.sameElements(table.partitioning)) { + throw new IllegalArgumentException("Wrong partitioning provided") + } + } + + def getRelation: BaseRelation = new BaseRelation { + override def sqlContext: SQLContext = _sqlContext + override def schema: StructType = table.schema + } + + if (mode == SaveMode.ErrorIfExists && tableOpt.isDefined) { + throw new AnalysisException("Table already exists") + } else if (mode == SaveMode.Ignore && tableOpt.isDefined) { + // do nothing + return getRelation + } + val writer = table.newWriteBuilder( + LogicalWriteInfoImpl( + "", StructType(Seq.empty), new CaseInsensitiveStringMap(parameters.asJava))) + if (mode == SaveMode.Overwrite) { + writer.asInstanceOf[SupportsTruncate].truncate() + } + writer.asInstanceOf[V1WriteBuilder].buildForV1Write().insert(data, overwrite = false) + getRelation + } } class InMemoryTableWithV1Fallback( override val name: String, override val schema: StructType, override val partitioning: Array[Transform], - override val properties: util.Map[String, String]) extends Table with SupportsWrite { + override val properties: util.Map[String, String]) + extends Table + with SupportsWrite { partitioning.foreach { t => if (!t.isInstanceOf[IdentityTransform]) { @@ -144,7 +260,6 @@ class InMemoryTableWithV1Fallback( } override def capabilities: util.Set[TableCapability] = Set( - TableCapability.BATCH_WRITE, TableCapability.V1_BATCH_WRITE, TableCapability.OVERWRITE_BY_FILTER, 
TableCapability.TRUNCATE).asJava @@ -155,8 +270,8 @@ class InMemoryTableWithV1Fallback( def getData: Seq[Row] = dataMap.values.flatten.toSeq - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = { - new FallbackWriteBuilder(options) + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { + new FallbackWriteBuilder(info.options) } private class FallbackWriteBuilder(options: CaseInsensitiveStringMap) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala new file mode 100644 index 0000000000000..289f9dc427795 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/V2CommandsCaseSensitivitySuite.scala @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connector + +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, TestRelation2} +import org.apache.spark.sql.catalyst.analysis.CreateTablePartitioningValidationSuite +import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, CreateTableAsSelect, LogicalPlan, ReplaceTableAsSelect} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.connector.catalog.{Identifier, TableChange} +import org.apache.spark.sql.connector.catalog.TableChange.ColumnPosition +import org.apache.spark.sql.connector.expressions.Expressions +import org.apache.spark.sql.execution.datasources.PreprocessTableCreation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{LongType, StringType} + +class V2CommandsCaseSensitivitySuite extends SharedSparkSession with AnalysisTest { + import CreateTablePartitioningValidationSuite._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ + + override protected def extendedAnalysisRules: Seq[Rule[LogicalPlan]] = { + Seq(PreprocessTableCreation(spark)) + } + + test("CreateTableAsSelect: using top level field for partitioning") { + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + Seq("ID", "iD").foreach { ref => + val plan = CreateTableAsSelect( + catalog, + Identifier.of(Array(), "table_name"), + Expressions.identity(ref) :: Nil, + TestRelation2, + Map.empty, + Map.empty, + ignoreIfExists = false) + + if (caseSensitive) { + assertAnalysisError(plan, Seq("Couldn't find column", ref), caseSensitive) + } else { + assertAnalysisSuccess(plan, caseSensitive) + } + } + } + } + } + + test("CreateTableAsSelect: using nested column for partitioning") { + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + Seq("POINT.X", "point.X", "poInt.x", "poInt.X").foreach { ref => + 
val plan = CreateTableAsSelect( + catalog, + Identifier.of(Array(), "table_name"), + Expressions.bucket(4, ref) :: Nil, + TestRelation2, + Map.empty, + Map.empty, + ignoreIfExists = false) + + if (caseSensitive) { + val field = ref.split("\\.") + assertAnalysisError(plan, Seq("Couldn't find column", field.head), caseSensitive) + } else { + assertAnalysisSuccess(plan, caseSensitive) + } + } + } + } + } + + test("ReplaceTableAsSelect: using top level field for partitioning") { + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + Seq("ID", "iD").foreach { ref => + val plan = ReplaceTableAsSelect( + catalog, + Identifier.of(Array(), "table_name"), + Expressions.identity(ref) :: Nil, + TestRelation2, + Map.empty, + Map.empty, + orCreate = true) + + if (caseSensitive) { + assertAnalysisError(plan, Seq("Couldn't find column", ref), caseSensitive) + } else { + assertAnalysisSuccess(plan, caseSensitive) + } + } + } + } + } + + test("ReplaceTableAsSelect: using nested column for partitioning") { + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + Seq("POINT.X", "point.X", "poInt.x", "poInt.X").foreach { ref => + val plan = ReplaceTableAsSelect( + catalog, + Identifier.of(Array(), "table_name"), + Expressions.bucket(4, ref) :: Nil, + TestRelation2, + Map.empty, + Map.empty, + orCreate = true) + + if (caseSensitive) { + val field = ref.split("\\.") + assertAnalysisError(plan, Seq("Couldn't find column", field.head), caseSensitive) + } else { + assertAnalysisSuccess(plan, caseSensitive) + } + } + } + } + } + + test("AlterTable: add column - nested") { + Seq("POINT.Z", "poInt.z", "poInt.Z").foreach { ref => + val field = ref.split("\\.") + alterTableTest( + TableChange.addColumn(field, LongType), + Seq("add", field.head) + ) + } + } + + test("AlterTable: add column resolution - positional") { + Seq("ID", "iD").foreach { ref => + alterTableTest( + 
TableChange.addColumn( + Array("f"), LongType, true, null, ColumnPosition.after(ref)), + Seq("reference column", ref) + ) + } + } + + test("AlterTable: add column resolution - nested positional") { + Seq("X", "Y").foreach { ref => + alterTableTest( + TableChange.addColumn( + Array("point", "z"), LongType, true, null, ColumnPosition.after(ref)), + Seq("reference column", ref) + ) + } + } + + test("AlterTable: drop column resolution") { + Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => + alterTableTest( + TableChange.deleteColumn(ref), + Seq("Cannot delete missing field", ref.quoted) + ) + } + } + + test("AlterTable: rename column resolution") { + Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => + alterTableTest( + TableChange.renameColumn(ref, "newName"), + Seq("Cannot rename missing field", ref.quoted) + ) + } + } + + test("AlterTable: drop column nullability resolution") { + Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => + alterTableTest( + TableChange.updateColumnNullability(ref, true), + Seq("Cannot update missing field", ref.quoted) + ) + } + } + + test("AlterTable: change column type resolution") { + Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => + alterTableTest( + TableChange.updateColumnType(ref, StringType), + Seq("Cannot update missing field", ref.quoted) + ) + } + } + + test("AlterTable: change column comment resolution") { + Seq(Array("ID"), Array("point", "X"), Array("POINT", "X"), Array("POINT", "x")).foreach { ref => + alterTableTest( + TableChange.updateColumnComment(ref, "Here's a comment for ya"), + Seq("Cannot update missing field", ref.quoted) + ) + } + } + + private def alterTableTest(change: TableChange, error: Seq[String]): Unit = { + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + 
val plan = AlterTable( + catalog, + Identifier.of(Array(), "table_name"), + TestRelation2, + Seq(change) + ) + + if (caseSensitive) { + assertAnalysisError(plan, error, caseSensitive) + } else { + assertAnalysisSuccess(plan, caseSensitive) + } + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregatingAccumulatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregatingAccumulatorSuite.scala new file mode 100644 index 0000000000000..a33b9fad7ff4f --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregatingAccumulatorSuite.scala @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution + +import java.util.Properties + +import org.apache.spark.{SparkFunSuite, TaskContext, TaskContextImpl} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, If, SortArray, SparkPartitionID, SpecificInternalRow} +import org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet +import org.apache.spark.sql.catalyst.util.GenericArrayData +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{LongType, StringType, StructType} +import org.apache.spark.unsafe.types.UTF8String + +/** + * Test suite for [[AggregatingAccumulator]]. + */ +class AggregatingAccumulatorSuite + extends SparkFunSuite + with SharedSparkSession + with ExpressionEvalHelper { + private val a = 'a.long + private val b = 'b.string + private val c = 'c.double + private val inputAttributes = Seq(a, b, c) + private def str(s: String): UTF8String = UTF8String.fromString(s) + + test("empty aggregation") { + val acc1 = AggregatingAccumulator( + Seq(sum(a) + 1L as "sum_a", max(b) as "max_b", approxCountDistinct(c) as "acntd_c"), + inputAttributes) + val expectedSchema = new StructType() + .add("sum_a", "long") + .add("max_b", "string") + .add("acntd_c", "long", nullable = false) + assert(acc1.schema === expectedSchema) + + val accEmpty = acc1.copy() + val acc2 = acc1.copy() + + // Merge empty + acc1.merge(accEmpty) + assert(acc1.isZero) + + // No updates + assert(acc1.isZero) + checkResult(acc1.value, InternalRow(null, null, 0), expectedSchema, false) + assert(acc1.isZero) + + // A few updates + acc1.add(InternalRow(4L, str("foo"), 4.9d)) + acc1.add(InternalRow(98L, str("bar"), -323.9d)) + acc1.add(InternalRow(-30L, str("baz"), 4129.8d)) + assert(!acc1.isZero) + checkResult(acc1.value, InternalRow(73L, str("baz"), 3L), expectedSchema, false) + + // Idempotency of result + checkResult(acc1.value, 
InternalRow(73L, str("baz"), 3L), expectedSchema, false) + + // A few updates to the copied accumulator using an updater + val updater = acc2.copyAndReset() + updater.add(InternalRow(-2L, str("qwerty"), -6773.9d)) + updater.add(InternalRow(-35L, str("zzz-top"), -323.9d)) + assert(acc2.isZero) + acc2.setState(updater) + checkResult(acc2.value, InternalRow(-36L, str("zzz-top"), 2L), expectedSchema, false) + + // Merge accumulators + acc1.merge(acc2) + acc1.merge(acc2) + acc1.merge(accEmpty) + acc1.merge(accEmpty) + checkResult(acc1.value, InternalRow(1L, str("zzz-top"), 5L), expectedSchema, false) + + // Reset + acc1.reset() + assert(acc1.isZero) + } + + test("non-deterministic expressions") { + val acc_driver = AggregatingAccumulator( + Seq( + min(SparkPartitionID()) as "min_pid", + max(SparkPartitionID()) as "max_pid", + SparkPartitionID()), + Nil) + checkResult(acc_driver.value, InternalRow(null, null, 0), acc_driver.schema, false) + + def inPartition(id: Int)(f: => Unit): Unit = { + val ctx = new TaskContextImpl(0, 0, 1, 0, 0, null, new Properties, null) + TaskContext.setTaskContext(ctx) + try { + f + } finally { + TaskContext.unset() + } + } + + val acc1 = acc_driver.copy() + inPartition(3) { + acc1.add(InternalRow.empty) + } + val acc2 = acc_driver.copy() + inPartition(42) { + acc2.add(InternalRow.empty) + } + val acc3 = acc_driver.copy() + inPartition(96) { + acc3.add(InternalRow.empty) + } + + acc_driver.merge(acc1) + acc_driver.merge(acc2) + acc_driver.merge(acc3) + assert(!acc_driver.isZero) + checkResult(acc_driver.value, InternalRow(3, 96, 0), acc_driver.schema, false) + } + + test("collect agg metrics on job") { + val acc = AggregatingAccumulator( + Seq( + avg(a) + 1.0d as "avg_a", + sum(a + 10L) as "sum_a", + min(b) as "min_b", + max(b) as "max_b", + approxCountDistinct(b) as "acntd_b", + SortArray(CollectSet(If(a < 1000L, a % 3L, a % 6L)).toAggregateExpression(), true) + as "item_set", + min(SparkPartitionID()) as "min_pid", + max(SparkPartitionID()) 
as "max_pid", + SparkPartitionID()), + Seq(a, b)) + sparkContext.register(acc) + def consume(ids: Iterator[Long]): Unit = { + val row = new SpecificInternalRow(Seq(LongType, StringType)) + ids.foreach { id => + // Create the new row values. + row.setLong(0, id) + row.update(1, UTF8String.fromString(f"val_$id%06d")) + + // Update the accumulator + acc.add(row) + } + } + + // Run job 1 + spark.sparkContext + .range(0, 1000, 1, 8) + .foreachPartition(consume) + assert(checkResult( + acc.value, + InternalRow( + 500.5d, + 509500L, + str("val_000000"), + str("val_000999"), + 1057L, + new GenericArrayData(Seq(0L, 1L, 2L)), + 0, + 7, + 0), + acc.schema, + false)) + + // Run job 2 + spark.sparkContext + .range(1000, 1200, 1, 8) + .foreachPartition(consume) + assert(checkResult( + acc.value, + InternalRow( + 600.5d, + 731400L, + str("val_000000"), + str("val_001199"), + 1280L, + new GenericArrayData(Seq(0L, 1L, 2L, 3L, 4L, 5L)), + 0, + 7, + 0), + acc.schema, + false)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/BroadcastExchangeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/BroadcastExchangeSuite.scala index 43e29c2d50786..7d6306b65ff47 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/BroadcastExchangeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/BroadcastExchangeSuite.scala @@ -21,13 +21,16 @@ import java.util.concurrent.{CountDownLatch, TimeUnit} import org.apache.spark.SparkException import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec import org.apache.spark.sql.execution.joins.HashedRelation import org.apache.spark.sql.functions.broadcast import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class BroadcastExchangeSuite extends SparkPlanTest with SharedSparkSession { +class BroadcastExchangeSuite extends 
SparkPlanTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ @@ -53,8 +56,8 @@ class BroadcastExchangeSuite extends SparkPlanTest with SharedSparkSession { }).where("id = value") // get the exchange physical plan - val hashExchange = df.queryExecution.executedPlan - .collect { case p: BroadcastExchangeExec => p }.head + val hashExchange = collect( + df.queryExecution.executedPlan) { case p: BroadcastExchangeExec => p }.head // materialize the future and wait for the job being scheduled hashExchange.prepare() @@ -84,8 +87,8 @@ class BroadcastExchangeSuite extends SparkPlanTest with SharedSparkSession { withSQLConf(SQLConf.BROADCAST_TIMEOUT.key -> "-1") { val df = spark.range(1).toDF() val joinDF = df.join(broadcast(df), "id") - val broadcastExchangeExec = joinDF.queryExecution.executedPlan - .collect { case p: BroadcastExchangeExec => p } + val broadcastExchangeExec = collect( + joinDF.queryExecution.executedPlan) { case p: BroadcastExchangeExec => p } assert(broadcastExchangeExec.size == 1, "one and only BroadcastExchangeExec") assert(joinDF.collect().length == 1) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala index f1a3092a193f4..f1411b263c77b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala @@ -16,9 +16,12 @@ */ package org.apache.spark.sql.execution +import scala.collection.mutable + import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.{DataFrame, QueryTest} import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan @@ -150,15 
+153,50 @@ class DataSourceV2ScanExecRedactionSuite extends DataSourceScanRedactionTest { } test("FileScan description") { - withTempPath { path => - val dir = path.getCanonicalPath - spark.range(0, 10).write.orc(dir) - val df = spark.read.orc(dir) + Seq("json", "orc", "parquet").foreach { format => + withTempPath { path => + val dir = path.getCanonicalPath + spark.range(0, 10).write.format(format).save(dir) + val df = spark.read.format(format).load(dir) + + withClue(s"Source '$format':") { + assert(isIncluded(df.queryExecution, "ReadSchema")) + assert(isIncluded(df.queryExecution, "BatchScan")) + if (Seq("orc", "parquet").contains(format)) { + assert(isIncluded(df.queryExecution, "PushedFilters")) + } + assert(isIncluded(df.queryExecution, "Location")) + } + } + } + } - assert(isIncluded(df.queryExecution, "ReadSchema")) - assert(isIncluded(df.queryExecution, "BatchScan")) - assert(isIncluded(df.queryExecution, "PushedFilters")) - assert(isIncluded(df.queryExecution, "Location")) + test("SPARK-30362: test input metrics for DSV2") { + withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { + Seq("json", "orc", "parquet").foreach { format => + withTempPath { path => + val dir = path.getCanonicalPath + spark.range(0, 10).write.format(format).save(dir) + val df = spark.read.format(format).load(dir) + val bytesReads = new mutable.ArrayBuffer[Long]() + val recordsRead = new mutable.ArrayBuffer[Long]() + val bytesReadListener = new SparkListener() { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + bytesReads += taskEnd.taskMetrics.inputMetrics.bytesRead + recordsRead += taskEnd.taskMetrics.inputMetrics.recordsRead + } + } + sparkContext.addSparkListener(bytesReadListener) + try { + df.collect() + sparkContext.listenerBus.waitUntilEmpty() + assert(bytesReads.sum > 0) + assert(recordsRead.sum == 10) + } finally { + sparkContext.removeSparkListener(bytesReadListener) + } + } + } } } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala new file mode 100644 index 0000000000000..1e90754ad7721 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DeprecatedWholeStageCodegenSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.HashAggregateExec +import org.apache.spark.sql.expressions.scalalang.typed +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession + +@deprecated("This test suite will be removed.", "3.0.0") +class DeprecatedWholeStageCodegenSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { + + test("simple typed UDAF should be included in WholeStageCodegen") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + // With enable AQE, the WholeStageCodegenExec rule is applied when running QueryStageExec. 
+ import testImplicits._ + + val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() + .groupByKey(_._1).agg(typed.sum(_._2)) + + val plan = ds.queryExecution.executedPlan + assert(find(plan)(p => + p.isInstanceOf[WholeStageCodegenExec] && + p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) + assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala index 104cf4c58d617..bb59b12e6f350 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/HiveResultSuite.scala @@ -17,25 +17,35 @@ package org.apache.spark.sql.execution -import java.sql.{Date, Timestamp} - import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession} class HiveResultSuite extends SharedSparkSession { import testImplicits._ test("date formatting in hive result") { - val date = "2018-12-28" - val executedPlan = Seq(Date.valueOf(date)).toDS().queryExecution.executedPlan - val result = HiveResult.hiveResultString(executedPlan) - assert(result.head == date) + val dates = Seq("2018-12-28", "1582-10-13", "1582-10-14", "1582-10-15") + val df = dates.toDF("a").selectExpr("cast(a as date) as b") + val executedPlan1 = df.queryExecution.executedPlan + val result = HiveResult.hiveResultString(executedPlan1) + assert(result == dates) + val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan + val result2 = HiveResult.hiveResultString(executedPlan2) + assert(result2 == dates.map(x => s"[$x]")) } test("timestamp formatting in hive result") { - val timestamp = "2018-12-28 01:02:03" - val executedPlan = Seq(Timestamp.valueOf(timestamp)).toDS().queryExecution.executedPlan - val result = HiveResult.hiveResultString(executedPlan) - assert(result.head == 
timestamp) + val timestamps = Seq( + "2018-12-28 01:02:03", + "1582-10-13 01:02:03", + "1582-10-14 01:02:03", + "1582-10-15 01:02:03") + val df = timestamps.toDF("a").selectExpr("cast(a as timestamp) as b") + val executedPlan1 = df.queryExecution.executedPlan + val result = HiveResult.hiveResultString(executedPlan1) + assert(result == timestamps) + val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan + val result2 = HiveResult.hiveResultString(executedPlan2) + assert(result2 == timestamps.map(x => s"[$x]")) } test("toHiveString correctly handles UDTs") { @@ -43,4 +53,19 @@ class HiveResultSuite extends SharedSparkSession { val tpe = new ExamplePointUDT() assert(HiveResult.toHiveString((point, tpe)) === "(50.0, 50.0)") } + + test("decimal formatting in hive result") { + val df = Seq(new java.math.BigDecimal("1")).toDS() + Seq(2, 6, 18).foreach { scala => + val executedPlan = + df.selectExpr(s"CAST(value AS decimal(38, $scala))").queryExecution.executedPlan + val result = HiveResult.hiveResultString(executedPlan) + assert(result.head.split("\\.").last.length === scala) + } + + val executedPlan = Seq(java.math.BigDecimal.ZERO).toDS() + .selectExpr(s"CAST(value AS decimal(38, 8))").queryExecution.executedPlan + val result = HiveResult.hiveResultString(executedPlan) + assert(result.head === "0.00000000") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala index aa83b9b11dcfc..311f84c07a955 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/LogicalPlanTagInSparkPlanSuite.scala @@ -29,11 +29,26 @@ import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, DataSourceV import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchangeExec} 
import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.window.WindowExec +import org.apache.spark.sql.internal.SQLConf class LogicalPlanTagInSparkPlanSuite extends TPCDSQuerySuite { - override protected def checkGeneratedCode(plan: SparkPlan): Unit = { - super.checkGeneratedCode(plan) + var originalValue: String = _ + // when enable AQE, the 'AdaptiveSparkPlanExec' node does not have a logical plan link + override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + + override protected def checkGeneratedCode( + plan: SparkPlan, checkMethodCodeSize: Boolean = true): Unit = { + super.checkGeneratedCode(plan, checkMethodCodeSize) checkLogicalPlanTag(plan) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index 90ce6765013b4..0c5e2e3c7d1d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -24,6 +24,8 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range, Repartition, Sort, Union} import org.apache.spark.sql.catalyst.plans.physical._ +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper +import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReusedExchangeExec, ReuseExchange, 
ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} @@ -32,7 +34,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ -class PlannerSuite extends SharedSparkSession { +class PlannerSuite extends SharedSparkSession with AdaptiveSparkPlanHelper { import testImplicits._ setupTestData() @@ -254,29 +256,31 @@ class PlannerSuite extends SharedSparkSession { // Disable broadcast join withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { { - val numExchanges = sql( + val plan = sql( """ |SELECT * |FROM | normal JOIN small ON (normal.key = small.key) | JOIN tiny ON (small.key = tiny.key) """.stripMargin - ).queryExecution.executedPlan.collect { + ).queryExecution.executedPlan + val numExchanges = collect(plan) { case exchange: ShuffleExchangeExec => exchange }.length assert(numExchanges === 5) } { - // This second query joins on different keys: - val numExchanges = sql( + val plan = sql( """ |SELECT * |FROM | normal JOIN small ON (normal.key = small.key) | JOIN tiny ON (normal.key = tiny.key) """.stripMargin - ).queryExecution.executedPlan.collect { + ).queryExecution.executedPlan + // This second query joins on different keys: + val numExchanges = collect(plan) { case exchange: ShuffleExchangeExec => exchange }.length assert(numExchanges === 5) @@ -689,7 +693,7 @@ class PlannerSuite extends SharedSparkSession { val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(smjExec) outputPlan match { - case SortMergeJoinExec(leftKeys, rightKeys, _, _, _, _) => + case SortMergeJoinExec(leftKeys, rightKeys, _, _, _, _, _) => assert(leftKeys == Seq(exprA, exprA)) assert(rightKeys == Seq(exprB, exprC)) case _ => fail() @@ -713,7 +717,8 @@ class PlannerSuite extends SharedSparkSession { SortExec(_, _, ShuffleExchangeExec(HashPartitioning(leftPartitioningExpressions, _), _, _), _), SortExec(_, _, - 
ShuffleExchangeExec(HashPartitioning(rightPartitioningExpressions, _), _, _), _)) => + ShuffleExchangeExec(HashPartitioning(rightPartitioningExpressions, _), + _, _), _), _) => assert(leftKeys === smjExec.leftKeys) assert(rightKeys === smjExec.rightKeys) assert(leftKeys === leftPartitioningExpressions) @@ -762,7 +767,7 @@ class PlannerSuite extends SharedSparkSession { def checkReusedExchangeOutputPartitioningRewrite( df: DataFrame, expectedPartitioningClass: Class[_]): Unit = { - val reusedExchange = df.queryExecution.executedPlan.collect { + val reusedExchange = collect(df.queryExecution.executedPlan) { case r: ReusedExchangeExec => r } checkOutputPartitioningRewrite(reusedExchange, expectedPartitioningClass) @@ -771,31 +776,34 @@ class PlannerSuite extends SharedSparkSession { def checkInMemoryTableScanOutputPartitioningRewrite( df: DataFrame, expectedPartitioningClass: Class[_]): Unit = { - val inMemoryScan = df.queryExecution.executedPlan.collect { + val inMemoryScan = collect(df.queryExecution.executedPlan) { case m: InMemoryTableScanExec => m } checkOutputPartitioningRewrite(inMemoryScan, expectedPartitioningClass) } + // when enable AQE, the reusedExchange is inserted when executed. 
+ withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + // ReusedExchange is HashPartitioning + val df1 = Seq(1 -> "a").toDF("i", "j").repartition($"i") + val df2 = Seq(1 -> "a").toDF("i", "j").repartition($"i") + checkReusedExchangeOutputPartitioningRewrite(df1.union(df2), classOf[HashPartitioning]) + + // ReusedExchange is RangePartitioning + val df3 = Seq(1 -> "a").toDF("i", "j").orderBy($"i") + val df4 = Seq(1 -> "a").toDF("i", "j").orderBy($"i") + checkReusedExchangeOutputPartitioningRewrite(df3.union(df4), classOf[RangePartitioning]) + + // InMemoryTableScan is HashPartitioning + Seq(1 -> "a").toDF("i", "j").repartition($"i").persist() + checkInMemoryTableScanOutputPartitioningRewrite( + Seq(1 -> "a").toDF("i", "j").repartition($"i"), classOf[HashPartitioning]) - // ReusedExchange is HashPartitioning - val df1 = Seq(1 -> "a").toDF("i", "j").repartition($"i") - val df2 = Seq(1 -> "a").toDF("i", "j").repartition($"i") - checkReusedExchangeOutputPartitioningRewrite(df1.union(df2), classOf[HashPartitioning]) - - // ReusedExchange is RangePartitioning - val df3 = Seq(1 -> "a").toDF("i", "j").orderBy($"i") - val df4 = Seq(1 -> "a").toDF("i", "j").orderBy($"i") - checkReusedExchangeOutputPartitioningRewrite(df3.union(df4), classOf[RangePartitioning]) - - // InMemoryTableScan is HashPartitioning - Seq(1 -> "a").toDF("i", "j").repartition($"i").persist() - checkInMemoryTableScanOutputPartitioningRewrite( - Seq(1 -> "a").toDF("i", "j").repartition($"i"), classOf[HashPartitioning]) - - // InMemoryTableScan is RangePartitioning - spark.range(1, 100, 1, 10).toDF().persist() - checkInMemoryTableScanOutputPartitioningRewrite( - spark.range(1, 100, 1, 10).toDF(), classOf[RangePartitioning]) + // InMemoryTableScan is RangePartitioning + spark.range(1, 100, 1, 10).toDF().persist() + checkInMemoryTableScanOutputPartitioningRewrite( + spark.range(1, 100, 1, 10).toDF(), classOf[RangePartitioning]) + } // 
InMemoryTableScan is PartitioningCollection withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { @@ -856,6 +864,117 @@ class PlannerSuite extends SharedSparkSession { StructField("f2", StringType, nullable = true), StructField("f3", StringType, nullable = false)))) } + + test("Do not analyze subqueries twice") { + // Analyzing the subquery twice will result in stacked + // CheckOverflow & PromotePrecision expressions. + val df = sql( + """ + |SELECT id, + | (SELECT 1.3000000 * AVG(CAST(id AS DECIMAL(10, 3))) FROM range(13)) AS ref + |FROM range(5) + |""".stripMargin) + + val Seq(subquery) = stripAQEPlan(df.queryExecution.executedPlan).subqueriesAll + subquery.foreach { node => + node.expressions.foreach { expression => + expression.foreach { + case PromotePrecision(_: PromotePrecision) => + fail(s"$expression contains stacked PromotePrecision expressions.") + case CheckOverflow(_: CheckOverflow, _, _) => + fail(s"$expression contains stacked CheckOverflow expressions.") + case _ => // Ok + } + } + } + } + + test("aliases in the project should not introduce extra shuffle") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("df1", "df2") { + spark.range(10).selectExpr("id AS key", "0").repartition($"key").createTempView("df1") + spark.range(20).selectExpr("id AS key", "0").repartition($"key").createTempView("df2") + val planned = sql( + """ + |SELECT * FROM + | (SELECT key AS k from df1) t1 + |INNER JOIN + | (SELECT key AS k from df2) t2 + |ON t1.k = t2.k + """.stripMargin).queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 2) + } + } + } + + test("aliases to expressions should not be replaced") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + withTempView("df1", "df2") { + spark.range(10).selectExpr("id AS key", "0").repartition($"key").createTempView("df1") + spark.range(20).selectExpr("id AS key", 
"0").repartition($"key").createTempView("df2") + val planned = sql( + """ + |SELECT * FROM + | (SELECT key + 1 AS k1 from df1) t1 + |INNER JOIN + | (SELECT key + 1 AS k2 from df2) t2 + |ON t1.k1 = t2.k2 + |""".stripMargin).queryExecution.executedPlan + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + + // Make sure aliases to an expression (key + 1) are not replaced. + Seq("k1", "k2").foreach { alias => + assert(exchanges.exists(_.outputPartitioning match { + case HashPartitioning(Seq(a: AttributeReference), _) => a.name == alias + case _ => false + })) + } + } + } + } + + test("aliases in the aggregate expressions should not introduce extra shuffle") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + val t1 = spark.range(10).selectExpr("floor(id/4) as k1") + val t2 = spark.range(20).selectExpr("floor(id/4) as k2") + + val agg1 = t1.groupBy("k1").agg(count(lit("1")).as("cnt1")) + val agg2 = t2.groupBy("k2").agg(count(lit("1")).as("cnt2")).withColumnRenamed("k2", "k3") + + val planned = agg1.join(agg2, $"k1" === $"k3").queryExecution.executedPlan + + assert(planned.collect { case h: HashAggregateExec => h }.nonEmpty) + + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 2) + } + } + + test("aliases in the object hash/sort aggregate expressions should not introduce extra shuffle") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") { + Seq(true, false).foreach { useObjectHashAgg => + withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> useObjectHashAgg.toString) { + val t1 = spark.range(10).selectExpr("floor(id/4) as k1") + val t2 = spark.range(20).selectExpr("floor(id/4) as k2") + + val agg1 = t1.groupBy("k1").agg(collect_list("k1")) + val agg2 = t2.groupBy("k2").agg(collect_list("k2")).withColumnRenamed("k2", "k3") + + val planned = agg1.join(agg2, $"k1" === $"k3").queryExecution.executedPlan + + if (useObjectHashAgg) { + assert(planned.collect { case o: 
ObjectHashAggregateExec => o }.nonEmpty) + } else { + assert(planned.collect { case s: SortAggregateExec => s }.nonEmpty) + } + + val exchanges = planned.collect { case s: ShuffleExchangeExec => s } + assert(exchanges.size == 2) + } + } + } + } } // Used for unit-testing EnsureRequirements diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala index 76006efda992f..987338cf6cbbf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/QueryPlanningTrackerEndToEndSuite.scala @@ -17,9 +17,11 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamExecution} +import org.apache.spark.sql.streaming.StreamTest -class QueryPlanningTrackerEndToEndSuite extends SharedSparkSession { +class QueryPlanningTrackerEndToEndSuite extends StreamTest { + import testImplicits._ test("programmatic API") { val df = spark.range(1000).selectExpr("count(*)") @@ -38,4 +40,22 @@ class QueryPlanningTrackerEndToEndSuite extends SharedSparkSession { assert(tracker.rules.nonEmpty) } + test("SPARK-29227: Track rule info in optimization phase in streaming") { + val inputData = MemoryStream[Int] + val df = inputData.toDF() + + def assertStatus(stream: StreamExecution): Unit = { + stream.processAllAvailable() + val tracker = stream.lastExecution.tracker + assert(tracker.phases.keys == Set("analysis", "optimization", "planning")) + assert(tracker.rules.nonEmpty) + } + + testStream(df)( + StartStream(), + AddData(inputData, 1, 2, 3), + Execute(assertStatus), + StopStream) + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala index b5dbdd0b18b49..5565a0dd01840 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReduceNumShufflePartitionsSuite.scala @@ -19,11 +19,12 @@ package org.apache.spark.sql.execution import org.scalatest.BeforeAndAfterAll -import org.apache.spark.{MapOutputStatistics, SparkConf, SparkFunSuite} +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.config.UI.UI_ENABLED import org.apache.spark.sql._ import org.apache.spark.sql.execution.adaptive._ -import org.apache.spark.sql.execution.adaptive.rule.{CoalescedShuffleReaderExec, ReduceNumShufflePartitions} +import org.apache.spark.sql.execution.adaptive.CoalescedShuffleReaderExec +import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -51,220 +52,8 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA } } - private def checkEstimation( - rule: ReduceNumShufflePartitions, - bytesByPartitionIdArray: Array[Array[Long]], - expectedPartitionStartIndices: Array[Int]): Unit = { - val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { - case (bytesByPartitionId, index) => - new MapOutputStatistics(index, bytesByPartitionId) - } - val estimatedPartitionStartIndices = - rule.estimatePartitionStartIndices(mapOutputStatistics) - assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) - } - - private def createReduceNumShufflePartitionsRule( - advisoryTargetPostShuffleInputSize: Long, - minNumPostShufflePartitions: Int = 1): ReduceNumShufflePartitions = { - val conf = new SQLConf().copy( - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE -> advisoryTargetPostShuffleInputSize, - SQLConf.SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS -> 
minNumPostShufflePartitions) - ReduceNumShufflePartitions(conf) - } - - test("test estimatePartitionStartIndices - 1 Exchange") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // All bytes per partition are 0. - val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0 and total size is less than the target size. - // 1 post-shuffle partition is needed. - val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partitions are needed. - val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. - val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // All pre-shuffle partitions are larger than the targeted size. - val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - - { - // The last pre-shuffle partition is in a single post-shuffle partition. 
- val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) - val expectedPartitionStartIndices = Array[Int](0, 4) - checkEstimation(rule, Array(bytesByPartitionId), expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices - 2 Exchanges") { - val rule = createReduceNumShufflePartitionsRule(100L) - - { - // If there are multiple values of the number of pre-shuffle partitions, - // we should see an assertion error. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) - val mapOutputStatistics = - Array( - new MapOutputStatistics(0, bytesByPartitionId1), - new MapOutputStatistics(1, bytesByPartitionId2)) - intercept[AssertionError](rule.estimatePartitionStartIndices(mapOutputStatistics)) - } - - { - // All bytes per partition are 0. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // Some bytes per partition are 0. - // 1 post-shuffle partition is needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 4 post-shuffle partition are needed. 
- val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // 2 post-shuffle partition are needed. - val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // There are a few large pre-shuffle partitions. - val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // All pairs of pre-shuffle partitions are larger than the targeted size. - val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) - val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) - val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - test("test estimatePartitionStartIndices and enforce minimal number of reducers") { - val rule = createReduceNumShufflePartitionsRule(100L, 2) - - { - // The minimal number of post-shuffle partitions is not enforced because - // the size of data is 0. - val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) - val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) - val expectedPartitionStartIndices = Array[Int](0) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The minimal number of post-shuffle partitions is enforced. 
- val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) - val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) - val expectedPartitionStartIndices = Array[Int](0, 3) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - - { - // The number of post-shuffle partitions is determined by the coordinator. - val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) - val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) - val expectedPartitionStartIndices = Array[Int](0, 1, 3, 4) - checkEstimation( - rule, - Array(bytesByPartitionId1, bytesByPartitionId2), - expectedPartitionStartIndices) - } - } - - /////////////////////////////////////////////////////////////////////////// - // Query tests - /////////////////////////////////////////////////////////////////////////// - val numInputPartitions: Int = 10 - def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { - QueryTest.checkAnswer(actual, expectedAnswer) match { - case Some(errorMessage) => fail(errorMessage) - case None => - } - } - def withSparkSession( f: SparkSession => Unit, targetPostShuffleInputSize: Int, @@ -274,6 +63,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA .setMaster("local[*]") .setAppName("test") .set(UI_ENABLED, false) + .set(SQLConf.SHUFFLE_PARTITIONS.key, "5") .set(SQLConf.SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS.key, "5") .set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "true") .set(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key, "-1") @@ -308,7 +98,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val agg = df.groupBy("key").count() // Check the answer first. 
- checkAnswer( + QueryTest.checkAnswer( agg, spark.range(0, 20).selectExpr("id", "50 as cnt").collect()) @@ -355,7 +145,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA .range(0, 1000) .selectExpr("id % 500 as key", "id as value") .union(spark.range(0, 1000).selectExpr("id % 500 as key", "id as value")) - checkAnswer( + QueryTest.checkAnswer( join, expectedAnswer.collect()) @@ -407,7 +197,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA spark .range(0, 500) .selectExpr("id", "2 as cnt") - checkAnswer( + QueryTest.checkAnswer( join, expectedAnswer.collect()) @@ -459,7 +249,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA spark .range(0, 1000) .selectExpr("id % 500 as key", "2 as cnt", "id as value") - checkAnswer( + QueryTest.checkAnswer( join, expectedAnswer.collect()) @@ -503,11 +293,11 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA // Check the answer first. val expectedAnswer = spark.range(0, 500).selectExpr("id % 500", "id as value") .union(spark.range(500, 1000).selectExpr("id % 500", "id as value")) - checkAnswer( + QueryTest.checkAnswer( join, expectedAnswer.collect()) - // Then, let's make sure we do not reduce number of ppst shuffle partitions. + // Then, let's make sure we do not reduce number of post shuffle partitions. 
val finalPlan = join.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan val shuffleReaders = finalPlan.collect { @@ -533,10 +323,12 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA // ReusedQueryStage 0 // ReusedQueryStage 0 val resultDf = df.join(df, "key").join(df, "key") - checkAnswer(resultDf, Row(0, 0, 0, 0) :: Nil) + QueryTest.checkAnswer(resultDf, Row(0, 0, 0, 0) :: Nil) val finalPlan = resultDf.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan - assert(finalPlan.collect { case p: ReusedQueryStageExec => p }.length == 2) + assert(finalPlan.collect { + case ShuffleQueryStageExec(_, r: ReusedExchangeExec) => r + }.length == 2) assert(finalPlan.collect { case p: CoalescedShuffleReaderExec => p }.length == 3) @@ -549,7 +341,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val grouped = df.groupBy("key").agg(max("value").as("value")) val resultDf2 = grouped.groupBy(col("key") + 1).max("value") .union(grouped.groupBy(col("key") + 2).max("value")) - checkAnswer(resultDf2, Row(1, 0) :: Row(2, 0) :: Nil) + QueryTest.checkAnswer(resultDf2, Row(1, 0) :: Row(2, 0) :: Nil) val finalPlan2 = resultDf2.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan @@ -567,7 +359,9 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA assert(leafStages.length == 2) val reusedStages = level1Stages.flatMap { stage => - stage.plan.collect { case r: ReusedQueryStageExec => r } + stage.plan.collect { + case ShuffleQueryStageExec(_, r: ReusedExchangeExec) => r + } } assert(reusedStages.length == 1) } @@ -579,7 +373,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val ds = spark.range(3) val resultDf = ds.repartition(2, ds.col("id")).toDF() - checkAnswer(resultDf, + QueryTest.checkAnswer(resultDf, Seq(0, 1, 2).map(i => Row(i))) val finalPlan = 
resultDf.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan @@ -595,7 +389,7 @@ class ReduceNumShufflePartitionsSuite extends SparkFunSuite with BeforeAndAfterA val resultDf = df1.union(df2) - checkAnswer(resultDf, Seq((0), (1), (2), (3)).map(i => Row(i))) + QueryTest.checkAnswer(resultDf, Seq((0), (1), (2), (3)).map(i => Row(i))) val finalPlan = resultDf.queryExecution.executedPlan .asInstanceOf[AdaptiveSparkPlanExec].executedPlan diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala index 6abcb1f067968..25b4464823e5f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ReferenceSort.scala @@ -21,7 +21,7 @@ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ -import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Attribute, RowOrdering, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter @@ -41,7 +41,7 @@ case class ReferenceSort( protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => - val ordering = newOrdering(sortOrder, child.output) + val ordering = RowOrdering.create(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala index f6b006b98edd1..8bf7fe62cd49b 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLExecutionSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution -import java.util.Properties +import scala.collection.parallel.immutable.ParRange import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} @@ -46,7 +46,7 @@ class SQLExecutionSuite extends SparkFunSuite { import spark.implicits._ try { // Should not throw IllegalArgumentException - (1 to 100).par.foreach { _ => + new ParRange(1 to 100).foreach { _ => spark.sparkContext.parallelize(1 to 5).map { i => (i, i) }.toDF("a", "b").count() } } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index ff84b05713676..9a393f19ce9bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -80,7 +80,8 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") }.getMessage assert(e.contains("Not allowed to create a permanent view `jtv1` by " + - "referencing a temporary view `temp_jtv1`")) + "referencing a temporary view temp_jtv1. 
" + + "Please create a temp view instead by CREATE TEMP VIEW")) val globalTempDB = spark.sharedState.globalTempViewManager.database sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") @@ -88,7 +89,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") }.getMessage assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + - s"a temporary view `global_temp`.`global_temp_jtv1`")) + s"a temporary view global_temp.global_temp_jtv1")) } } } @@ -136,12 +137,21 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { assertNoSuchTable(s"ALTER TABLE $viewName SET SERDE 'whatever'") assertNoSuchTable(s"ALTER TABLE $viewName PARTITION (a=1, b=2) SET SERDE 'whatever'") assertNoSuchTable(s"ALTER TABLE $viewName SET SERDEPROPERTIES ('p' = 'an')") - assertNoSuchTable(s"ALTER TABLE $viewName SET LOCATION '/path/to/your/lovely/heart'") - assertNoSuchTable(s"ALTER TABLE $viewName PARTITION (a='4') SET LOCATION '/path/to/home'") assertNoSuchTable(s"ALTER TABLE $viewName ADD IF NOT EXISTS PARTITION (a='4', b='8')") assertNoSuchTable(s"ALTER TABLE $viewName DROP PARTITION (a='4', b='8')") assertNoSuchTable(s"ALTER TABLE $viewName PARTITION (a='4') RENAME TO PARTITION (a='5')") assertNoSuchTable(s"ALTER TABLE $viewName RECOVER PARTITIONS") + + // For v2 ALTER TABLE statements, we have better error message saying view is not supported. + assertAnalysisError( + s"ALTER TABLE $viewName SET LOCATION '/path/to/your/lovely/heart'", + s"'$viewName' is a view not a table") + + // For the following v2 ALERT TABLE statements, unsupported operations are checked first + // before resolving the relations. 
+ assertAnalysisError( + s"ALTER TABLE $viewName PARTITION (a='4') SET LOCATION '/path/to/home'", + "ALTER TABLE SET LOCATION does not support partition for v2 tables") } } @@ -175,6 +185,11 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } } + private def assertAnalysisError(query: String, message: String): Unit = { + val e = intercept[AnalysisException](sql(query)) + assert(e.message.contains(message)) + } + test("error handling: insert/load/truncate table commands against a view") { val viewName = "testView" withView(viewName) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala new file mode 100644 index 0000000000000..fcfde83b2ffd5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ShufflePartitionsCoalescerSuite.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.{MapOutputStatistics, SparkFunSuite} +import org.apache.spark.sql.execution.adaptive.ShufflePartitionsCoalescer + +class ShufflePartitionsCoalescerSuite extends SparkFunSuite { + + private def checkEstimation( + bytesByPartitionIdArray: Array[Array[Long]], + expectedPartitionStartIndices: Array[Int], + targetSize: Long, + minNumPartitions: Int = 1): Unit = { + val mapOutputStatistics = bytesByPartitionIdArray.zipWithIndex.map { + case (bytesByPartitionId, index) => + new MapOutputStatistics(index, bytesByPartitionId) + } + val estimatedPartitionStartIndices = ShufflePartitionsCoalescer.coalescePartitions( + mapOutputStatistics, + 0, + bytesByPartitionIdArray.head.length, + targetSize, + minNumPartitions) + assert(estimatedPartitionStartIndices === expectedPartitionStartIndices) + } + + test("1 shuffle") { + val targetSize = 100 + + { + // All bytes per partition are 0. + val bytesByPartitionId = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // Some bytes per partition are 0 and total size is less than the target size. + // 1 coalesced partition is expected. + val bytesByPartitionId = Array[Long](10, 0, 20, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // 2 coalesced partitions are expected. + val bytesByPartitionId = Array[Long](10, 0, 90, 20, 0) + val expectedPartitionStartIndices = Array[Int](0, 3) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // There are a few large shuffle partitions. 
+ val bytesByPartitionId = Array[Long](110, 10, 100, 110, 0) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // All shuffle partitions are larger than the targeted size. + val bytesByPartitionId = Array[Long](100, 110, 100, 110, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + + { + // The last shuffle partition is in a single coalesced partition. + val bytesByPartitionId = Array[Long](30, 30, 0, 40, 110) + val expectedPartitionStartIndices = Array[Int](0, 4) + checkEstimation(Array(bytesByPartitionId), expectedPartitionStartIndices, targetSize) + } + } + + test("2 shuffles") { + val targetSize = 100 + + { + // If there are multiple values of the number of shuffle partitions, + // we should see an assertion error. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0, 0) + intercept[AssertionError] { + checkEstimation(Array(bytesByPartitionId1, bytesByPartitionId2), Array.empty, targetSize) + } + } + + { + // All bytes per partition are 0. + val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // Some bytes per partition are 0. + // 1 coalesced partition is expected. + val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 20, 0, 20) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 2 coalesced partition are expected. 
+ val bytesByPartitionId1 = Array[Long](0, 10, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 4 coalesced partition are expected. + val bytesByPartitionId1 = Array[Long](0, 99, 0, 20, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // 2 coalesced partition are needed. + val bytesByPartitionId1 = Array[Long](0, 100, 0, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 70, 0, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // There are a few large shuffle partitions. + val bytesByPartitionId1 = Array[Long](0, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 0, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + + { + // All pairs of shuffle partitions are larger than the targeted size. + val bytesByPartitionId1 = Array[Long](100, 100, 40, 30, 0) + val bytesByPartitionId2 = Array[Long](30, 0, 60, 70, 110) + val expectedPartitionStartIndices = Array[Int](0, 1, 2, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize) + } + } + + test("enforce minimal number of coalesced partitions") { + val targetSize = 100 + val minNumPartitions = 2 + + { + // The minimal number of coalesced partitions is not enforced because + // the size of data is 0. 
+ val bytesByPartitionId1 = Array[Long](0, 0, 0, 0, 0) + val bytesByPartitionId2 = Array[Long](0, 0, 0, 0, 0) + val expectedPartitionStartIndices = Array[Int](0) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + + { + // The minimal number of coalesced partitions is enforced. + val bytesByPartitionId1 = Array[Long](10, 5, 5, 0, 20) + val bytesByPartitionId2 = Array[Long](5, 10, 0, 10, 5) + val expectedPartitionStartIndices = Array[Int](0, 3) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + + { + // The number of coalesced partitions is determined by the algorithm. + val bytesByPartitionId1 = Array[Long](10, 50, 20, 80, 20) + val bytesByPartitionId2 = Array[Long](40, 10, 0, 10, 30) + val expectedPartitionStartIndices = Array[Int](0, 1, 3, 4) + checkEstimation( + Array(bytesByPartitionId1, bytesByPartitionId2), + expectedPartitionStartIndices, + targetSize, minNumPartitions) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala index 79000be05a8c7..56fff1107ae39 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanSuite.scala @@ -34,6 +34,7 @@ class SparkPlanSuite extends QueryTest with SharedSparkSession { intercept[IllegalStateException] { plan.executeToIterator() } intercept[IllegalStateException] { plan.executeBroadcast() } intercept[IllegalStateException] { plan.executeTake(1) } + intercept[IllegalStateException] { plan.executeTail(1) } } test("SPARK-23731 plans should be canonicalizable after being (de)serialized") { @@ -83,4 +84,8 @@ class SparkPlanSuite extends QueryTest with SharedSparkSession { } } } + + test("SPARK-30780 empty LocalTableScan should 
use RDD without partitions") { + assert(LocalTableScanExec(Nil, Nil).execute().getNumPartitions == 0) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala index b751fb7c50438..06574a9f8fd2c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkSqlParserSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.SaveMode -import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedAlias, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.catalyst.expressions.{Ascending, Concat, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, RepartitionByExpression, Sort} -import org.apache.spark.sql.catalyst.plans.logical.sql.{DescribeColumnStatement, DescribeTableStatement} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.datasources.{CreateTable, RefreshResource} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -81,33 +80,6 @@ class SparkSqlParserSuite extends AnalysisTest { intercept("REFRESH", "Resource paths cannot be empty in REFRESH statements") } - test("show functions") { - assertEqual("show functions", ShowFunctionsCommand(None, None, true, true)) - assertEqual("show all functions", ShowFunctionsCommand(None, None, true, true)) - assertEqual("show user functions", ShowFunctionsCommand(None, None, true, false)) - assertEqual("show system functions", ShowFunctionsCommand(None, None, false, true)) - intercept("show special functions", "SHOW 
special FUNCTIONS") - assertEqual("show functions foo", - ShowFunctionsCommand(None, Some("foo"), true, true)) - assertEqual("show functions foo.bar", - ShowFunctionsCommand(Some("foo"), Some("bar"), true, true)) - assertEqual("show functions 'foo\\\\.*'", - ShowFunctionsCommand(None, Some("foo\\.*"), true, true)) - intercept("show functions foo.bar.baz", "Unsupported function name") - } - - test("describe function") { - assertEqual("describe function bar", - DescribeFunctionCommand(FunctionIdentifier("bar", database = None), isExtended = false)) - assertEqual("describe function extended bar", - DescribeFunctionCommand(FunctionIdentifier("bar", database = None), isExtended = true)) - assertEqual("describe function foo.bar", - DescribeFunctionCommand( - FunctionIdentifier("bar", database = Some("foo")), isExtended = false)) - assertEqual("describe function extended f.bar", - DescribeFunctionCommand(FunctionIdentifier("bar", database = Some("f")), isExtended = true)) - } - private def createTableUsing( table: String, database: Option[String] = None, @@ -160,7 +132,7 @@ class SparkSqlParserSuite extends AnalysisTest { } test("create table - schema") { - assertEqual("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING)", + assertEqual("CREATE TABLE my_tab(a INT COMMENT 'test', b STRING) STORED AS textfile", createTable( table = "my_tab", schema = (new StructType) @@ -180,7 +152,8 @@ class SparkSqlParserSuite extends AnalysisTest { partitionColumnNames = Seq("c", "d") ) ) - assertEqual("CREATE TABLE my_tab(id BIGINT, nested STRUCT)", + assertEqual("CREATE TABLE my_tab(id BIGINT, nested STRUCT) " + + "STORED AS textfile", createTable( table = "my_tab", schema = (new StructType) @@ -217,68 +190,6 @@ class SparkSqlParserSuite extends AnalysisTest { assertEqual("DESCRIBE " + query, DescribeQueryCommand(query, parser.parsePlan(query))) } - test("analyze table statistics") { - assertEqual("analyze table t compute statistics", - AnalyzeTableCommand(TableIdentifier("t"), noscan 
= false)) - assertEqual("analyze table t compute statistics noscan", - AnalyzeTableCommand(TableIdentifier("t"), noscan = true)) - assertEqual("analyze table t partition (a) compute statistics nOscAn", - AnalyzePartitionCommand(TableIdentifier("t"), Map("a" -> None), noscan = true)) - - // Partitions specified - assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = false, - partitionSpec = Map("ds" -> Some("2008-04-09"), "hr" -> Some("11")))) - assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr=11) COMPUTE STATISTICS noscan", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = true, - partitionSpec = Map("ds" -> Some("2008-04-09"), "hr" -> Some("11")))) - assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09') COMPUTE STATISTICS noscan", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = true, - partitionSpec = Map("ds" -> Some("2008-04-09")))) - assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = false, - partitionSpec = Map("ds" -> Some("2008-04-09"), "hr" -> None))) - assertEqual("ANALYZE TABLE t PARTITION(ds='2008-04-09', hr) COMPUTE STATISTICS noscan", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = true, - partitionSpec = Map("ds" -> Some("2008-04-09"), "hr" -> None))) - assertEqual("ANALYZE TABLE t PARTITION(ds, hr=11) COMPUTE STATISTICS noscan", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = true, - partitionSpec = Map("ds" -> None, "hr" -> Some("11")))) - assertEqual("ANALYZE TABLE t PARTITION(ds, hr) COMPUTE STATISTICS", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = false, - partitionSpec = Map("ds" -> None, "hr" -> None))) - assertEqual("ANALYZE TABLE t PARTITION(ds, hr) COMPUTE STATISTICS noscan", - AnalyzePartitionCommand(TableIdentifier("t"), noscan = true, - partitionSpec = Map("ds" -> None, "hr" -> None))) - - intercept("analyze 
table t compute statistics xxxx", - "Expected `NOSCAN` instead of `xxxx`") - intercept("analyze table t partition (a) compute statistics xxxx", - "Expected `NOSCAN` instead of `xxxx`") - } - - test("analyze table column statistics") { - intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS", "") - - assertEqual("ANALYZE TABLE t COMPUTE STATISTICS FOR COLUMNS key, value", - AnalyzeColumnCommand(TableIdentifier("t"), Option(Seq("key", "value")), allColumns = false)) - - // Partition specified - should be ignored - assertEqual("ANALYZE TABLE t PARTITION(ds='2017-06-10') " + - "COMPUTE STATISTICS FOR COLUMNS key, value", - AnalyzeColumnCommand(TableIdentifier("t"), Option(Seq("key", "value")), allColumns = false)) - - // Partition specified should be ignored in case of COMPUTE STATISTICS FOR ALL COLUMNS - assertEqual("ANALYZE TABLE t PARTITION(ds='2017-06-10') " + - "COMPUTE STATISTICS FOR ALL COLUMNS", - AnalyzeColumnCommand(TableIdentifier("t"), None, allColumns = true)) - - intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR ALL COLUMNS key, value", - "mismatched input 'key' expecting ") - intercept("ANALYZE TABLE t COMPUTE STATISTICS FOR ALL", - "missing 'COLUMNS' at ''") - } - test("query organization") { // Test all valid combinations of order by/sort by/distribute by/cluster by/limit/windows val baseSql = "select * from t" @@ -322,4 +233,22 @@ class SparkSqlParserSuite extends AnalysisTest { parser.parsePlan("ALTER SCHEMA foo SET DBPROPERTIES ('x' = 'y')")) assertEqual("DESC DATABASE foo", parser.parsePlan("DESC SCHEMA foo")) } + + test("manage resources") { + assertEqual("ADD FILE abc.txt", AddFileCommand("abc.txt")) + assertEqual("ADD FILE 'abc.txt'", AddFileCommand("abc.txt")) + assertEqual("ADD FILE \"/path/to/abc.txt\"", AddFileCommand("/path/to/abc.txt")) + assertEqual("LIST FILE abc.txt", ListFilesCommand(Array("abc.txt"))) + assertEqual("LIST FILE '/path//abc.txt'", ListFilesCommand(Array("/path//abc.txt"))) + assertEqual("LIST FILE 
\"/path2/abc.txt\"", ListFilesCommand(Array("/path2/abc.txt"))) + assertEqual("ADD JAR /path2/_2/abc.jar", AddJarCommand("/path2/_2/abc.jar")) + assertEqual("ADD JAR '/test/path_2/jar/abc.jar'", AddJarCommand("/test/path_2/jar/abc.jar")) + assertEqual("ADD JAR \"abc.jar\"", AddJarCommand("abc.jar")) + assertEqual("LIST JAR /path-with-dash/abc.jar", + ListJarsCommand(Array("/path-with-dash/abc.jar"))) + assertEqual("LIST JAR 'abc.jar'", ListJarsCommand(Array("abc.jar"))) + assertEqual("LIST JAR \"abc.jar\"", ListJarsCommand(Array("abc.jar"))) + assertEqual("ADD FILE /path with space/abc.txt", AddFileCommand("/path with space/abc.txt")) + assertEqual("ADD JAR /path with space/abc.jar", AddJarCommand("/path with space/abc.jar")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 392cce54ebede..ef81f1b788496 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -57,7 +57,7 @@ class UnsafeFixedWidthAggregationMapSuite private var taskContext: TaskContext = null - def testWithMemoryLeakDetection(name: String)(f: => Unit) { + def testWithMemoryLeakDetection(name: String)(f: => Unit): Unit = { def cleanup(): Unit = { if (taskMemoryManager != null) { assert(taskMemoryManager.cleanUpAllAllocatedMemory() === 0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala index f985386eee292..f6814d8ff8a3d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSparkSubmitSuite.scala @@ -48,6 +48,7 @@ class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite "--conf", "spark.master.rest.enabled=false", "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops", "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", + "--conf", "spark.sql.adaptive.enabled=false", unusedJar.toString) SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 0ea16a1a15d66..06a016fac5300 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -18,12 +18,11 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.{Dataset, QueryTest, Row, SaveMode} -import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} +import org.apache.spark.sql.catalyst.expressions.codegen.{ByteCodeStats, CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec -import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -33,6 +32,19 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession { import testImplicits._ + var originalValue: String = _ + // With on AQE, the WholeStageCodegenExec is added when running QueryStageExec. 
+ override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + test("range/filter should be combined") { val df = spark.range(10).filter("id = 1").selectExpr("id + 1") val plan = df.queryExecution.executedPlan @@ -107,19 +119,6 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession { assert(ds.collect() === Array(0, 6)) } - test("simple typed UDAF should be included in WholeStageCodegen") { - import testImplicits._ - - val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS() - .groupByKey(_._1).agg(typed.sum(_._2)) - - val plan = ds.queryExecution.executedPlan - assert(plan.find(p => - p.isInstanceOf[WholeStageCodegenExec] && - p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined) - assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0))) - } - test("cache for primitive type should be in WholeStageCodegen with InMemoryTableScanExec") { import testImplicits._ @@ -213,10 +212,10 @@ class WholeStageCodegenSuite extends QueryTest with SharedSparkSession { ignore("SPARK-21871 check if we can get large code size when compiling too long functions") { val codeWithShortFunctions = genGroupByCode(3) - val (_, maxCodeSize1) = CodeGenerator.compile(codeWithShortFunctions) + val (_, ByteCodeStats(maxCodeSize1, _, _)) = CodeGenerator.compile(codeWithShortFunctions) assert(maxCodeSize1 < SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) val codeWithLongFunctions = genGroupByCode(50) - val (_, maxCodeSize2) = CodeGenerator.compile(codeWithLongFunctions) + val (_, ByteCodeStats(maxCodeSize2, _, _)) = CodeGenerator.compile(codeWithLongFunctions) assert(maxCodeSize2 > SQLConf.WHOLESTAGE_HUGE_METHOD_LIMIT.defaultValue.get) } @@ -398,4 +397,48 @@ 
class WholeStageCodegenSuite extends QueryTest with SharedSparkSession { }.isDefined, "LocalTableScanExec should be within a WholeStageCodegen domain.") } + + test("Give up splitting aggregate code if a parameter length goes over the limit") { + withSQLConf( + SQLConf.CODEGEN_SPLIT_AGGREGATE_FUNC.key -> "true", + SQLConf.CODEGEN_METHOD_SPLIT_THRESHOLD.key -> "1", + "spark.sql.CodeGenerator.validParamLength" -> "0") { + withTable("t") { + val expectedErrMsg = "Failed to split aggregate code into small functions" + Seq( + // Test case without keys + "SELECT AVG(v) FROM VALUES(1) t(v)", + // Tet case with keys + "SELECT k, AVG(v) FROM VALUES((1, 1)) t(k, v) GROUP BY k").foreach { query => + val errMsg = intercept[IllegalStateException] { + sql(query).collect + }.getMessage + assert(errMsg.contains(expectedErrMsg)) + } + } + } + } + + test("Give up splitting subexpression code if a parameter length goes over the limit") { + withSQLConf( + SQLConf.CODEGEN_SPLIT_AGGREGATE_FUNC.key -> "false", + SQLConf.CODEGEN_METHOD_SPLIT_THRESHOLD.key -> "1", + "spark.sql.CodeGenerator.validParamLength" -> "0") { + withTable("t") { + val expectedErrMsg = "Failed to split subexpression code into small functions" + Seq( + // Test case without keys + "SELECT AVG(a + b), SUM(a + b + c) FROM VALUES((1, 1, 1)) t(a, b, c)", + // Tet case with keys + "SELECT k, AVG(a + b), SUM(a + b + c) FROM VALUES((1, 1, 1, 1)) t(k, a, b, c) " + + "GROUP BY k").foreach { query => + val e = intercept[Exception] { + sql(query).collect + }.getCause + assert(e.isInstanceOf[IllegalStateException]) + assert(e.getMessage.contains(expectedErrMsg)) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala index 55e57a244c030..4edb35ea30fde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala @@ -17,20 +17,44 @@ package org.apache.spark.sql.execution.adaptive +import java.io.File +import java.net.URI + +import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerJobStart} import org.apache.spark.sql.QueryTest import org.apache.spark.sql.execution.{ReusedSubqueryExec, SparkPlan} -import org.apache.spark.sql.execution.adaptive.rule.CoalescedShuffleReaderExec -import org.apache.spark.sql.execution.exchange.Exchange -import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} +import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec} +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BuildRight, SortMergeJoinExec} +import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.Utils + +class AdaptiveQueryExecSuite + extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { -class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { import testImplicits._ setupTestData() private def runAdaptiveAndVerifyResult(query: String): (SparkPlan, SparkPlan) = { + var finalPlanCnt = 0 + val listener = new SparkListener { + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case SparkListenerSQLAdaptiveExecutionUpdate(_, _, sparkPlanInfo) => + if (sparkPlanInfo.simpleString.startsWith( + "AdaptiveSparkPlan(isFinalPlan=true)")) { + finalPlanCnt += 1 + } + case _ => // ignore other events + } + } + } + spark.sparkContext.addSparkListener(listener) + val dfAdaptive = sql(query) val planBefore = dfAdaptive.queryExecution.executedPlan assert(planBefore.toString.startsWith("AdaptiveSparkPlan(isFinalPlan=false)")) @@ -41,6 +65,11 @@ class 
AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { } val planAfter = dfAdaptive.queryExecution.executedPlan assert(planAfter.toString.startsWith("AdaptiveSparkPlan(isFinalPlan=true)")) + + spark.sparkContext.listenerBus.waitUntilEmpty() + assert(finalPlanCnt == 1) + spark.sparkContext.removeSparkListener(listener) + val adaptivePlan = planAfter.asInstanceOf[AdaptiveSparkPlanExec].executedPlan val exchanges = adaptivePlan.collect { case e: Exchange => e @@ -50,34 +79,41 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { } private def findTopLevelBroadcastHashJoin(plan: SparkPlan): Seq[BroadcastHashJoinExec] = { - plan.collect { - case j: BroadcastHashJoinExec => Seq(j) - case s: QueryStageExec => findTopLevelBroadcastHashJoin(s.plan) - }.flatten + collect(plan) { + case j: BroadcastHashJoinExec => j + } } private def findTopLevelSortMergeJoin(plan: SparkPlan): Seq[SortMergeJoinExec] = { - plan.collect { - case j: SortMergeJoinExec => Seq(j) - case s: QueryStageExec => findTopLevelSortMergeJoin(s.plan) - }.flatten + collect(plan) { + case j: SortMergeJoinExec => j + } } - private def findReusedExchange(plan: SparkPlan): Seq[ReusedQueryStageExec] = { - plan.collect { - case e: ReusedQueryStageExec => Seq(e) - case a: AdaptiveSparkPlanExec => findReusedExchange(a.executedPlan) - case s: QueryStageExec => findReusedExchange(s.plan) - case p: SparkPlan => p.subqueries.flatMap(findReusedExchange) - }.flatten + private def findReusedExchange(plan: SparkPlan): Seq[ReusedExchangeExec] = { + collectInPlanAndSubqueries(plan) { + case ShuffleQueryStageExec(_, e: ReusedExchangeExec) => e + case BroadcastQueryStageExec(_, e: ReusedExchangeExec) => e + } } private def findReusedSubquery(plan: SparkPlan): Seq[ReusedSubqueryExec] = { - plan.collect { - case e: ReusedSubqueryExec => Seq(e) - case s: QueryStageExec => findReusedSubquery(s.plan) - case p: SparkPlan => p.subqueries.flatMap(findReusedSubquery) - }.flatten + 
collectInPlanAndSubqueries(plan) { + case e: ReusedSubqueryExec => e + } + } + + private def checkNumLocalShuffleReaders( + plan: SparkPlan, numShufflesWithoutLocalReader: Int = 0): Unit = { + val numShuffles = collect(plan) { + case s: ShuffleQueryStageExec => s + }.length + + val numLocalReaders = collect(plan) { + case reader: LocalShuffleReaderExec => reader + }.length + + assert(numShuffles === (numLocalReaders + numShufflesWithoutLocalReader)) } test("Change merge join to broadcast join") { @@ -90,30 +126,65 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) } } - test("Change merge join to broadcast join and reduce number of shuffle partitions") { + test("Reuse the parallelism of CoalescedShuffleReaderExec in LocalShuffleReaderExec") { withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", - SQLConf.REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key -> "true", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80", - SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "150") { + SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "10") { val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( "SELECT * FROM testData join testData2 ON key = a where value = '1'") val smj = findTopLevelSortMergeJoin(plan) assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) - - val shuffleReaders = adaptivePlan.collect { - case reader: CoalescedShuffleReaderExec => reader + val localReaders = collect(adaptivePlan) { + case reader: LocalShuffleReaderExec => reader } - assert(shuffleReaders.length === 1) + assert(localReaders.length == 2) + val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[LocalShuffledRowRDD] + val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[LocalShuffledRowRDD] + // The pre-shuffle partition size is [0, 0, 0, 72, 0] + // And the 
partitionStartIndices is [0, 3, 4], so advisoryParallelism = 3. + // the final parallelism is + // math.max(1, advisoryParallelism / numMappers): math.max(1, 3/2) = 1 + // and the partitions length is 1 * numMappers = 2 + assert(localShuffleRDD0.getPartitions.length == 2) // The pre-shuffle partition size is [0, 72, 0, 72, 126] - shuffleReaders.foreach { reader => - assert(reader.outputPartitioning.numPartitions === 2) + // And the partitionStartIndices is [0, 1, 2, 3, 4], so advisoryParallelism = 5. + // the final parallelism is + // math.max(1, advisoryParallelism / numMappers): math.max(1, 5/2) = 2 + // and the partitions length is 2 * numMappers = 4 + assert(localShuffleRDD1.getPartitions.length == 4) + } + } + + test("Reuse the default parallelism in LocalShuffleReaderExec") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80", + SQLConf.REDUCE_POST_SHUFFLE_PARTITIONS_ENABLED.key -> "false") { + val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM testData join testData2 ON key = a where value = '1'") + val smj = findTopLevelSortMergeJoin(plan) + assert(smj.size == 1) + val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) + assert(bhj.size == 1) + val localReaders = collect(adaptivePlan) { + case reader: LocalShuffleReaderExec => reader } + assert(localReaders.length == 2) + val localShuffleRDD0 = localReaders(0).execute().asInstanceOf[LocalShuffledRowRDD] + val localShuffleRDD1 = localReaders(1).execute().asInstanceOf[LocalShuffledRowRDD] + // the final parallelism is math.max(1, numReduces / numMappers): math.max(1, 5/2) = 2 + // and the partitions length is 2 * numMappers = 4 + assert(localShuffleRDD0.getPartitions.length == 4) + // the final parallelism is math.max(1, numReduces / numMappers): math.max(1, 5/2) = 2 + // and the partitions length is 2 * numMappers = 4 + assert(localShuffleRDD1.getPartitions.length == 4) } } @@ -128,6 +199,7 @@ class AdaptiveQueryExecSuite 
extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) } } @@ -142,6 +214,8 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + + checkNumLocalShuffleReaders(adaptivePlan) } } @@ -163,6 +237,30 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 3) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 3) + + // A possible resulting query plan: + // BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastExchange + // +-LocalShuffleReader* + // +- ShuffleExchange + + // After applied the 'OptimizeLocalShuffleReader' rule, we can convert all the four + // shuffle reader to local shuffle reader in the bottom two 'BroadcastHashJoin'. + // For the top level 'BroadcastHashJoin', the probe side is not shuffle query stage + // and the build side shuffle query stage is also converted to local shuffle reader. 
+ checkNumLocalShuffleReaders(adaptivePlan) } } @@ -186,6 +284,28 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 3) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 3) + + // A possible resulting query plan: + // BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastExchange + // +-HashAggregate + // +- CoalescedShuffleReader + // +- ShuffleExchange + + // The shuffle added by Aggregate can't apply local reader. + checkNumLocalShuffleReaders(adaptivePlan, 1) } } @@ -209,6 +329,29 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 3) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 3) + + // A possible resulting query plan: + // BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- BroadcastExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- LocalShuffleReader* + // +- ShuffleExchange + // +- BroadcastHashJoin + // +- Filter + // +- HashAggregate + // +- CoalescedShuffleReader + // +- ShuffleExchange + // +- BroadcastExchange + // +-LocalShuffleReader* + // +- ShuffleExchange + + // The shuffle added by Aggregate can't apply local reader. + checkNumLocalShuffleReaders(adaptivePlan, 1) } } @@ -223,6 +366,9 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 3) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 2) + // There is still a SMJ, and its two shuffles can't apply local reader. 
+ checkNumLocalShuffleReaders(adaptivePlan, 2) + // Even with local shuffle reader, the query stage reuse can also work. val ex = findReusedExchange(adaptivePlan) assert(ex.size == 1) } @@ -239,6 +385,8 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) + // Even with local shuffle reader, the query stage reuse can also work. val ex = findReusedExchange(adaptivePlan) assert(ex.size == 1) } @@ -257,6 +405,8 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) + // Even with local shuffle reader, the query stage reuse can also work. val ex = findReusedExchange(adaptivePlan) assert(ex.nonEmpty) val sub = findReusedSubquery(adaptivePlan) @@ -276,6 +426,8 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) + // Even with local shuffle reader, the query stage reuse can also work. val ex = findReusedExchange(adaptivePlan) assert(ex.isEmpty) val sub = findReusedSubquery(adaptivePlan) @@ -298,9 +450,11 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj.size == 1) val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) assert(bhj.size == 1) + checkNumLocalShuffleReaders(adaptivePlan) + // Even with local shuffle reader, the query stage reuse can also work. 
val ex = findReusedExchange(adaptivePlan) assert(ex.nonEmpty) - assert(ex.head.plan.isInstanceOf[BroadcastQueryStageExec]) + assert(ex.head.child.isInstanceOf[BroadcastExchangeExec]) val sub = findReusedSubquery(adaptivePlan) assert(sub.isEmpty) } @@ -357,4 +511,222 @@ class AdaptiveQueryExecSuite extends QueryTest with SharedSparkSession { assert(smj2.size == 2, origPlan.toString) } } + + test("Change merge join to broadcast join without local shuffle reader") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.LOCAL_SHUFFLE_READER_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "40") { + val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( + """ + |SELECT * FROM testData t1 join testData2 t2 + |ON t1.key = t2.a join testData3 t3 on t2.a = t3.a + |where t1.value = 1 + """.stripMargin + ) + val smj = findTopLevelSortMergeJoin(plan) + assert(smj.size == 2) + val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) + assert(bhj.size == 1) + // There is still a SMJ, and its two shuffles can't apply local reader. + checkNumLocalShuffleReaders(adaptivePlan, 2) + } + } + + test("Avoid changing merge join to broadcast join if too many empty partitions on build plan") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.NON_EMPTY_PARTITION_RATIO_FOR_BROADCAST_JOIN.key -> "0.5") { + // `testData` is small enough to be broadcast but has empty partition ratio over the config. + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "80") { + val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM testData join testData2 ON key = a where value = '1'") + val smj = findTopLevelSortMergeJoin(plan) + assert(smj.size == 1) + val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) + assert(bhj.isEmpty) + } + // It is still possible to broadcast `testData2`. 
+ withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "2000") { + val (plan, adaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM testData join testData2 ON key = a where value = '1'") + val smj = findTopLevelSortMergeJoin(plan) + assert(smj.size == 1) + val bhj = findTopLevelBroadcastHashJoin(adaptivePlan) + assert(bhj.size == 1) + assert(bhj.head.buildSide == BuildRight) + } + } + } + + test("SPARK-29906: AQE should not introduce extra shuffle for outermost limit") { + var numStages = 0 + val listener = new SparkListener { + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + numStages = jobStart.stageInfos.length + } + } + try { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + spark.sparkContext.addSparkListener(listener) + spark.range(0, 100, 1, numPartitions = 10).take(1) + spark.sparkContext.listenerBus.waitUntilEmpty() + // Should be only one stage since there is no shuffle. + assert(numStages == 1) + } + } finally { + spark.sparkContext.removeSparkListener(listener) + } + } + + test("SPARK-30524: Do not optimize skew join if introduce additional shuffle") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", + SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { + withTempView("skewData1", "skewData2") { + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 2 as key1", "id as value1") + .createOrReplaceTempView("skewData1") + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 1 as key2", "id as value2") + .createOrReplaceTempView("skewData2") + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT key1 FROM skewData1 join skewData2 ON key1 = key2 group by key1") + // Additional shuffle introduced, so disable the "OptimizeSkewedJoin" optimization + val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + assert(innerSmj.size == 1 && 
!innerSmj.head.isSkewJoin) + } + } + } + + // TODO: we need a way to customize data distribution after shuffle, to improve test coverage + // of this case. + test("SPARK-29544: adaptive skew join with different join types") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1", + SQLConf.ADAPTIVE_EXECUTION_SKEWED_PARTITION_SIZE_THRESHOLD.key -> "100", + SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "700") { + withTempView("skewData1", "skewData2") { + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 2 as key1", "id as value1") + .createOrReplaceTempView("skewData1") + spark + .range(0, 1000, 1, 10) + .selectExpr("id % 1 as key2", "id as value2") + .createOrReplaceTempView("skewData2") + + def checkSkewJoin(joins: Seq[SortMergeJoinExec], expectedNumPartitions: Int): Unit = { + assert(joins.size == 1 && joins.head.isSkewJoin) + assert(joins.head.left.collect { + case r: SkewJoinShuffleReaderExec => r + }.head.partitionSpecs.length == expectedNumPartitions) + assert(joins.head.right.collect { + case r: SkewJoinShuffleReaderExec => r + }.head.partitionSpecs.length == expectedNumPartitions) + } + + // skewed inner join optimization + val (_, innerAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, and divide into 5 splits, so + // 5 x 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, and divide into 5 splits, so + // 5 sub-partitions. + // So total (25 + 1 + 5) partitions. 
+ val innerSmj = findTopLevelSortMergeJoin(innerAdaptivePlan) + checkSkewJoin(innerSmj, 25 + 1 + 5) + + // skewed left outer join optimization + val (_, leftAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 left outer join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, but left join can't split right side, + // so only left side is divided into 5 splits, and thus 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, and divide into 5 splits, so + // 5 sub-partitions. + // So total (5 + 1 + 5) partitions. + val leftSmj = findTopLevelSortMergeJoin(leftAdaptivePlan) + checkSkewJoin(leftSmj, 5 + 1 + 5) + + // skewed right outer join optimization + val (_, rightAdaptivePlan) = runAdaptiveAndVerifyResult( + "SELECT * FROM skewData1 right outer join skewData2 ON key1 = key2") + // left stats: [3496, 0, 0, 0, 4014] + // right stats:[6292, 0, 0, 0, 0] + // Partition 0: both left and right sides are skewed, but right join can't split left side, + // so only right side is divided into 5 splits, and thus 5 sub-partitions. + // Partition 1, 2, 3: not skewed, and coalesced into 1 partition. + // Partition 4: only left side is skewed, but right join can't split left side, so just + // 1 partition. + // So total (5 + 1 + 1) partitions. 
+ val rightSmj = findTopLevelSortMergeJoin(rightAdaptivePlan) + checkSkewJoin(rightSmj, 5 + 1 + 1) + } + } + } + + test("SPARK-30291: AQE should catch the exceptions when doing materialize") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + withTable("bucketed_table") { + val df1 = + (0 until 50).map(i => (i % 5, i % 13, i.toString)).toDF("i", "j", "k").as("df1") + df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") + val warehouseFilePath = new URI(spark.sessionState.conf.warehousePath).getPath + val tableDir = new File(warehouseFilePath, "bucketed_table") + Utils.deleteRecursively(tableDir) + df1.write.parquet(tableDir.getAbsolutePath) + + val agged = spark.table("bucketed_table").groupBy("i").count() + val error = intercept[Exception] { + agged.count() + } + assert(error.getCause().toString contains "Failed to materialize query stage") + } + } + } + + test("SPARK-30403: AQE should handle InSubquery") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + runAdaptiveAndVerifyResult("SELECT * FROM testData LEFT OUTER join testData2" + + " ON key = a AND key NOT IN (select a from testData3) where value = '1'" + ) + } + } + + test("force apply AQE") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true", + SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true") { + val plan = sql("SELECT * FROM testData").queryExecution.executedPlan + assert(plan.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + + test("SPARK-30719: do not log warning if intentionally skip AQE") { + val testAppender = new LogAppender("aqe logging warning test when skip") + withLogAppender(testAppender) { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true") { + val plan = sql("SELECT * FROM testData").queryExecution.executedPlan + assert(!plan.isInstanceOf[AdaptiveSparkPlanExec]) + } + } + assert(!testAppender.loggingEvents + .exists(msg => msg.getRenderedMessage.contains( + s"${SQLConf.ADAPTIVE_EXECUTION_ENABLED.key} 
is" + + s" enabled but is not supported for"))) + } } + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala index dc67446460877..3e47fd4289bef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala @@ -140,7 +140,7 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte } override def getKey(): UnsafeRow = key override def getValue(): UnsafeRow = value - override def close(): Unit = Unit + override def close(): Unit = () } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index 2eb4ac52aca90..fdb23d5be78a1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -1210,15 +1210,13 @@ class ArrowConvertersSuite extends SharedSparkSession { testQuietly("unsupported types") { def runUnsupported(block: => Unit): Unit = { - val msg = intercept[SparkException] { + val msg = intercept[UnsupportedOperationException] { block } - assert(msg.getMessage.contains("Unsupported data type")) - assert(msg.getCause.getClass === classOf[UnsupportedOperationException]) + assert(msg.getMessage.contains("is not supported")) } - runUnsupported { mapData.toDF().toArrowBatchRdd.collect() } - runUnsupported { complexData.toArrowBatchRdd.collect() } + runUnsupported { calenderIntervalData.toDF().toArrowBatchRdd.collect() } } test("test Arrow Validator") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index 92506032ab2e5..bdc3b5eed7d8d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.execution.arrow import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.vectorized.ArrowColumnVector +import org.apache.spark.sql.vectorized._ import org.apache.spark.unsafe.types.UTF8String class ArrowWriterSuite extends SparkFunSuite { @@ -267,4 +267,120 @@ class ArrowWriterSuite extends SparkFunSuite { writer.root.close() } + + test("map") { + val schema = new StructType() + .add("map", MapType(IntegerType, StringType), nullable = true) + val writer = ArrowWriter.create(schema, null) + assert(writer.schema == schema) + + writer.write(InternalRow(ArrayBasedMapData( + keys = Array(1, 2, 3), + values = Array( + UTF8String.fromString("v2"), + UTF8String.fromString("v3"), + UTF8String.fromString("v4") + ) + ))) + writer.write(InternalRow(ArrayBasedMapData(Array(43), + Array(UTF8String.fromString("v5")) + ))) + writer.write(InternalRow(ArrayBasedMapData(Array(43), Array(null)))) + writer.write(InternalRow(null)) + + writer.finish() + + val reader = new ArrowColumnVector(writer.root.getFieldVectors.get(0)) + val map0 = reader.getMap(0) + assert(map0.numElements() == 3) + assert(map0.keyArray().array().mkString(",") == Array(1, 2, 3).mkString(",")) + assert(map0.valueArray().array().mkString(",") == Array("v2", "v3", "v4").mkString(",")) + + val map1 = reader.getMap(1) + assert(map1.numElements() == 1) + assert(map1.keyArray().array().mkString(",") == Array(43).mkString(",")) + assert(map1.valueArray().array().mkString(",") == 
Array("v5").mkString(",")) + + val map2 = reader.getMap(2) + assert(map2.numElements() == 1) + assert(map2.keyArray().array().mkString(",") == Array(43).mkString(",")) + assert(map2.valueArray().array().mkString(",") == Array(null).mkString(",")) + + val map3 = reader.getMap(3) + assert(map3 == null) + writer.root.close() + } + + test("empty map") { + val schema = new StructType() + .add("map", MapType(IntegerType, StringType), nullable = true) + val writer = ArrowWriter.create(schema, null) + assert(writer.schema == schema) + writer.write(InternalRow(ArrayBasedMapData(Array(), Array()))) + writer.finish() + + val reader = new ArrowColumnVector(writer.root.getFieldVectors.get(0)) + + val map0 = reader.getMap(0) + assert(map0.numElements() == 0) + writer.root.close() + } + + test("nested map") { + val valueSchema = new StructType() + .add("name", StringType) + .add("age", IntegerType) + + val schema = new StructType() + .add("map", + MapType( + keyType = IntegerType, + valueType = valueSchema + ), + nullable = true) + val writer = ArrowWriter.create(schema, null) + assert(writer.schema == schema) + + writer.write(InternalRow( + ArrayBasedMapData( + keys = Array(1), + values = Array(InternalRow(UTF8String.fromString("jon"), 20)) + ))) + + writer.write(InternalRow( + ArrayBasedMapData( + keys = Array(1), + values = Array(InternalRow(UTF8String.fromString("alice"), 30)) + ))) + + writer.write(InternalRow( + ArrayBasedMapData( + keys = Array(1), + values = Array(InternalRow(UTF8String.fromString("bob"), 40)) + ))) + + + writer.finish() + + val reader = new ArrowColumnVector(writer.root.getFieldVectors.get(0)) + + def stringRepr(map: ColumnarMap): String = { + map.valueArray().getStruct(0, 2).toSeq(valueSchema).mkString(",") + } + + val map0 = reader.getMap(0) + assert(map0.numElements() == 1) + assert(map0.keyArray().array().mkString(",") == Array(1).mkString(",")) + assert(stringRepr(map0) == Array("jon", "20").mkString(",")) + + val map1 = reader.getMap(1) + 
assert(map1.numElements() == 1) + assert(map1.keyArray().array().mkString(",") == Array(1).mkString(",")) + assert(stringRepr(map1) == Array("alice", "30").mkString(",")) + + val map2 = reader.getMap(2) + assert(map2.numElements() == 1) + assert(map2.keyArray().array().mkString(",") == Array(1).mkString(",")) + assert(stringRepr(map2) == Array("bob", "40").mkString(",")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala index 2776bc310fefe..965d78227c335 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/AggregateBenchmark.scala @@ -48,7 +48,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { runBenchmark("aggregate without grouping") { val N = 500L << 22 codegenBenchmark("agg w/o group", N) { - spark.range(N).selectExpr("sum(id)").collect() + spark.range(N).selectExpr("sum(id)").noop() } } @@ -56,11 +56,11 @@ object AggregateBenchmark extends SqlBasedBenchmark { val N = 100L << 20 codegenBenchmark("stddev", N) { - spark.range(N).groupBy().agg("id" -> "stddev").collect() + spark.range(N).groupBy().agg("id" -> "stddev").noop() } codegenBenchmark("kurtosis", N) { - spark.range(N).groupBy().agg("id" -> "kurtosis").collect() + spark.range(N).groupBy().agg("id" -> "kurtosis").noop() } } @@ -70,7 +70,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Aggregate w keys", N, output = output) def f(): Unit = { - spark.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().collect() + spark.range(N).selectExpr("(id & 65535) as k").groupBy("k").sum().noop() } benchmark.addCase("codegen = F", numIters = 2) { _ => @@ -107,7 +107,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { spark.range(N).selectExpr("id", "floor(rand() * 10000) as k") 
.createOrReplaceTempView("test") - def f(): Unit = spark.sql("select k, k, sum(id) from test group by k, k").collect() + def f(): Unit = spark.sql("select k, k, sum(id) from test group by k, k").noop() benchmark.addCase("codegen = F", numIters = 2) { _ => withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { @@ -142,7 +142,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Aggregate w string key", N, output = output) def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 1023 as string) as k") - .groupBy("k").count().collect() + .groupBy("k").count().noop() benchmark.addCase("codegen = F", numIters = 2) { _ => withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { @@ -177,7 +177,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("Aggregate w decimal key", N, output = output) def f(): Unit = spark.range(N).selectExpr("id", "cast(id & 65535 as decimal) as k") - .groupBy("k").count().collect() + .groupBy("k").count().noop() benchmark.addCase("codegen = F") { _ => withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { @@ -222,7 +222,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { "id > 1023 as k6") .groupBy("k1", "k2", "k3", "k4", "k5", "k6") .sum() - .collect() + .noop() benchmark.addCase("codegen = F") { _ => withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { @@ -282,7 +282,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { "case when id > 1800 and id <= 1900 then 1 else 0 end as v18") .groupBy("k1", "k2", "k3") .sum() - .collect() + .noop() benchmark.addCase("codegen = F") { _ => withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") { @@ -315,7 +315,7 @@ object AggregateBenchmark extends SqlBasedBenchmark { codegenBenchmark("cube", N) { spark.range(N).selectExpr("id", "id % 1000 as k1", "id & 256 as k2") - .cube("k1", "k2").sum("id").collect() + .cube("k1", "k2").sum("id").noop() } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala index f727ebcf3fd1e..ae241b3625d02 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala @@ -70,10 +70,10 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { runBenchmark(s"ORC Read") { val benchmark = new Benchmark(s"Read a row from ${scaleFactor}M rows", N, output = output) benchmark.addCase("Without bloom filter") { _ => - spark.read.orc(path + "/withoutBF").where("value = 0").count + spark.read.orc(path + "/withoutBF").where("value = 0").noop() } benchmark.addCase("With bloom filter") { _ => - spark.read.orc(path + "/withBF").where("value = 0").count + spark.read.orc(path + "/withBF").where("value = 0").noop() } benchmark.run() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala index bd2470ee20660..a084bec985510 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceReadBenchmark.scala @@ -22,11 +22,10 @@ import scala.collection.JavaConverters._ import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.execution.datasources.parquet.{SpecificParquetRecordReaderBase, 
VectorizedParquetRecordReader} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -44,21 +43,26 @@ import org.apache.spark.sql.vectorized.ColumnVector * Results will be written to "benchmarks/DataSourceReadBenchmark-results.txt". * }}} */ -object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { - val conf = new SparkConf() - .setAppName("DataSourceReadBenchmark") - // Since `spark.master` always exists, overrides this value - .set("spark.master", "local[1]") - .setIfMissing("spark.driver.memory", "3g") - .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) - - val spark = SparkSession.builder.config(conf).getOrCreate() - - // Set default configs. Individual cases will change them if necessary. - spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") - spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") - spark.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") +object DataSourceReadBenchmark extends SqlBasedBenchmark { + + override def getSparkSession: SparkSession = { + val conf = new SparkConf() + .setAppName("DataSourceReadBenchmark") + // Since `spark.master` always exists, overrides this value + .set("spark.master", "local[1]") + .setIfMissing("spark.driver.memory", "3g") + .setIfMissing("spark.executor.memory", "3g") + .setIfMissing(UI_ENABLED, false) + + val sparkSession = SparkSession.builder.config(conf).getOrCreate() + + // Set default configs. Individual cases will change them if necessary. 
+ sparkSession.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + sparkSession.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") + sparkSession.conf.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true") + + sparkSession + } def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) @@ -118,30 +122,30 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) sqlBenchmark.addCase("SQL CSV") { _ => - spark.sql("select sum(id) from csvTable").collect() + spark.sql("select sum(id) from csvTable").noop() } sqlBenchmark.addCase("SQL Json") { _ => - spark.sql("select sum(id) from jsonTable").collect() + spark.sql("select sum(id) from jsonTable").noop() } sqlBenchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(id) from parquetTable").collect() + spark.sql("select sum(id) from parquetTable").noop() } sqlBenchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(id) from parquetTable").collect() + spark.sql("select sum(id) from parquetTable").noop() } } sqlBenchmark.addCase("SQL ORC Vectorized") { _ => - spark.sql("SELECT sum(id) FROM orcTable").collect() + spark.sql("SELECT sum(id) FROM orcTable").noop() } sqlBenchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(id) FROM orcTable").collect() + spark.sql("SELECT sum(id) FROM orcTable").noop() } } @@ -234,30 +238,30 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) benchmark.addCase("SQL CSV") { _ => - spark.sql("select sum(c1), sum(length(c2)) from csvTable").collect() + spark.sql("select sum(c1), sum(length(c2)) from csvTable").noop() } benchmark.addCase("SQL Json") { 
_ => - spark.sql("select sum(c1), sum(length(c2)) from jsonTable").collect() + spark.sql("select sum(c1), sum(length(c2)) from jsonTable").noop() } benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(c1), sum(length(c2)) from parquetTable").collect() + spark.sql("select sum(c1), sum(length(c2)) from parquetTable").noop() } benchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(c1), sum(length(c2)) from parquetTable").collect() + spark.sql("select sum(c1), sum(length(c2)) from parquetTable").noop() } } benchmark.addCase("SQL ORC Vectorized") { _ => - spark.sql("SELECT sum(c1), sum(length(c2)) FROM orcTable").collect() + spark.sql("SELECT sum(c1), sum(length(c2)) FROM orcTable").noop() } benchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(c1), sum(length(c2)) FROM orcTable").collect() + spark.sql("SELECT sum(c1), sum(length(c2)) FROM orcTable").noop() } } @@ -279,30 +283,30 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { spark.sql("select cast((value % 200) + 10000 as STRING) as c1 from t1")) benchmark.addCase("SQL CSV") { _ => - spark.sql("select sum(length(c1)) from csvTable").collect() + spark.sql("select sum(length(c1)) from csvTable").noop() } benchmark.addCase("SQL Json") { _ => - spark.sql("select sum(length(c1)) from jsonTable").collect() + spark.sql("select sum(length(c1)) from jsonTable").noop() } benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql("select sum(length(c1)) from parquetTable").collect() + spark.sql("select sum(length(c1)) from parquetTable").noop() } benchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(length(c1)) from parquetTable").collect() + spark.sql("select sum(length(c1)) from parquetTable").noop() } } benchmark.addCase("SQL ORC 
Vectorized") { _ => - spark.sql("select sum(length(c1)) from orcTable").collect() + spark.sql("select sum(length(c1)) from orcTable").noop() } benchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(length(c1)) from orcTable").collect() + spark.sql("select sum(length(c1)) from orcTable").noop() } } @@ -322,86 +326,86 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) benchmark.addCase("Data column - CSV") { _ => - spark.sql("select sum(id) from csvTable").collect() + spark.sql("select sum(id) from csvTable").noop() } benchmark.addCase("Data column - Json") { _ => - spark.sql("select sum(id) from jsonTable").collect() + spark.sql("select sum(id) from jsonTable").noop() } benchmark.addCase("Data column - Parquet Vectorized") { _ => - spark.sql("select sum(id) from parquetTable").collect() + spark.sql("select sum(id) from parquetTable").noop() } benchmark.addCase("Data column - Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(id) from parquetTable").collect() + spark.sql("select sum(id) from parquetTable").noop() } } benchmark.addCase("Data column - ORC Vectorized") { _ => - spark.sql("SELECT sum(id) FROM orcTable").collect() + spark.sql("SELECT sum(id) FROM orcTable").noop() } benchmark.addCase("Data column - ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(id) FROM orcTable").collect() + spark.sql("SELECT sum(id) FROM orcTable").noop() } } benchmark.addCase("Partition column - CSV") { _ => - spark.sql("select sum(p) from csvTable").collect() + spark.sql("select sum(p) from csvTable").noop() } benchmark.addCase("Partition column - Json") { _ => - spark.sql("select sum(p) from jsonTable").collect() + spark.sql("select sum(p) from jsonTable").noop() } 
benchmark.addCase("Partition column - Parquet Vectorized") { _ => - spark.sql("select sum(p) from parquetTable").collect() + spark.sql("select sum(p) from parquetTable").noop() } benchmark.addCase("Partition column - Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(p) from parquetTable").collect() + spark.sql("select sum(p) from parquetTable").noop() } } benchmark.addCase("Partition column - ORC Vectorized") { _ => - spark.sql("SELECT sum(p) FROM orcTable").collect() + spark.sql("SELECT sum(p) FROM orcTable").noop() } benchmark.addCase("Partition column - ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(p) FROM orcTable").collect() + spark.sql("SELECT sum(p) FROM orcTable").noop() } } benchmark.addCase("Both columns - CSV") { _ => - spark.sql("select sum(p), sum(id) from csvTable").collect() + spark.sql("select sum(p), sum(id) from csvTable").noop() } benchmark.addCase("Both columns - Json") { _ => - spark.sql("select sum(p), sum(id) from jsonTable").collect() + spark.sql("select sum(p), sum(id) from jsonTable").noop() } benchmark.addCase("Both columns - Parquet Vectorized") { _ => - spark.sql("select sum(p), sum(id) from parquetTable").collect() + spark.sql("select sum(p), sum(id) from parquetTable").noop() } benchmark.addCase("Both columns - Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("select sum(p), sum(id) from parquetTable").collect + spark.sql("select sum(p), sum(id) from parquetTable").noop() } } benchmark.addCase("Both columns - ORC Vectorized") { _ => - spark.sql("SELECT sum(p), sum(id) FROM orcTable").collect() + spark.sql("SELECT sum(p), sum(id) FROM orcTable").noop() } benchmark.addCase("Both columns - ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(p), sum(id) FROM orcTable").collect() + spark.sql("SELECT 
sum(p), sum(id) FROM orcTable").noop() } } @@ -427,23 +431,23 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("SQL CSV") { _ => spark.sql("select sum(length(c2)) from csvTable where c1 is " + - "not NULL and c2 is not NULL").collect() + "not NULL and c2 is not NULL").noop() } benchmark.addCase("SQL Json") { _ => spark.sql("select sum(length(c2)) from jsonTable where c1 is " + - "not NULL and c2 is not NULL").collect() + "not NULL and c2 is not NULL").noop() } benchmark.addCase("SQL Parquet Vectorized") { _ => spark.sql("select sum(length(c2)) from parquetTable where c1 is " + - "not NULL and c2 is not NULL").collect() + "not NULL and c2 is not NULL").noop() } benchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("select sum(length(c2)) from parquetTable where c1 is " + - "not NULL and c2 is not NULL").collect() + "not NULL and c2 is not NULL").noop() } } @@ -474,13 +478,13 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("SQL ORC Vectorized") { _ => spark.sql("SELECT SUM(LENGTH(c2)) FROM orcTable " + - "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop() } benchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT SUM(LENGTH(c2)) FROM orcTable " + - "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop() } } @@ -506,30 +510,30 @@ object DataSourceReadBenchmark extends BenchmarkBase with SQLHelper { prepareTable(dir, spark.sql("SELECT * FROM t1")) benchmark.addCase("SQL CSV") { _ => - spark.sql(s"SELECT sum(c$middle) FROM csvTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM csvTable").noop() } benchmark.addCase("SQL Json") { _ => - spark.sql(s"SELECT sum(c$middle) FROM jsonTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM 
jsonTable").noop() } benchmark.addCase("SQL Parquet Vectorized") { _ => - spark.sql(s"SELECT sum(c$middle) FROM parquetTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM parquetTable").noop() } benchmark.addCase("SQL Parquet MR") { _ => withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql(s"SELECT sum(c$middle) FROM parquetTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM parquetTable").noop() } } benchmark.addCase("SQL ORC Vectorized") { _ => - spark.sql(s"SELECT sum(c$middle) FROM orcTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM orcTable").noop() } benchmark.addCase("SQL ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql(s"SELECT sum(c$middle) FROM orcTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM orcTable").noop() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index df0f87e483cdc..086583fdafe6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.benchmark import java.sql.Timestamp import org.apache.spark.benchmark.Benchmark -import org.apache.spark.sql.internal.SQLConf /** * Synthetic benchmark for date and timestamp functions. 
@@ -36,7 +35,9 @@ import org.apache.spark.sql.internal.SQLConf */ object DateTimeBenchmark extends SqlBasedBenchmark { private def doBenchmark(cardinality: Int, exprs: String*): Unit = { - spark.range(cardinality).selectExpr(exprs: _*).write.format("noop").save() + spark.range(cardinality) + .selectExpr(exprs: _*) + .noop() } private def run(cardinality: Int, name: String, exprs: String*): Unit = { @@ -89,11 +90,9 @@ object DateTimeBenchmark extends SqlBasedBenchmark { run(N, "from_unixtime", "from_unixtime(id, 'yyyy-MM-dd HH:mm:ss.SSSSSS')") } runBenchmark("Convert timestamps") { - withSQLConf(SQLConf.UTC_TIMESTAMP_FUNC_ENABLED.key -> "true") { - val timestampExpr = "cast(id as timestamp)" - run(N, "from_utc_timestamp", s"from_utc_timestamp($timestampExpr, 'CET')") - run(N, "to_utc_timestamp", s"to_utc_timestamp($timestampExpr, 'CET')") - } + val timestampExpr = "cast(id as timestamp)" + run(N, "from_utc_timestamp", s"from_utc_timestamp($timestampExpr, 'CET')") + run(N, "to_utc_timestamp", s"to_utc_timestamp($timestampExpr, 'CET')") } runBenchmark("Intervals") { val (start, end) = ("cast(id as timestamp)", "cast((id+8640000) as timestamp)") @@ -132,7 +131,7 @@ object DateTimeBenchmark extends SqlBasedBenchmark { benchmark.addCase("From java.sql.Timestamp", numIters) { _ => spark.range(rowsNum) .map(millis => new Timestamp(millis)) - .write.format("noop").save() + .noop() } benchmark.addCase("Collect longs", numIters) { _ => spark.range(0, rowsNum, 1, 1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala index dbbad43efa08c..de23132284dc8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ExtractBenchmark.scala @@ -19,6 +19,9 @@ package org.apache.spark.sql.execution.benchmark import java.time.Instant +import 
org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.internal.SQLConf + /** * Synthetic benchmark for the extract function. * To run this benchmark: @@ -32,51 +35,83 @@ import java.time.Instant * }}} */ object ExtractBenchmark extends SqlBasedBenchmark { + private def doBenchmark(cardinality: Long, exprs: String*): Unit = { val sinceSecond = Instant.parse("2010-01-01T00:00:00Z").getEpochSecond - spark - .range(sinceSecond, sinceSecond + cardinality, 1, 1) - .selectExpr(exprs: _*) - .write - .format("noop") - .save() + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + spark + .range(sinceSecond, sinceSecond + cardinality, 1, 1) + .selectExpr(exprs: _*) + .noop() + } } - private def run(cardinality: Long, name: String, exprs: String*): Unit = { - codegenBenchmark(name, cardinality) { + private def run( + benchmark: Benchmark, + cardinality: Long, + name: String, + exprs: String*): Unit = { + benchmark.addCase(name, numIters = 3) { _ => doBenchmark(cardinality, exprs: _*) } } - private def run(cardinality: Long, field: String): Unit = { - codegenBenchmark(s"$field of timestamp", cardinality) { - doBenchmark(cardinality, s"EXTRACT($field FROM (cast(id as timestamp)))") + private def castExpr(from: String): String = from match { + case "timestamp" => "cast(id as timestamp)" + case "date" => "cast(cast(id as timestamp) as date)" + case "interval" => "(cast(cast(id as timestamp) as date) - date'0001-01-01') + " + + "(cast(id as timestamp) - timestamp'1000-01-01 01:02:03.123456')" + case other => throw new IllegalArgumentException( + s"Unsupported column type $other. 
Valid column types are 'timestamp' and 'date'") + } + + private def run( + benchmark: Benchmark, + func: String, + cardinality: Long, + field: String, + from: String): Unit = { + val expr = func match { + case "extract" => s"EXTRACT($field FROM ${castExpr(from)}) AS $field" + case "date_part" => s"DATE_PART('$field', ${castExpr(from)}) AS $field" + case other => throw new IllegalArgumentException( + s"Unsupported function '$other'. Valid functions are 'extract' and 'date_part'.") + } + benchmark.addCase(s"$field of $from", numIters = 3) { _ => + doBenchmark(cardinality, expr) } } + private case class Settings(fields: Seq[String], func: Seq[String], iterNum: Long) + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { val N = 10000000L - runBenchmark("Extract") { - run(N, "cast to timestamp", "cast(id as timestamp)") - run(N, "MILLENNIUM") - run(N, "CENTURY") - run(N, "DECADE") - run(N, "YEAR") - run(N, "ISOYEAR") - run(N, "QUARTER") - run(N, "MONTH") - run(N, "WEEK") - run(N, "DAY") - run(N, "DAYOFWEEK") - run(N, "DOW") - run(N, "ISODOW") - run(N, "DOY") - run(N, "HOUR") - run(N, "MINUTE") - run(N, "SECOND") - run(N, "MILLISECONDS") - run(N, "MICROSECONDS") - run(N, "EPOCH") + val datetimeFields = Seq( + "MILLENNIUM", "CENTURY", "DECADE", "YEAR", + "ISOYEAR", "QUARTER", "MONTH", "WEEK", + "DAY", "DAYOFWEEK", "DOW", "ISODOW", + "DOY", "HOUR", "MINUTE", "SECOND", + "MILLISECONDS", "MICROSECONDS", "EPOCH") + val intervalFields = Seq( + "MILLENNIUM", "CENTURY", "DECADE", "YEAR", + "QUARTER", "MONTH", "DAY", + "HOUR", "MINUTE", "SECOND", + "MILLISECONDS", "MICROSECONDS", "EPOCH") + val settings = Map( + "timestamp" -> Settings(datetimeFields, Seq("extract", "date_part"), N), + "date" -> Settings(datetimeFields, Seq("extract", "date_part"), N), + "interval" -> Settings(intervalFields, Seq("date_part"), N)) + + for { + (dataType, Settings(fields, funcs, iterNum)) <- settings + func <- funcs} { + + val benchmark = new Benchmark(s"Invoke $func for $dataType", 
N, output = output) + + run(benchmark, iterNum, s"cast to $dataType", castExpr(dataType)) + fields.foreach(run(benchmark, func, iterNum, _, dataType)) + + benchmark.run() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala index b040243717137..444ffa4f99697 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/FilterPushdownBenchmark.scala @@ -22,10 +22,9 @@ import java.io.File import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.config.UI._ import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.functions.monotonically_increasing_id import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType @@ -41,17 +40,21 @@ import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType, TimestampType * Results will be written to "benchmarks/FilterPushdownBenchmark-results.txt". 
* }}} */ -object FilterPushdownBenchmark extends BenchmarkBase with SQLHelper { - - private val conf = new SparkConf() - .setAppName(this.getClass.getSimpleName) - // Since `spark.master` always exists, overrides this value - .set("spark.master", "local[1]") - .setIfMissing("spark.driver.memory", "3g") - .setIfMissing("spark.executor.memory", "3g") - .setIfMissing(UI_ENABLED, false) - .setIfMissing("orc.compression", "snappy") - .setIfMissing("spark.sql.parquet.compression.codec", "snappy") +object FilterPushdownBenchmark extends SqlBasedBenchmark { + + override def getSparkSession: SparkSession = { + val conf = new SparkConf() + .setAppName(this.getClass.getSimpleName) + // Since `spark.master` always exists, overrides this value + .set("spark.master", "local[1]") + .setIfMissing("spark.driver.memory", "3g") + .setIfMissing("spark.executor.memory", "3g") + .setIfMissing(UI_ENABLED, false) + .setIfMissing("orc.compression", "snappy") + .setIfMissing("spark.sql.parquet.compression.codec", "snappy") + + SparkSession.builder().config(conf).getOrCreate() + } private val numRows = 1024 * 1024 * 15 private val width = 5 @@ -59,8 +62,6 @@ object FilterPushdownBenchmark extends BenchmarkBase with SQLHelper { // For Parquet/ORC, we will use the same value for block size and compression size private val blockSize = org.apache.parquet.hadoop.ParquetWriter.DEFAULT_PAGE_SIZE - private val spark = SparkSession.builder().config(conf).getOrCreate() - def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) } @@ -118,7 +119,7 @@ object FilterPushdownBenchmark extends BenchmarkBase with SQLHelper { val name = s"Parquet Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" benchmark.addCase(name) { _ => withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { - spark.sql(s"SELECT $selectExpr FROM parquetTable WHERE $whereExpr").collect() + spark.sql(s"SELECT $selectExpr FROM parquetTable 
WHERE $whereExpr").noop() } } } @@ -127,7 +128,7 @@ object FilterPushdownBenchmark extends BenchmarkBase with SQLHelper { val name = s"Native ORC Vectorized ${if (pushDownEnabled) s"(Pushdown)" else ""}" benchmark.addCase(name) { _ => withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> s"$pushDownEnabled") { - spark.sql(s"SELECT $selectExpr FROM orcTable WHERE $whereExpr").collect() + spark.sql(s"SELECT $selectExpr FROM orcTable WHERE $whereExpr").noop() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/HashedRelationMetricsBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/HashedRelationMetricsBenchmark.scala index ebe278bff7d86..f3647b3bb2631 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/HashedRelationMetricsBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/HashedRelationMetricsBenchmark.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.benchmark +import org.scalatest.Assertions._ + import org.apache.spark.SparkConf import org.apache.spark.benchmark.Benchmark import org.apache.spark.internal.config.MEMORY_OFFHEAP_ENABLED @@ -71,7 +73,7 @@ object HashedRelationMetricsBenchmark extends SqlBasedBenchmark { thread.start() thread } - threads.map(_.join()) + threads.foreach(_.join()) map.free() } benchmark.run() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala index 611f582b66605..caf3387875813 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/InExpressionBenchmark.scala @@ -167,7 +167,7 @@ object InExpressionBenchmark extends SqlBasedBenchmark { def testClosure(): Unit = { val df = spark.sql(s"SELECT * FROM t WHERE id IN 
(${values.mkString(",")})") - df.queryExecution.toRdd.foreach(_ => Unit) + df.noop() } benchmark.addCase("In expression") { _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala new file mode 100644 index 0000000000000..94e763459a111 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/IntervalBenchmark.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import scala.collection.mutable.ListBuffer + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for interval functions. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "sql/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/IntervalBenchmark-results.txt". 
+ * }}} + */ +object IntervalBenchmark extends SqlBasedBenchmark { + import spark.implicits._ + + private def doBenchmark(cardinality: Long, exprs: Column*): Unit = { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + spark + .range(0, cardinality, 1, 1) + .select(exprs: _*) + .noop() + } + } + + private def addCase( + benchmark: Benchmark, + cardinality: Long, + name: String, + exprs: Column*): Unit = { + benchmark.addCase(name, numIters = 3) { _ => + doBenchmark(cardinality, exprs: _*) + } + } + + private def buildString(withPrefix: Boolean, units: Seq[String] = Seq.empty): Column = { + val init = lit(if (withPrefix) "interval" else "") :: + ($"id" % 10000).cast("string") :: + lit("years") :: Nil + + concat_ws(" ", (init ++ units.map(lit)): _*) + } + + private def addCase(benchmark: Benchmark, cardinality: Long, units: Seq[String]): Unit = { + Seq(true, false).foreach { withPrefix => + val expr = buildString(withPrefix, units).cast("interval") + val note = if (withPrefix) "w/ interval" else "w/o interval" + benchmark.addCase(s"${units.length + 1} units $note", numIters = 3) { _ => + doBenchmark(cardinality, expr) + } + } + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val N = 1000000 + val timeUnits = Seq( + "13 months", " 1 months", + "100 weeks", "9 days", "12 hours", "- 3 hours", + "5 minutes", "45 seconds", "123 milliseconds", "567 microseconds") + val intervalToTest = ListBuffer[String]() + + val benchmark = new Benchmark("cast strings to intervals", N, output = output) + // The first 2 cases are used to show the overhead of preparing the interval string. 
+ addCase(benchmark, N, "prepare string w/ interval", buildString(true, timeUnits)) + addCase(benchmark, N, "prepare string w/o interval", buildString(false, timeUnits)) + addCase(benchmark, N, intervalToTest) // Only years + + for (unit <- timeUnits) { + intervalToTest.append(unit) + addCase(benchmark, N, intervalToTest) + } + + benchmark.run() + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala index ad81711a13947..1cc92892fe122 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.benchmark +import org.scalatest.Assertions._ + import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -44,7 +46,7 @@ object JoinBenchmark extends SqlBasedBenchmark { codegenBenchmark("Join w long", N) { val df = spark.range(N).join(dim, (col("id") % M) === col("k")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -55,7 +57,7 @@ object JoinBenchmark extends SqlBasedBenchmark { codegenBenchmark("Join w long duplicated", N) { val df = spark.range(N).join(dim, (col("id") % M) === col("k")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -70,7 +72,7 @@ object JoinBenchmark extends SqlBasedBenchmark { (col("id") % M).cast(IntegerType) === col("k1") && (col("id") % M).cast(IntegerType) === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -84,7 +86,7 @@ object JoinBenchmark extends SqlBasedBenchmark { val df = spark.range(N).join(dim3, (col("id") % M) === 
col("k1") && (col("id") % M) === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -98,7 +100,7 @@ object JoinBenchmark extends SqlBasedBenchmark { val df = spark.range(N).join(dim4, (col("id") bitwiseAND M) === col("k1") && (col("id") bitwiseAND M) === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -109,7 +111,7 @@ object JoinBenchmark extends SqlBasedBenchmark { codegenBenchmark("outer join w long", N) { val df = spark.range(N).join(dim, (col("id") % M) === col("k"), "left") assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -120,7 +122,7 @@ object JoinBenchmark extends SqlBasedBenchmark { codegenBenchmark("semi join w long", N) { val df = spark.range(N).join(dim, (col("id") % M) === col("k"), "leftsemi") assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) - df.count() + df.noop() } } @@ -131,7 +133,7 @@ object JoinBenchmark extends SqlBasedBenchmark { val df2 = spark.range(N).selectExpr(s"id * 3 as k2") val df = df1.join(df2, col("k1") === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[SortMergeJoinExec]).isDefined) - df.count() + df.noop() } } @@ -144,7 +146,7 @@ object JoinBenchmark extends SqlBasedBenchmark { .selectExpr(s"(id * 15485867) % ${N*10} as k2") val df = df1.join(df2, col("k1") === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[SortMergeJoinExec]).isDefined) - df.count() + df.noop() } } @@ -159,7 +161,7 @@ object JoinBenchmark extends SqlBasedBenchmark { val df2 = spark.range(N / 3).selectExpr(s"id * 3 as k2") val df = df1.join(df2, col("k1") === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[ShuffledHashJoinExec]).isDefined) - df.count() + df.noop() } } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MakeDateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MakeDateTimeBenchmark.scala new file mode 100644 index 0000000000000..c92098c93aa1e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MakeDateTimeBenchmark.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.benchmark + +import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.internal.SQLConf + +/** + * Synthetic benchmark for the make_date() and make_timestamp() functions. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class --jars + * 2. build/sbt "sql/test:runMain " + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " + * Results will be written to "benchmarks/MakeDateTimeBenchmark-results.txt". 
+ * }}} + */ +object MakeDateTimeBenchmark extends SqlBasedBenchmark { + + private def doBenchmark(cardinality: Long, exprs: String*): Unit = { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") { + spark + .range(0, cardinality, 1, 1) + .selectExpr(exprs: _*) + .noop() + } + } + + private def run(benchmark: Benchmark, cardinality: Long, name: String, exprs: String*): Unit = { + benchmark.addCase(name, numIters = 3) { _ => doBenchmark(cardinality, exprs: _*) } + } + + private val ymdExprs = Seq("(2000 + (id % 30))", "((id % 12) + 1)", "((id % 27) + 1)") + + private def benchmarkMakeDate(cardinality: Long): Unit = { + val benchmark = new Benchmark("make_date()", cardinality, output = output) + val args = ymdExprs + + run(benchmark, cardinality, "prepare make_date()", args: _*) + val foldableExpr = "make_date(2019, 9, 16)" + run(benchmark, cardinality, foldableExpr, foldableExpr) + run( + benchmark, + cardinality, + "make_date(*, *, *)", + "make_date" + args.mkString("(", ",", ")")) + + benchmark.run() + } + + private def benchmarkMakeTimestamp(cardinality: Long): Unit = { + val benchmark = new Benchmark("make_timestamp()", cardinality, output = output) + val hmExprs = Seq("id % 24", "id % 60") + val hmsExprs = hmExprs ++ Seq("cast((id % 60000000) / 1000000.0 as decimal(8, 6))") + val args = ymdExprs ++ hmsExprs + + run( + benchmark, + cardinality, + "prepare make_timestamp()", + args: _*) + var foldableExpr = "make_timestamp(2019, 1, 2, 3, 4, 50.123456)" + run(benchmark, cardinality, foldableExpr, foldableExpr) + foldableExpr = "make_timestamp(2019, 1, 2, 3, 4, 60.000000)" + run(benchmark, cardinality, foldableExpr, foldableExpr) + foldableExpr = "make_timestamp(2019, 12, 31, 23, 59, 60.00)" + run(benchmark, cardinality, foldableExpr, foldableExpr) + run( + benchmark, + cardinality, + "make_timestamp(*, *, *, 3, 4, 50.123456)", + s"make_timestamp(${ymdExprs.mkString(",")}, 3, 4, 50.123456)") + run( + benchmark, + cardinality, + "make_timestamp(*, *, *, 
*, *, 0)", + s"make_timestamp(" + (ymdExprs ++ hmExprs).mkString(", ") + ", 0)") + run( + benchmark, + cardinality, + "make_timestamp(*, *, *, *, *, 60.0)", + s"make_timestamp(" + (ymdExprs ++ hmExprs).mkString(", ") + ", 60.0)") + run( + benchmark, + cardinality, + "make_timestamp(2019, 1, 2, *, *, *)", + s"make_timestamp(2019, 1, 2, ${hmsExprs.mkString(",")})") + run( + benchmark, + cardinality, + "make_timestamp(*, *, *, *, *, *)", + s"make_timestamp" + args.mkString("(", ", ", ")")) + + benchmark.run() + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + benchmarkMakeDate(100000000L) + benchmarkMakeTimestamp(1000000L) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala index bafc0337bdc0e..2aecf553d75a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/MiscBenchmark.scala @@ -35,7 +35,7 @@ object MiscBenchmark extends SqlBasedBenchmark { def filterAndAggregateWithoutGroup(numRows: Long): Unit = { runBenchmark("filter & aggregate without group") { codegenBenchmark("range/filter/sum", numRows) { - spark.range(numRows).filter("(id & 1) = 1").groupBy().sum().collect() + spark.range(numRows).filter("(id & 1) = 1").groupBy().sum().noop() } } } @@ -43,7 +43,7 @@ object MiscBenchmark extends SqlBasedBenchmark { def limitAndAggregateWithoutGroup(numRows: Long): Unit = { runBenchmark("range/limit/sum") { codegenBenchmark("range/limit/sum", numRows) { - spark.range(numRows).limit(1000000).groupBy().sum().collect() + spark.range(numRows).limit(1000000).groupBy().sum().noop() } } } @@ -51,11 +51,11 @@ object MiscBenchmark extends SqlBasedBenchmark { def sample(numRows: Int): Unit = { runBenchmark("sample") { codegenBenchmark("sample with replacement", numRows) { - 
spark.range(numRows).sample(withReplacement = true, 0.01).groupBy().sum().collect() + spark.range(numRows).sample(withReplacement = true, 0.01).groupBy().sum().noop() } codegenBenchmark("sample without replacement", numRows) { - spark.range(numRows).sample(withReplacement = false, 0.01).groupBy().sum().collect() + spark.range(numRows).sample(withReplacement = false, 0.01).groupBy().sum().noop() } } } @@ -95,28 +95,28 @@ object MiscBenchmark extends SqlBasedBenchmark { val df = spark.range(numRows).selectExpr( "id as key", "array(rand(), rand(), rand(), rand(), rand()) as values") - df.selectExpr("key", "explode(values) value").count() + df.selectExpr("key", "explode(values) value").noop() } codegenBenchmark("generate explode map", numRows) { val df = spark.range(numRows).selectExpr( "id as key", "map('a', rand(), 'b', rand(), 'c', rand(), 'd', rand(), 'e', rand()) pairs") - df.selectExpr("key", "explode(pairs) as (k, v)").count() + df.selectExpr("key", "explode(pairs) as (k, v)").noop() } codegenBenchmark("generate posexplode array", numRows) { val df = spark.range(numRows).selectExpr( "id as key", "array(rand(), rand(), rand(), rand(), rand()) as values") - df.selectExpr("key", "posexplode(values) as (idx, value)").count() + df.selectExpr("key", "posexplode(values) as (idx, value)").noop() } codegenBenchmark("generate inline array", numRows) { val df = spark.range(numRows).selectExpr( "id as key", "array((rand(), rand()), (rand(), rand()), (rand(), 0.0d)) as values") - df.selectExpr("key", "inline(values) as (r1, r2)").count() + df.selectExpr("key", "inline(values) as (r1, r2)").noop() } val M = 60000 @@ -129,7 +129,7 @@ object MiscBenchmark extends SqlBasedBenchmark { })))).toDF("col", "arr") df.selectExpr("*", "explode(arr) as arr_col") - .select("col", "arr_col.*").count + .select("col", "arr_col.*").noop() } withSQLConf(SQLConf.NESTED_PRUNING_ON_EXPRESSIONS.key -> "true") { @@ -142,7 +142,7 @@ object MiscBenchmark extends SqlBasedBenchmark { })))).toDF("col", 
"arr") .selectExpr("col", "struct(col, arr) as st") .selectExpr("col", "st.col as col1", "explode(st.arr) as arr_col") - df.collect() + df.noop() } } } @@ -158,7 +158,7 @@ object MiscBenchmark extends SqlBasedBenchmark { "id % 5 as t3", "id % 7 as t4", "id % 13 as t5") - df.selectExpr("key", "stack(4, t1, t2, t3, t4, t5)").count() + df.selectExpr("key", "stack(4, t1, t2, t3, t4, t5)").noop() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala index 96f90f29707d2..90fad7f36b862 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/NestedSchemaPruningBenchmark.scala @@ -35,7 +35,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { // We use `col1 BIGINT, col2 STRUCT<_1: BIGINT, _2: STRING>, // col3 ARRAY>` as a test schema. - // col1, col2._1 and col3._1 are used for comparision. col2._2 and col3._2 mimics the burden + // col1, col2._1 and col3._1 are used for comparison. 
col2._2 and col3._2 mimics the burden // for the other columns private val df = spark .range(N * 10) @@ -47,7 +47,7 @@ abstract class NestedSchemaPruningBenchmark extends SqlBasedBenchmark { private def addCase(benchmark: Benchmark, name: String, sql: String): Unit = { benchmark.addCase(name) { _ => - spark.sql(sql).write.format("noop").save() + spark.sql(sql).noop() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala index 8b1c422e63a3f..e07921bf3aa74 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/PrimitiveArrayBenchmark.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.benchmark -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.SparkSession /** @@ -28,13 +28,16 @@ import org.apache.spark.sql.SparkSession * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain " * Results will be written to "benchmarks/PrimitiveArrayBenchmark-results.txt". 
*/ -object PrimitiveArrayBenchmark extends BenchmarkBase { - lazy val sparkSession = SparkSession.builder - .master("local[1]") - .appName("microbenchmark") - .config("spark.sql.shuffle.partitions", 1) - .config("spark.sql.autoBroadcastJoinThreshold", 1) - .getOrCreate() +object PrimitiveArrayBenchmark extends SqlBasedBenchmark { + + override def getSparkSession: SparkSession = { + SparkSession.builder + .master("local[1]") + .appName("microbenchmark") + .config("spark.sql.shuffle.partitions", 1) + .config("spark.sql.autoBroadcastJoinThreshold", 1) + .getOrCreate() + } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { runBenchmark("Write primitive arrays in dataset") { @@ -43,11 +46,11 @@ object PrimitiveArrayBenchmark extends BenchmarkBase { } def writeDatasetArray(iters: Int): Unit = { - import sparkSession.implicits._ + import spark.implicits._ val count = 1024 * 1024 * 2 - val sc = sparkSession.sparkContext + val sc = spark.sparkContext val primitiveIntArray = Array.fill[Int](count)(65535) val dsInt = sc.parallelize(Seq(primitiveIntArray), 1).toDS dsInt.count // force to build dataset diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/RangeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/RangeBenchmark.scala index a9f873f9094ba..e566f5d5adee6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/RangeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/RangeBenchmark.scala @@ -40,15 +40,15 @@ object RangeBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("range", N, output = output) benchmark.addCase("full scan", numIters = 4) { _ => - spark.range(N).queryExecution.toRdd.foreach(_ => ()) + spark.range(N).noop() } benchmark.addCase("limit after range", numIters = 4) { _ => - spark.range(N).limit(100).queryExecution.toRdd.foreach(_ => ()) + spark.range(N).limit(100).noop() } benchmark.addCase("filter after range", 
numIters = 4) { _ => - spark.range(N).filter('id % 100 === 0).queryExecution.toRdd.foreach(_ => ()) + spark.range(N).filter('id % 100 === 0).noop() } benchmark.addCase("count after range", numIters = 4) { _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala index 3760539c16841..2c9e8a909633c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SortBenchmark.scala @@ -38,7 +38,8 @@ import org.apache.spark.util.random.XORShiftRandom */ object SortBenchmark extends BenchmarkBase { - private def referenceKeyPrefixSort(buf: LongArray, lo: Int, hi: Int, refCmp: PrefixComparator) { + private def referenceKeyPrefixSort(buf: LongArray, lo: Int, hi: Int, + refCmp: PrefixComparator): Unit = { val sortBuffer = new LongArray(MemoryBlock.fromLongArray(new Array[Long](buf.size().toInt))) new Sorter(new UnsafeSortDataFormat(sortBuffer)).sort(buf, lo, hi, (r1: RecordPointerAndKeyPrefix, r2: RecordPointerAndKeyPrefix) => @@ -47,7 +48,7 @@ object SortBenchmark extends BenchmarkBase { private def generateKeyPrefixTestData(size: Int, rand: => Long): (LongArray, LongArray) = { val ref = Array.tabulate[Long](size * 2) { i => rand } - val extended = ref ++ Array.fill[Long](size * 2)(0) + val extended = ref ++ Array.ofDim[Long](size * 2) (new LongArray(MemoryBlock.fromLongArray(ref)), new LongArray(MemoryBlock.fromLongArray(extended))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala index e95e5a960246b..ee7a03e5e0542 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/SqlBasedBenchmark.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.SaveMode.Overwrite import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.internal.SQLConf @@ -57,4 +58,10 @@ trait SqlBasedBenchmark extends BenchmarkBase with SQLHelper { benchmark.run() } + + implicit class DatasetToBenchmark(ds: Dataset[_]) { + def noop(): Unit = { + ds.write.format("noop").mode(Overwrite).save() + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index 93006d05b75bc..ad3d79760adf0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -29,11 +29,19 @@ import org.apache.spark.sql.execution.datasources.LogicalRelation /** * Benchmark to measure TPCDS query performance. * To run this: - * spark-submit --class --data-location + * {{{ + * 1. without sbt: + * bin/spark-submit --class --data-location + * 2. build/sbt "sql/test:runMain --data-location " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt + * "sql/test:runMain --data-location " + * Results will be written to "benchmarks/TPCDSQueryBenchmark-results.txt". 
+ * }}} */ -object TPCDSQueryBenchmark extends Logging { - val conf = - new SparkConf() +object TPCDSQueryBenchmark extends SqlBasedBenchmark { + + override def getSparkSession: SparkSession = { + val conf = new SparkConf() .setMaster("local[1]") .setAppName("test-sql-context") .set("spark.sql.parquet.compression.codec", "snappy") @@ -43,7 +51,8 @@ object TPCDSQueryBenchmark extends Logging { .set("spark.sql.autoBroadcastJoinThreshold", (20 * 1024 * 1024).toString) .set("spark.sql.crossJoin.enabled", "true") - val spark = SparkSession.builder.config(conf).getOrCreate() + SparkSession.builder.config(conf).getOrCreate() + } val tables = Seq("catalog_page", "catalog_returns", "customer", "customer_address", "customer_demographics", "date_dim", "household_demographics", "inventory", "item", @@ -72,21 +81,19 @@ object TPCDSQueryBenchmark extends Logging { val queryRelations = scala.collection.mutable.HashSet[String]() spark.sql(queryString).queryExecution.analyzed.foreach { case SubqueryAlias(alias, _: LogicalRelation) => - queryRelations.add(alias.identifier) + queryRelations.add(alias.name) case LogicalRelation(_, _, Some(catalogTable), _) => queryRelations.add(catalogTable.identifier.table) - case HiveTableRelation(tableMeta, _, _, _) => + case HiveTableRelation(tableMeta, _, _, _, _) => queryRelations.add(tableMeta.identifier.table) case _ => } val numRows = queryRelations.map(tableSizes.getOrElse(_, 0L)).sum - val benchmark = new Benchmark(s"TPCDS Snappy", numRows, 5) + val benchmark = new Benchmark(s"TPCDS Snappy", numRows, 2, output = output) benchmark.addCase(s"$name$nameSuffix") { _ => - spark.sql(queryString).collect() + spark.sql(queryString).noop() } - logInfo(s"\n\n===== TPCDS QUERY BENCHMARK OUTPUT FOR $name =====\n") benchmark.run() - logInfo(s"\n\n===== FINISHED $name =====\n") } } @@ -100,8 +107,8 @@ object TPCDSQueryBenchmark extends Logging { } } - def main(args: Array[String]): Unit = { - val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args) + 
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val benchmarkArgs = new TPCDSQueryBenchmarkArguments(mainArgs) // List of all TPC-DS v1.4 queries val tpcdsQueries = Seq( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UDFBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UDFBenchmark.scala index 9cbd6423f667f..ee8a6e787c36c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UDFBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/UDFBenchmark.scala @@ -42,8 +42,9 @@ object UDFBenchmark extends SqlBasedBenchmark { val nullableIntCol = when( idCol % 2 === 0, idCol.cast(IntegerType)).otherwise(Literal(null, IntegerType)) val stringCol = idCol.cast(StringType) - spark.range(cardinality).select( - udf(idCol, nullableIntCol, stringCol)).write.format("noop").save() + spark.range(cardinality) + .select(udf(idCol, nullableIntCol, stringCol)) + .noop() } private def doRunBenchmarkWithPrimitiveTypes( @@ -51,7 +52,9 @@ object UDFBenchmark extends SqlBasedBenchmark { val idCol = col("id") val nullableIntCol = when( idCol % 2 === 0, idCol.cast(IntegerType)).otherwise(Literal(null, IntegerType)) - spark.range(cardinality).select(udf(idCol, nullableIntCol)).write.format("noop").save() + spark.range(cardinality) + .select(udf(idCol, nullableIntCol)) + .noop() } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { @@ -104,16 +107,19 @@ object UDFBenchmark extends SqlBasedBenchmark { val benchmark = new Benchmark("UDF identity overhead", cardinality, output = output) benchmark.addCase(s"Baseline", numIters = 5) { _ => - spark.range(cardinality).select( - col("id"), col("id") * 2, col("id") * 3).write.format("noop").save() + spark.range(cardinality) + .select(col("id"), col("id") * 2, col("id") * 3) + .noop() } val identityUDF = udf { x: Long => x } benchmark.addCase(s"With identity UDF", numIters = 5) { _ => - 
spark.range(cardinality).select( - identityUDF(col("id")), - identityUDF(col("id") * 2), - identityUDF(col("id") * 3)).write.format("noop").save() + spark.range(cardinality) + .select( + identityUDF(col("id")), + identityUDF(col("id") * 2), + identityUDF(col("id") * 3)) + .noop() } benchmark.run() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala index f4642e7d353e6..77dc3a10f8033 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideSchemaBenchmark.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.benchmark import java.io.File +import org.scalatest.Assertions._ + import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.DataFrame import org.apache.spark.util.Utils @@ -68,14 +70,14 @@ object WideSchemaBenchmark extends SqlBasedBenchmark { desc: String, selector: String): Unit = { benchmark.addCase(desc + " (read in-mem)") { iter => - df.selectExpr(s"sum($selector)").collect() + df.selectExpr(s"sum($selector)").noop() } benchmark.addCase(desc + " (exec in-mem)") { iter => - df.selectExpr("*", s"hash($selector) as f").selectExpr(s"sum($selector)", "sum(f)").collect() + df.selectExpr("*", s"hash($selector) as f").selectExpr(s"sum($selector)", "sum(f)").noop() } val parquet = saveAsParquet(df) benchmark.addCase(desc + " (read parquet)") { iter => - parquet.selectExpr(s"sum($selector) as f").collect() + parquet.selectExpr(s"sum($selector) as f").noop() } benchmark.addCase(desc + " (write parquet)") { iter => saveAsParquet(df.selectExpr(s"sum($selector) as f")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideTableBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideTableBenchmark.scala index 
52426d81bd1a7..ba79c12c461c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideTableBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/WideTableBenchmark.scala @@ -42,7 +42,7 @@ object WideTableBenchmark extends SqlBasedBenchmark { Seq("10", "100", "1024", "2048", "4096", "8192", "65536").foreach { n => benchmark.addCase(s"split threshold $n", numIters = 5) { iter => withSQLConf(SQLConf.CODEGEN_METHOD_SPLIT_THRESHOLD.key -> n) { - df.selectExpr(columns: _*).foreach(_ => ()) + df.selectExpr(columns: _*).noop() } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala index 3121b7e99c99d..847e0ec4f3195 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnStatsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[BooleanColumnStats], BOOLEAN, Array(true, false, 0)) @@ -30,6 +31,7 @@ class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[DoubleColumnStats], DOUBLE, Array(Double.MaxValue, Double.MinValue, 0)) testColumnStats(classOf[StringColumnStats], STRING, Array(null, null, 0)) testDecimalColumnStats(Array(null, null, 0)) + testIntervalColumnStats(Array(null, null, 0)) def testColumnStats[T <: AtomicType, U <: ColumnStats]( columnStatsClass: Class[U], @@ -103,4 +105,36 @@ class ColumnStatsSuite extends SparkFunSuite { } } } + + def testIntervalColumnStats[T <: AtomicType, U <: ColumnStats]( + initialStatistics: Array[Any]): Unit = { + + val columnStatsName = classOf[IntervalColumnStats].getSimpleName + val 
columnType = CALENDAR_INTERVAL + + test(s"$columnStatsName: empty") { + val columnStats = new IntervalColumnStats + columnStats.collectedStatistics.zip(initialStatistics).foreach { + case (actual, expected) => assert(actual === expected) + } + } + + test(s"$columnStatsName: non-empty") { + import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ + + val columnStats = new IntervalColumnStats + val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) + rows.foreach(columnStats.gatherStats(_, 0)) + + val stats = columnStats.collectedStatistics + + assertResult(10, "Wrong null count")(stats(2)) + assertResult(20, "Wrong row count")(stats(3)) + assertResult(stats(4), "Wrong size in bytes") { + rows.map { row => + if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) + }.sum + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala index ff05049551dc8..b25aa6e308657 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnTypeSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval class ColumnTypeSuite extends SparkFunSuite with Logging { private val DEFAULT_BUFFER_SIZE = 512 @@ -38,7 +39,8 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { val checks = Map( NULL -> 0, BOOLEAN -> 1, BYTE -> 1, SHORT -> 2, INT -> 4, LONG -> 8, FLOAT -> 4, DOUBLE -> 8, COMPACT_DECIMAL(15, 10) -> 8, LARGE_DECIMAL(20, 10) -> 12, - STRING -> 8, BINARY -> 16, STRUCT_TYPE -> 20, ARRAY_TYPE -> 28, MAP_TYPE -> 68) + 
STRING -> 8, BINARY -> 16, STRUCT_TYPE -> 20, ARRAY_TYPE -> 28, MAP_TYPE -> 68, + CALENDAR_INTERVAL -> 16) checks.foreach { case (columnType, expectedSize) => assertResult(expectedSize, s"Wrong defaultSize for $columnType") { @@ -76,6 +78,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { checkActualSize(ARRAY_TYPE, Array[Any](1), 4 + 8 + 8 + 8) checkActualSize(MAP_TYPE, Map(1 -> "a"), 4 + (8 + 8 + 8 + 8) + (8 + 8 + 8 + 8)) checkActualSize(STRUCT_TYPE, Row("hello"), 28) + checkActualSize(CALENDAR_INTERVAL, new CalendarInterval(0, 0, 0), 4 + 4 + 8) } testNativeColumnType(BOOLEAN) @@ -94,6 +97,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { testColumnType(STRUCT_TYPE) testColumnType(ARRAY_TYPE) testColumnType(MAP_TYPE) + testColumnType(CALENDAR_INTERVAL) def testNativeColumnType[T <: AtomicType](columnType: NativeColumnType[T]): Unit = { testColumnType[T#InternalType](columnType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala index 686c8fa6f5fa9..fee3329030e66 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/ColumnarTestUtils.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} -import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { @@ -51,6 +51,8 @@ object ColumnarTestUtils { case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => 
randomBytes(Random.nextInt(32)) + case CALENDAR_INTERVAL => + new CalendarInterval(Random.nextInt(), Random.nextInt(), Random.nextLong()) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index 0fac4dd3e5137..77047f329e105 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -38,7 +38,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSparkSession { setupTestData() - private def cachePrimitiveTest(data: DataFrame, dataType: String) { + private def cachePrimitiveTest(data: DataFrame, dataType: String): Unit = { data.createOrReplaceTempView(s"testData$dataType") val storageLevel = MEMORY_ONLY val plan = spark.sessionState.executePlan(data.logicalPlan).sparkPlan diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala index 8f4ca3cea77a5..92d9d84d9fac6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessorSuite.scala @@ -44,7 +44,8 @@ class NullableColumnAccessorSuite extends SparkFunSuite { NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), - ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) + 
ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)), + CALENDAR_INTERVAL) .foreach { testNullableColumnAccessor(_) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala index b2b6e92e9a056..7e295b4dc31c3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/NullableColumnBuilderSuite.scala @@ -42,7 +42,8 @@ class NullableColumnBuilderSuite extends SparkFunSuite { BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), - ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) + ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)), + CALENDAR_INTERVAL) .foreach { testNullableColumnBuilder(_) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala index 2d71a42628dfb..192db0e910d03 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/BooleanBitSetSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.types.BooleanType class BooleanBitSetSuite extends SparkFunSuite { import BooleanBitSet._ - def skeleton(count: Int) { + def skeleton(count: Int): Unit = { // ------------- // Tests encoder // ------------- @@ -87,7 +87,7 @@ class BooleanBitSetSuite extends SparkFunSuite { assert(!decoder.hasNext) } - def skeletonForDecompress(count: Int) { + def skeletonForDecompress(count: Int): Unit = { val builder = 
TestCompressibleColumnBuilder(new NoopColumnStats, BOOLEAN, BooleanBitSet) val rows = Seq.fill[InternalRow](count)(makeRandomRow(BOOLEAN)) val values = rows.map(_.getBoolean(0)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala index 8ea20f28a37b2..fcb18392235c3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/CompressionSchemeBenchmark.scala @@ -22,6 +22,7 @@ import java.nio.charset.StandardCharsets import org.apache.commons.lang3.RandomStringUtils import org.apache.commons.math3.distribution.LogNormalDistribution +import org.scalatest.Assertions._ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.expressions.GenericInternalRow diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala index 28950b74cf1c8..61e4cc068fa80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/DictionaryEncodingSuite.scala @@ -35,7 +35,7 @@ class DictionaryEncodingSuite extends SparkFunSuite { def testDictionaryEncoding[T <: AtomicType]( columnStats: ColumnStats, columnType: NativeColumnType[T], - testDecompress: Boolean = true) { + testDecompress: Boolean = true): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") @@ -49,7 +49,7 @@ class DictionaryEncodingSuite extends SparkFunSuite { seq.head +: seq.tail.filterNot(_ == seq.head) } - def skeleton(uniqueValueCount: Int, 
inputSeq: Seq[Int]) { + def skeleton(uniqueValueCount: Int, inputSeq: Seq[Int]): Unit = { // ------------- // Tests encoder // ------------- @@ -116,7 +116,7 @@ class DictionaryEncodingSuite extends SparkFunSuite { } } - def skeletonForDecompress(uniqueValueCount: Int, inputSeq: Seq[Int]) { + def skeletonForDecompress(uniqueValueCount: Int, inputSeq: Seq[Int]): Unit = { if (!testDecompress) return val builder = TestCompressibleColumnBuilder(columnStats, columnType, DictionaryEncoding) val (values, rows) = makeUniqueValuesAndSingleValueRows(columnType, uniqueValueCount) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala index fb3388452e4e5..b5630488b3667 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/IntegralDeltaSuite.scala @@ -32,9 +32,9 @@ class IntegralDeltaSuite extends SparkFunSuite { def testIntegralDelta[I <: IntegralType]( columnStats: ColumnStats, columnType: NativeColumnType[I], - scheme: CompressionScheme) { + scheme: CompressionScheme): Unit = { - def skeleton(input: Seq[I#InternalType]) { + def skeleton(input: Seq[Any]): Unit = { // ------------- // Tests encoder // ------------- @@ -52,7 +52,7 @@ class IntegralDeltaSuite extends SparkFunSuite { input.foreach { value => val row = new GenericInternalRow(1) - columnType.setField(row, 0, value) + columnType.setField(row, 0, value.asInstanceOf[I#InternalType]) builder.appendFrom(row, 0) } @@ -112,7 +112,7 @@ class IntegralDeltaSuite extends SparkFunSuite { assert(!decoder.hasNext) } - def skeletonForDecompress(input: Seq[I#InternalType]) { + def skeletonForDecompress(input: Seq[I#InternalType]): Unit = { val builder = TestCompressibleColumnBuilder(columnStats, columnType, scheme) val row = new 
GenericInternalRow(1) val nullRow = new GenericInternalRow(1) @@ -173,9 +173,7 @@ class IntegralDeltaSuite extends SparkFunSuite { } test(s"$scheme: long random series") { - // Have to workaround with `Any` since no `ClassTag[I#JvmType]` available here. - val input = Array.fill[Any](10000)(makeRandomValue(columnType)) - skeleton(input.map(_.asInstanceOf[I#InternalType])) + skeleton(Seq.fill[I#InternalType](10000)(makeRandomValue(columnType))) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala index b6f0b5e6277b4..f946a6779ec95 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/PassThroughEncodingSuite.scala @@ -35,11 +35,11 @@ class PassThroughSuite extends SparkFunSuite { def testPassThrough[T <: AtomicType]( columnStats: ColumnStats, - columnType: NativeColumnType[T]) { + columnType: NativeColumnType[T]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") - def skeleton(input: Seq[T#InternalType]) { + def skeleton(input: Seq[T#InternalType]): Unit = { // ------------- // Tests encoder // ------------- @@ -93,7 +93,7 @@ class PassThroughSuite extends SparkFunSuite { assert(!decoder.hasNext) } - def skeletonForDecompress(input: Seq[T#InternalType]) { + def skeletonForDecompress(input: Seq[T#InternalType]): Unit = { val builder = TestCompressibleColumnBuilder(columnStats, columnType, PassThrough) val row = new GenericInternalRow(1) val nullRow = new GenericInternalRow(1) @@ -160,8 +160,7 @@ class PassThroughSuite extends SparkFunSuite { } test(s"$PassThrough with $typeName: long random series") { - val input = Array.fill[Any](10000)(makeRandomValue(columnType)) - skeleton(input.map(_.asInstanceOf[T#InternalType])) + 
skeleton(Seq.fill[T#InternalType](10000)(makeRandomValue(columnType))) } test(s"$PassThrough with $typeName: empty column for decompress()") { @@ -169,8 +168,7 @@ class PassThroughSuite extends SparkFunSuite { } test(s"$PassThrough with $typeName: long random series for decompress()") { - val input = Array.fill[Any](10000)(makeRandomValue(columnType)) - skeletonForDecompress(input.map(_.asInstanceOf[T#InternalType])) + skeletonForDecompress(Seq.fill[T#InternalType](10000)(makeRandomValue(columnType))) } test(s"$PassThrough with $typeName: simple case with null for decompress()") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala index eb1cdd9bbceff..29dbc13b59c6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/compression/RunLengthEncodingSuite.scala @@ -36,11 +36,11 @@ class RunLengthEncodingSuite extends SparkFunSuite { def testRunLengthEncoding[T <: AtomicType]( columnStats: ColumnStats, columnType: NativeColumnType[T], - testDecompress: Boolean = true) { + testDecompress: Boolean = true): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") - def skeleton(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]) { + def skeleton(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]): Unit = { // ------------- // Tests encoder // ------------- @@ -98,7 +98,7 @@ class RunLengthEncodingSuite extends SparkFunSuite { assert(!decoder.hasNext) } - def skeletonForDecompress(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]) { + def skeletonForDecompress(uniqueValueCount: Int, inputRuns: Seq[(Int, Int)]): Unit = { if (!testDecompress) return val builder = TestCompressibleColumnBuilder(columnStats, columnType, RunLengthEncoding) val (values, rows) = 
makeUniqueValuesAndSingleValueRows(columnType, uniqueValueCount) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index 74ef81f7181da..81965e4c6c353 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -31,7 +31,8 @@ import org.apache.spark.sql.catalyst.dsl.plans import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan import org.apache.spark.sql.catalyst.expressions.JsonTuple import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.catalyst.plans.logical.{Generate, InsertIntoDir, LogicalPlan, Project, ScriptTransformation} +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.connector.expressions.{FieldReference, IdentityTransform} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.CreateTable import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -74,108 +75,10 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { }.head } - test("create database") { - val sql = - """ - |CREATE DATABASE IF NOT EXISTS database_name - |WITH DBPROPERTIES ('a'='a', 'b'='b', 'c'='c') - |COMMENT 'database_comment' LOCATION '/home/user/db' - """.stripMargin - val parsed = parser.parsePlan(sql) - val expected = CreateDatabaseCommand( - "database_name", - ifNotExists = true, - Some("/home/user/db"), - Some("database_comment"), - Map("a" -> "a", "b" -> "b", "c" -> "c")) - comparePlans(parsed, expected) - } - - test("create database -- check duplicates") { - def createDatabase(duplicateClause: String): String = { - s""" - |CREATE DATABASE IF NOT EXISTS database_name - |$duplicateClause - |$duplicateClause - """.stripMargin - } - val sql1 = createDatabase("COMMENT 'database_comment'") - 
val sql2 = createDatabase("LOCATION '/home/user/db'") - val sql3 = createDatabase("WITH DBPROPERTIES ('a'='a', 'b'='b', 'c'='c')") - - intercept(sql1, "Found duplicate clauses: COMMENT") - intercept(sql2, "Found duplicate clauses: LOCATION") - intercept(sql3, "Found duplicate clauses: WITH DBPROPERTIES") - } - - test("create database - property values must be set") { - assertUnsupported( - sql = "CREATE DATABASE my_db WITH DBPROPERTIES('key_without_value', 'key_with_value'='x')", - containsThesePhrases = Seq("key_without_value")) - } - - test("drop database") { - val sql1 = "DROP DATABASE IF EXISTS database_name RESTRICT" - val sql2 = "DROP DATABASE IF EXISTS database_name CASCADE" - val sql3 = "DROP SCHEMA IF EXISTS database_name RESTRICT" - val sql4 = "DROP SCHEMA IF EXISTS database_name CASCADE" - // The default is restrict=true - val sql5 = "DROP DATABASE IF EXISTS database_name" - // The default is ifExists=false - val sql6 = "DROP DATABASE database_name" - val sql7 = "DROP DATABASE database_name CASCADE" - - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - val parsed3 = parser.parsePlan(sql3) - val parsed4 = parser.parsePlan(sql4) - val parsed5 = parser.parsePlan(sql5) - val parsed6 = parser.parsePlan(sql6) - val parsed7 = parser.parsePlan(sql7) - - val expected1 = DropDatabaseCommand( - "database_name", - ifExists = true, - cascade = false) - val expected2 = DropDatabaseCommand( - "database_name", - ifExists = true, - cascade = true) - val expected3 = DropDatabaseCommand( - "database_name", - ifExists = false, - cascade = false) - val expected4 = DropDatabaseCommand( - "database_name", - ifExists = false, - cascade = true) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected1) - comparePlans(parsed4, expected2) - comparePlans(parsed5, expected1) - comparePlans(parsed6, expected3) - comparePlans(parsed7, expected4) - } - - test("alter database set dbproperties") { - // ALTER 
(DATABASE|SCHEMA) database_name SET DBPROPERTIES (property_name=property_value, ...) - val sql1 = "ALTER DATABASE database_name SET DBPROPERTIES ('a'='a', 'b'='b', 'c'='c')" - val sql2 = "ALTER SCHEMA database_name SET DBPROPERTIES ('a'='a')" - - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - - val expected1 = AlterDatabasePropertiesCommand( - "database_name", - Map("a" -> "a", "b" -> "b", "c" -> "c")) - val expected2 = AlterDatabasePropertiesCommand( - "database_name", - Map("a" -> "a")) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) + private def withCreateTableStatement(sql: String)(prediction: CreateTableStatement => Unit) + : Unit = { + val statement = parser.parsePlan(sql).asInstanceOf[CreateTableStatement] + prediction(statement) } test("alter database - property values must be set") { @@ -184,146 +87,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { containsThesePhrases = Seq("key_without_value")) } - test("describe database") { - // DESCRIBE DATABASE [EXTENDED] db_name; - val sql1 = "DESCRIBE DATABASE EXTENDED db_name" - val sql2 = "DESCRIBE DATABASE db_name" - - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - - val expected1 = DescribeDatabaseCommand( - "db_name", - extended = true) - val expected2 = DescribeDatabaseCommand( - "db_name", - extended = false) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - } - - test("create function") { - val sql1 = - """ - |CREATE TEMPORARY FUNCTION helloworld as - |'com.matthewrathbone.example.SimpleUDFExample' USING JAR '/path/to/jar1', - |JAR '/path/to/jar2' - """.stripMargin - val sql2 = - """ - |CREATE FUNCTION hello.world as - |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', - |FILE '/path/to/file' - """.stripMargin - val sql3 = - """ - |CREATE OR REPLACE TEMPORARY FUNCTION helloworld3 as - |'com.matthewrathbone.example.SimpleUDFExample' USING JAR 
'/path/to/jar1', - |JAR '/path/to/jar2' - """.stripMargin - val sql4 = - """ - |CREATE OR REPLACE FUNCTION hello.world1 as - |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', - |FILE '/path/to/file' - """.stripMargin - val sql5 = - """ - |CREATE FUNCTION IF NOT EXISTS hello.world2 as - |'com.matthewrathbone.example.SimpleUDFExample' USING ARCHIVE '/path/to/archive', - |FILE '/path/to/file' - """.stripMargin - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - val parsed3 = parser.parsePlan(sql3) - val parsed4 = parser.parsePlan(sql4) - val parsed5 = parser.parsePlan(sql5) - val expected1 = CreateFunctionCommand( - None, - "helloworld", - "com.matthewrathbone.example.SimpleUDFExample", - Seq( - FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), - FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), - isTemp = true, ignoreIfExists = false, replace = false) - val expected2 = CreateFunctionCommand( - Some("hello"), - "world", - "com.matthewrathbone.example.SimpleUDFExample", - Seq( - FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), - FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ignoreIfExists = false, replace = false) - val expected3 = CreateFunctionCommand( - None, - "helloworld3", - "com.matthewrathbone.example.SimpleUDFExample", - Seq( - FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar1"), - FunctionResource(FunctionResourceType.fromString("jar"), "/path/to/jar2")), - isTemp = true, ignoreIfExists = false, replace = true) - val expected4 = CreateFunctionCommand( - Some("hello"), - "world1", - "com.matthewrathbone.example.SimpleUDFExample", - Seq( - FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), - FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ignoreIfExists = false, replace = 
true) - val expected5 = CreateFunctionCommand( - Some("hello"), - "world2", - "com.matthewrathbone.example.SimpleUDFExample", - Seq( - FunctionResource(FunctionResourceType.fromString("archive"), "/path/to/archive"), - FunctionResource(FunctionResourceType.fromString("file"), "/path/to/file")), - isTemp = false, ignoreIfExists = true, replace = false) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected3) - comparePlans(parsed4, expected4) - comparePlans(parsed5, expected5) - } - - test("drop function") { - val sql1 = "DROP TEMPORARY FUNCTION helloworld" - val sql2 = "DROP TEMPORARY FUNCTION IF EXISTS helloworld" - val sql3 = "DROP FUNCTION hello.world" - val sql4 = "DROP FUNCTION IF EXISTS hello.world" - - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - val parsed3 = parser.parsePlan(sql3) - val parsed4 = parser.parsePlan(sql4) - - val expected1 = DropFunctionCommand( - None, - "helloworld", - ifExists = false, - isTemp = true) - val expected2 = DropFunctionCommand( - None, - "helloworld", - ifExists = true, - isTemp = true) - val expected3 = DropFunctionCommand( - Some("hello"), - "world", - ifExists = false, - isTemp = false) - val expected4 = DropFunctionCommand( - Some("hello"), - "world", - ifExists = true, - isTemp = false) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected3) - comparePlans(parsed4, expected4) - } - test("create hive table - table file format") { val allSources = Seq("parquet", "parquetfile", "orc", "orcfile", "avro", "avrofile", "sequencefile", "rcfile", "textfile") @@ -400,9 +163,9 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create hive external table - location must be specified") { assertUnsupported( - sql = "CREATE EXTERNAL TABLE my_tab", + sql = "CREATE EXTERNAL TABLE my_tab STORED AS parquet", containsThesePhrases = Seq("create external table", "location")) - val query = 
"CREATE EXTERNAL TABLE my_tab LOCATION '/something/anything'" + val query = "CREATE EXTERNAL TABLE my_tab STORED AS parquet LOCATION '/something/anything'" val ct = parseAs[CreateTable](query) assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) @@ -410,7 +173,8 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create hive table - property values must be set") { assertUnsupported( - sql = "CREATE TABLE my_tab TBLPROPERTIES('key_without_value', 'key_with_value'='x')", + sql = "CREATE TABLE my_tab STORED AS parquet " + + "TBLPROPERTIES('key_without_value', 'key_with_value'='x')", containsThesePhrases = Seq("key_without_value")) assertUnsupported( sql = "CREATE TABLE my_tab ROW FORMAT SERDE 'serde' " + @@ -419,7 +183,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { } test("create hive table - location implies external") { - val query = "CREATE TABLE my_tab LOCATION '/something/anything'" + val query = "CREATE TABLE my_tab STORED AS parquet LOCATION '/something/anything'" val ct = parseAs[CreateTable](query) assert(ct.tableDesc.tableType == CatalogTableType.EXTERNAL) assert(ct.tableDesc.storage.locationUri == Some(new URI("/something/anything"))) @@ -496,32 +260,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { "Directory path and 'path' in OPTIONS should be specified one, but not both")) } - // ALTER TABLE table_name RENAME TO new_table_name; - // ALTER VIEW view_name RENAME TO new_view_name; - test("alter table/view: rename table/view") { - val sql_table = "ALTER TABLE table_name RENAME TO new_table_name" - val sql_view = sql_table.replace("TABLE", "VIEW") - val parsed_table = parser.parsePlan(sql_table) - val parsed_view = parser.parsePlan(sql_view) - val expected_table = AlterTableRenameCommand( - TableIdentifier("table_name"), - TableIdentifier("new_table_name"), - isView = false) - val expected_view = 
AlterTableRenameCommand( - TableIdentifier("table_name"), - TableIdentifier("new_table_name"), - isView = true) - comparePlans(parsed_table, expected_table) - comparePlans(parsed_view, expected_view) - } - - test("alter table: rename table with database") { - val query = "ALTER TABLE db1.tbl RENAME TO db1.tbl2" - val plan = parseAs[AlterTableRenameCommand](query) - assert(plan.oldName == TableIdentifier("tbl", Some("db1"))) - assert(plan.newName == TableIdentifier("tbl2", Some("db1"))) - } - test("alter table - property values must be set") { assertUnsupported( sql = "ALTER TABLE my_tab SET TBLPROPERTIES('key_without_value', 'key_with_value'='x')", @@ -534,61 +272,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { containsThesePhrases = Seq("key_with_value")) } - test("alter table: SerDe properties") { - val sql1 = "ALTER TABLE table_name SET SERDE 'org.apache.class'" - val sql2 = - """ - |ALTER TABLE table_name SET SERDE 'org.apache.class' - |WITH SERDEPROPERTIES ('columns'='foo,bar', 'field.delim' = ',') - """.stripMargin - val sql3 = - """ - |ALTER TABLE table_name SET SERDEPROPERTIES ('columns'='foo,bar', - |'field.delim' = ',') - """.stripMargin - val sql4 = - """ - |ALTER TABLE table_name PARTITION (test=1, dt='2008-08-08', - |country='us') SET SERDE 'org.apache.class' WITH SERDEPROPERTIES ('columns'='foo,bar', - |'field.delim' = ',') - """.stripMargin - val sql5 = - """ - |ALTER TABLE table_name PARTITION (test=1, dt='2008-08-08', - |country='us') SET SERDEPROPERTIES ('columns'='foo,bar', 'field.delim' = ',') - """.stripMargin - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - val parsed3 = parser.parsePlan(sql3) - val parsed4 = parser.parsePlan(sql4) - val parsed5 = parser.parsePlan(sql5) - val tableIdent = TableIdentifier("table_name", None) - val expected1 = AlterTableSerDePropertiesCommand( - tableIdent, Some("org.apache.class"), None, None) - val expected2 = AlterTableSerDePropertiesCommand( - 
tableIdent, - Some("org.apache.class"), - Some(Map("columns" -> "foo,bar", "field.delim" -> ",")), - None) - val expected3 = AlterTableSerDePropertiesCommand( - tableIdent, None, Some(Map("columns" -> "foo,bar", "field.delim" -> ",")), None) - val expected4 = AlterTableSerDePropertiesCommand( - tableIdent, - Some("org.apache.class"), - Some(Map("columns" -> "foo,bar", "field.delim" -> ",")), - Some(Map("test" -> "1", "dt" -> "2008-08-08", "country" -> "us"))) - val expected5 = AlterTableSerDePropertiesCommand( - tableIdent, - None, - Some(Map("columns" -> "foo,bar", "field.delim" -> ",")), - Some(Map("test" -> "1", "dt" -> "2008-08-08", "country" -> "us"))) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected3) - comparePlans(parsed4, expected4) - comparePlans(parsed5, expected5) - } - test("alter table - SerDe property values must be set") { assertUnsupported( sql = "ALTER TABLE my_tab SET SERDE 'serde' " + @@ -596,66 +279,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { containsThesePhrases = Seq("key_without_value")) } - // ALTER TABLE table_name ADD [IF NOT EXISTS] PARTITION partition_spec - // [LOCATION 'location1'] partition_spec [LOCATION 'location2'] ...; - test("alter table: add partition") { - val sql1 = - """ - |ALTER TABLE table_name ADD IF NOT EXISTS PARTITION - |(dt='2008-08-08', country='us') LOCATION 'location1' PARTITION - |(dt='2009-09-09', country='uk') - """.stripMargin - val sql2 = "ALTER TABLE table_name ADD PARTITION (dt='2008-08-08') LOCATION 'loc'" - - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - - val expected1 = AlterTableAddPartitionCommand( - TableIdentifier("table_name", None), - Seq( - (Map("dt" -> "2008-08-08", "country" -> "us"), Some("location1")), - (Map("dt" -> "2009-09-09", "country" -> "uk"), None)), - ifNotExists = true) - val expected2 = AlterTableAddPartitionCommand( - TableIdentifier("table_name", None), - 
Seq((Map("dt" -> "2008-08-08"), Some("loc"))), - ifNotExists = false) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - } - - test("alter table: recover partitions") { - val sql = "ALTER TABLE table_name RECOVER PARTITIONS" - val parsed = parser.parsePlan(sql) - val expected = AlterTableRecoverPartitionsCommand( - TableIdentifier("table_name", None)) - comparePlans(parsed, expected) - } - - test("alter view: add partition (not supported)") { - assertUnsupported( - """ - |ALTER VIEW view_name ADD IF NOT EXISTS PARTITION - |(dt='2008-08-08', country='us') PARTITION - |(dt='2009-09-09', country='uk') - """.stripMargin) - } - - test("alter table: rename partition") { - val sql = - """ - |ALTER TABLE table_name PARTITION (dt='2008-08-08', country='us') - |RENAME TO PARTITION (dt='2008-09-09', country='uk') - """.stripMargin - val parsed = parser.parsePlan(sql) - val expected = AlterTableRenamePartitionCommand( - TableIdentifier("table_name", None), - Map("dt" -> "2008-08-08", "country" -> "us"), - Map("dt" -> "2008-09-09", "country" -> "uk")) - comparePlans(parsed, expected) - } - test("alter table: exchange partition (not supported)") { assertUnsupported( """ @@ -664,45 +287,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { """.stripMargin) } - // ALTER TABLE table_name DROP [IF EXISTS] PARTITION spec1[, PARTITION spec2, ...] - // ALTER VIEW table_name DROP [IF EXISTS] PARTITION spec1[, PARTITION spec2, ...] 
- test("alter table/view: drop partitions") { - val sql1_table = - """ - |ALTER TABLE table_name DROP IF EXISTS PARTITION - |(dt='2008-08-08', country='us'), PARTITION (dt='2009-09-09', country='uk') - """.stripMargin - val sql2_table = - """ - |ALTER TABLE table_name DROP PARTITION - |(dt='2008-08-08', country='us'), PARTITION (dt='2009-09-09', country='uk') - """.stripMargin - val sql1_view = sql1_table.replace("TABLE", "VIEW") - val sql2_view = sql2_table.replace("TABLE", "VIEW") - - val parsed1_table = parser.parsePlan(sql1_table) - val parsed2_table = parser.parsePlan(sql2_table) - val parsed1_purge = parser.parsePlan(sql1_table + " PURGE") - assertUnsupported(sql1_view) - assertUnsupported(sql2_view) - - val tableIdent = TableIdentifier("table_name", None) - val expected1_table = AlterTableDropPartitionCommand( - tableIdent, - Seq( - Map("dt" -> "2008-08-08", "country" -> "us"), - Map("dt" -> "2009-09-09", "country" -> "uk")), - ifExists = true, - purge = false, - retainData = false) - val expected2_table = expected1_table.copy(ifExists = false) - val expected1_purge = expected1_table.copy(purge = true) - - comparePlans(parsed1_table, expected1_table) - comparePlans(parsed2_table, expected2_table) - comparePlans(parsed1_purge, expected1_purge) - } - test("alter table: archive partition (not supported)") { assertUnsupported("ALTER TABLE table_name ARCHIVE PARTITION (dt='2008-08-08', country='us')") } @@ -719,46 +303,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { "SET FILEFORMAT PARQUET") } - test("alter table: set partition location") { - val sql2 = "ALTER TABLE table_name PARTITION (dt='2008-08-08', country='us') " + - "SET LOCATION 'new location'" - val parsed2 = parser.parsePlan(sql2) - val tableIdent = TableIdentifier("table_name", None) - val expected2 = AlterTableSetLocationCommand( - tableIdent, - Some(Map("dt" -> "2008-08-08", "country" -> "us")), - "new location") - comparePlans(parsed2, expected2) - } - - test("alter table: 
change column name/type/comment") { - val sql1 = "ALTER TABLE table_name CHANGE COLUMN col_old_name col_new_name INT" - val sql2 = "ALTER TABLE table_name CHANGE COLUMN col_name col_name INT COMMENT 'new_comment'" - val parsed1 = parser.parsePlan(sql1) - val parsed2 = parser.parsePlan(sql2) - val tableIdent = TableIdentifier("table_name", None) - val expected1 = AlterTableChangeColumnCommand( - tableIdent, - "col_old_name", - StructField("col_new_name", IntegerType)) - val expected2 = AlterTableChangeColumnCommand( - tableIdent, - "col_name", - StructField("col_name", IntegerType).withComment("new_comment")) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - } - - test("alter table: change column position (not supported)") { - assertUnsupported("ALTER TABLE table_name CHANGE COLUMN col_old_name col_new_name INT FIRST") - assertUnsupported( - "ALTER TABLE table_name CHANGE COLUMN col_old_name col_new_name INT AFTER other_col") - } - - test("alter table: change column in partition spec") { - assertUnsupported("ALTER TABLE table_name PARTITION (a='1', a='2') CHANGE COLUMN a new_a INT") - } - test("alter table: touch (not supported)") { assertUnsupported("ALTER TABLE table_name TOUCH") assertUnsupported("ALTER TABLE table_name TOUCH PARTITION (dt='2008-08-08', country='us')") @@ -802,26 +346,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { """.stripMargin) } - test("show databases") { - val sql1 = "SHOW DATABASES" - val sql2 = "SHOW DATABASES LIKE 'defau*'" - val parsed1 = parser.parsePlan(sql1) - val expected1 = ShowDatabasesCommand(None) - val parsed2 = parser.parsePlan(sql2) - val expected2 = ShowDatabasesCommand(Some("defau*")) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - } - - test("show tblproperties") { - val parsed1 = parser.parsePlan("SHOW TBLPROPERTIES tab1") - val expected1 = ShowTablePropertiesCommand(TableIdentifier("tab1", None), None) - val parsed2 = parser.parsePlan("SHOW 
TBLPROPERTIES tab1('propKey1')") - val expected2 = ShowTablePropertiesCommand(TableIdentifier("tab1", None), Some("propKey1")) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - } - test("SPARK-14383: DISTRIBUTE and UNSET as non-keywords") { val sql = "SELECT distribute, unset FROM x" val parsed = parser.parsePlan(sql) @@ -851,66 +375,6 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(e.contains("Found an empty partition key 'b'")) } - test("show columns") { - val sql1 = "SHOW COLUMNS FROM t1" - val sql2 = "SHOW COLUMNS IN db1.t1" - val sql3 = "SHOW COLUMNS FROM t1 IN db1" - val sql4 = "SHOW COLUMNS FROM db1.t1 IN db2" - - val parsed1 = parser.parsePlan(sql1) - val expected1 = ShowColumnsCommand(None, TableIdentifier("t1", None)) - val parsed2 = parser.parsePlan(sql2) - val expected2 = ShowColumnsCommand(None, TableIdentifier("t1", Some("db1"))) - val parsed3 = parser.parsePlan(sql3) - val expected3 = ShowColumnsCommand(Some("db1"), TableIdentifier("t1", None)) - val parsed4 = parser.parsePlan(sql4) - val expected4 = ShowColumnsCommand(Some("db2"), TableIdentifier("t1", Some("db1"))) - - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - comparePlans(parsed3, expected3) - comparePlans(parsed4, expected4) - } - - - test("show partitions") { - val sql1 = "SHOW PARTITIONS t1" - val sql2 = "SHOW PARTITIONS db1.t1" - val sql3 = "SHOW PARTITIONS t1 PARTITION(partcol1='partvalue', partcol2='partvalue')" - - val parsed1 = parser.parsePlan(sql1) - val expected1 = - ShowPartitionsCommand(TableIdentifier("t1", None), None) - val parsed2 = parser.parsePlan(sql2) - val expected2 = - ShowPartitionsCommand(TableIdentifier("t1", Some("db1")), None) - val expected3 = - ShowPartitionsCommand(TableIdentifier("t1", None), - Some(Map("partcol1" -> "partvalue", "partcol2" -> "partvalue"))) - val parsed3 = parser.parsePlan(sql3) - comparePlans(parsed1, expected1) - comparePlans(parsed2, expected2) - 
comparePlans(parsed3, expected3) - } - - test("support for other types in DBPROPERTIES") { - val sql = - """ - |CREATE DATABASE database_name - |LOCATION '/home/user/db' - |WITH DBPROPERTIES ('a'=1, 'b'=0.1, 'c'=TRUE) - """.stripMargin - val parsed = parser.parsePlan(sql) - val expected = CreateDatabaseCommand( - "database_name", - ifNotExists = false, - Some("/home/user/db"), - None, - Map("a" -> "1", "b" -> "0.1", "c" -> "true")) - - comparePlans(parsed, expected) - } - test("Test CTAS #1") { val s1 = """ @@ -957,7 +421,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(desc.comment == Some("This is the staging page view table")) // TODO will be SQLText assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.partitionColumnNames.isEmpty) assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) @@ -1009,7 +473,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { // TODO will be SQLText assert(desc.comment == Some("This is the staging page view table")) assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.partitionColumnNames.isEmpty) assert(desc.storage.properties == Map()) @@ -1022,22 +486,17 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("Test CTAS #3") { val s3 = """CREATE TABLE page_view AS SELECT * FROM src""" - val (desc, exists) = extractTableDesc(s3) - assert(exists == false) - assert(desc.identifier.database == None) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.MANAGED) - assert(desc.storage.locationUri == None) - assert(desc.schema.isEmpty) - assert(desc.viewText == None) // TODO will be SQLText - assert(desc.viewDefaultDatabase.isEmpty) - 
assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.mapred.TextInputFormat")) - assert(desc.storage.outputFormat == - Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(desc.properties == Map()) + val statement = parser.parsePlan(s3).asInstanceOf[CreateTableAsSelectStatement] + assert(statement.tableName(0) == "page_view") + assert(statement.asSelect == parser.parsePlan("SELECT * FROM src")) + assert(statement.partitioning.isEmpty) + assert(statement.bucketSpec.isEmpty) + assert(statement.properties.isEmpty) + assert(statement.provider == conf.defaultDataSourceName) + assert(statement.options.isEmpty) + assert(statement.location.isEmpty) + assert(statement.comment.isEmpty) + assert(!statement.ifNotExists) } test("Test CTAS #4") { @@ -1067,7 +526,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(desc.storage.locationUri == None) assert(desc.schema.isEmpty) assert(desc.viewText == None) // TODO will be SQLText - assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.storage.properties == Map(("serde_p1" -> "p1"), ("serde_p2" -> "p2"))) assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) @@ -1139,7 +598,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assertError("select interval '23:61:15' hour to second", "minute 61 outside range [0, 59]") assertError("select interval '.1111111111' second", - "nanosecond 1111111111 outside range") + "'.1111111111' is out of range") } test("use native json_tuple instead of hive's UDTF in LATERAL VIEW") { @@ -1197,68 +656,60 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create table - basic") { val query = "CREATE TABLE 
my_table (id int, name string)" - val (desc, allowExisting) = extractTableDesc(query) - assert(!allowExisting) - assert(desc.identifier.database.isEmpty) - assert(desc.identifier.table == "my_table") - assert(desc.tableType == CatalogTableType.MANAGED) - assert(desc.schema == new StructType().add("id", "int").add("name", "string")) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.bucketSpec.isEmpty) - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.storage.locationUri.isEmpty) - assert(desc.storage.inputFormat == - Some("org.apache.hadoop.mapred.TextInputFormat")) - assert(desc.storage.outputFormat == - Some("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat")) - assert(desc.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) - assert(desc.storage.properties.isEmpty) - assert(desc.properties.isEmpty) - assert(desc.comment.isEmpty) + withCreateTableStatement(query) { state => + assert(state.tableName(0) == "my_table") + assert(state.tableSchema == new StructType().add("id", "int").add("name", "string")) + assert(state.partitioning.isEmpty) + assert(state.bucketSpec.isEmpty) + assert(state.properties.isEmpty) + assert(state.provider == conf.defaultDataSourceName) + assert(state.options.isEmpty) + assert(state.location.isEmpty) + assert(state.comment.isEmpty) + assert(!state.ifNotExists) + } } test("create table - with database name") { val query = "CREATE TABLE dbx.my_table (id int, name string)" - val (desc, _) = extractTableDesc(query) - assert(desc.identifier.database == Some("dbx")) - assert(desc.identifier.table == "my_table") + withCreateTableStatement(query) { state => + assert(state.tableName(0) == "dbx") + assert(state.tableName(1) == "my_table") + } } test("create table - temporary") { val query = "CREATE TEMPORARY TABLE tab1 (id int, name string)" val e = intercept[ParseException] { parser.parsePlan(query) } - 
assert(e.message.contains("CREATE TEMPORARY TABLE is not supported yet")) + assert(e.message.contains("CREATE TEMPORARY TABLE without a provider is not allowed.")) } test("create table - external") { val query = "CREATE EXTERNAL TABLE tab1 (id int, name string) LOCATION '/path/to/nowhere'" - val (desc, _) = extractTableDesc(query) - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/path/to/nowhere"))) + val e = intercept[ParseException] { parser.parsePlan(query) } + assert(e.message.contains("Operation not allowed: CREATE EXTERNAL TABLE ...")) } test("create table - if not exists") { val query = "CREATE TABLE IF NOT EXISTS tab1 (id int, name string)" - val (_, allowExisting) = extractTableDesc(query) - assert(allowExisting) + withCreateTableStatement(query) { state => + assert(state.ifNotExists) + } } test("create table - comment") { val query = "CREATE TABLE my_table (id int, name string) COMMENT 'its hot as hell below'" - val (desc, _) = extractTableDesc(query) - assert(desc.comment == Some("its hot as hell below")) + withCreateTableStatement(query) { state => + assert(state.comment == Some("its hot as hell below")) + } } test("create table - partitioned columns") { - val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (month int)" - val (desc, _) = extractTableDesc(query) - assert(desc.schema == new StructType() - .add("id", "int") - .add("name", "string") - .add("month", "int")) - assert(desc.partitionColumnNames == Seq("month")) + val query = "CREATE TABLE my_table (id int, name string) PARTITIONED BY (id)" + withCreateTableStatement(query) { state => + val transform = IdentityTransform(FieldReference(Seq("id"))) + assert(state.partitioning == Seq(transform)) + } } test("create table - clustered by") { @@ -1274,23 +725,25 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { """ val query1 = s"$baseQuery INTO $numBuckets BUCKETS" - val (desc1, _) = 
extractTableDesc(query1) - assert(desc1.bucketSpec.isDefined) - val bucketSpec1 = desc1.bucketSpec.get - assert(bucketSpec1.numBuckets == numBuckets) - assert(bucketSpec1.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec1.sortColumnNames.isEmpty) + withCreateTableStatement(query1) { state => + assert(state.bucketSpec.isDefined) + val bucketSpec = state.bucketSpec.get + assert(bucketSpec.numBuckets == numBuckets) + assert(bucketSpec.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec.sortColumnNames.isEmpty) + } val query2 = s"$baseQuery SORTED BY($sortColumn) INTO $numBuckets BUCKETS" - val (desc2, _) = extractTableDesc(query2) - assert(desc2.bucketSpec.isDefined) - val bucketSpec2 = desc2.bucketSpec.get - assert(bucketSpec2.numBuckets == numBuckets) - assert(bucketSpec2.bucketColumnNames.head.equals(bucketedColumn)) - assert(bucketSpec2.sortColumnNames.head.equals(sortColumn)) + withCreateTableStatement(query2) { state => + assert(state.bucketSpec.isDefined) + val bucketSpec = state.bucketSpec.get + assert(bucketSpec.numBuckets == numBuckets) + assert(bucketSpec.bucketColumnNames.head.equals(bucketedColumn)) + assert(bucketSpec.sortColumnNames.head.equals(sortColumn)) + } } - test("create table - skewed by") { + test("create table(hive) - skewed by") { val baseQuery = "CREATE TABLE my_table (id int, name string) SKEWED BY" val query1 = s"$baseQuery(id) ON (1, 10, 100)" val query2 = s"$baseQuery(id, name) ON ((1, 'x'), (2, 'y'), (3, 'z'))" @@ -1303,7 +756,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(e3.getMessage.contains("Operation not allowed")) } - test("create table - row format") { + test("create table(hive) - row format") { val baseQuery = "CREATE TABLE my_table (id int, name string) ROW FORMAT" val query1 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff'" val query2 = s"$baseQuery SERDE 'org.apache.poof.serde.Baff' WITH SERDEPROPERTIES ('k1'='v1')" @@ -1331,7 +784,7 @@ class DDLParserSuite 
extends AnalysisTest with SharedSparkSession { "mapkey.delim" -> "b")) } - test("create table - file format") { + test("create table(hive) - file format") { val baseQuery = "CREATE TABLE my_table (id int, name string) STORED AS" val query1 = s"$baseQuery INPUTFORMAT 'winput' OUTPUTFORMAT 'wowput'" val query2 = s"$baseQuery ORC" @@ -1345,7 +798,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(desc2.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) } - test("create table - storage handler") { + test("create table(hive) - storage handler") { val baseQuery = "CREATE TABLE my_table (id int, name string) STORED BY" val query1 = s"$baseQuery 'org.papachi.StorageHandler'" val query2 = s"$baseQuery 'org.mamachi.StorageHandler' WITH SERDEPROPERTIES ('k1'='v1')" @@ -1357,11 +810,12 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { test("create table - properties") { val query = "CREATE TABLE my_table (id int, name string) TBLPROPERTIES ('k1'='v1', 'k2'='v2')" - val (desc, _) = extractTableDesc(query) - assert(desc.properties == Map("k1" -> "v1", "k2" -> "v2")) + withCreateTableStatement(query) { state => + assert(state.properties == Map("k1" -> "v1", "k2" -> "v2")) + } } - test("create table - everything!") { + test("create table(hive) - everything!") { val query = """ |CREATE EXTERNAL TABLE IF NOT EXISTS dbx.my_table (id int, name string) @@ -1384,7 +838,7 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(desc.partitionColumnNames == Seq("month")) assert(desc.bucketSpec.isEmpty) assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.storage.locationUri == Some(new URI("/path/to/mercury"))) assert(desc.storage.inputFormat == Some("winput")) @@ -1395,134 +849,83 @@ class DDLParserSuite extends AnalysisTest with SharedSparkSession { assert(desc.comment == Some("no 
comment")) } - test("create view -- basic") { - val v1 = "CREATE VIEW view1 AS SELECT * FROM tab1" - val command = parser.parsePlan(v1).asInstanceOf[CreateViewCommand] - assert(!command.allowExisting) - assert(command.name.database.isEmpty) - assert(command.name.table == "view1") - assert(command.originalText == Some("SELECT * FROM tab1")) - assert(command.userSpecifiedColumns.isEmpty) - } - - test("create view - full") { - val v1 = - """ - |CREATE OR REPLACE VIEW view1 - |(col1, col3 COMMENT 'hello') - |TBLPROPERTIES('prop1Key'="prop1Val") - |COMMENT 'BLABLA' - |AS SELECT * FROM tab1 - """.stripMargin - val command = parser.parsePlan(v1).asInstanceOf[CreateViewCommand] - assert(command.name.database.isEmpty) - assert(command.name.table == "view1") - assert(command.userSpecifiedColumns == Seq("col1" -> None, "col3" -> Some("hello"))) - assert(command.originalText == Some("SELECT * FROM tab1")) - assert(command.properties == Map("prop1Key" -> "prop1Val")) - assert(command.comment == Some("BLABLA")) - } - - test("create view -- partitioned view") { - val v1 = "CREATE VIEW view1 partitioned on (ds, hr) as select * from srcpart" - intercept[ParseException] { - parser.parsePlan(v1) - } - } - - test("create view - duplicate clauses") { - def createViewStatement(duplicateClause: String): String = { - s""" - |CREATE OR REPLACE VIEW view1 - |(col1, col3 COMMENT 'hello') - |$duplicateClause - |$duplicateClause - |AS SELECT * FROM tab1 - """.stripMargin - } - val sql1 = createViewStatement("COMMENT 'BLABLA'") - val sql2 = createViewStatement("TBLPROPERTIES('prop1Key'=\"prop1Val\")") - intercept(sql1, "Found duplicate clauses: COMMENT") - intercept(sql2, "Found duplicate clauses: TBLPROPERTIES") - } - - test("MSCK REPAIR table") { - val sql = "MSCK REPAIR TABLE tab1" - val parsed = parser.parsePlan(sql) - val expected = AlterTableRecoverPartitionsCommand( - TableIdentifier("tab1", None), - "MSCK REPAIR TABLE") - comparePlans(parsed, expected) - } - test("create table like") { 
val v1 = "CREATE TABLE table1 LIKE table2" - val (target, source, location, exists) = parser.parsePlan(v1).collect { - case CreateTableLikeCommand(t, s, l, allowExisting) => (t, s, l, allowExisting) - }.head + val (target, source, fileFormat, provider, properties, exists) = + parser.parsePlan(v1).collect { + case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head assert(exists == false) assert(target.database.isEmpty) assert(target.table == "table1") assert(source.database.isEmpty) assert(source.table == "table2") - assert(location.isEmpty) + assert(fileFormat.locationUri.isEmpty) + assert(provider.isEmpty) val v2 = "CREATE TABLE IF NOT EXISTS table1 LIKE table2" - val (target2, source2, location2, exists2) = parser.parsePlan(v2).collect { - case CreateTableLikeCommand(t, s, l, allowExisting) => (t, s, l, allowExisting) - }.head + val (target2, source2, fileFormat2, provider2, properties2, exists2) = + parser.parsePlan(v2).collect { + case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head assert(exists2) assert(target2.database.isEmpty) assert(target2.table == "table1") assert(source2.database.isEmpty) assert(source2.table == "table2") - assert(location2.isEmpty) + assert(fileFormat2.locationUri.isEmpty) + assert(provider2.isEmpty) val v3 = "CREATE TABLE table1 LIKE table2 LOCATION '/spark/warehouse'" - val (target3, source3, location3, exists3) = parser.parsePlan(v3).collect { - case CreateTableLikeCommand(t, s, l, allowExisting) => (t, s, l, allowExisting) - }.head + val (target3, source3, fileFormat3, provider3, properties3, exists3) = + parser.parsePlan(v3).collect { + case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head assert(!exists3) assert(target3.database.isEmpty) assert(target3.table == "table1") assert(source3.database.isEmpty) assert(source3.table == "table2") - assert(location3 == Some("/spark/warehouse")) - - val v4 = "CREATE TABLE IF NOT EXISTS table1 LIKE table2 LOCATION 
'/spark/warehouse'" - val (target4, source4, location4, exists4) = parser.parsePlan(v4).collect { - case CreateTableLikeCommand(t, s, l, allowExisting) => (t, s, l, allowExisting) - }.head + assert(fileFormat3.locationUri.map(_.toString) == Some("/spark/warehouse")) + assert(provider3.isEmpty) + + val v4 = "CREATE TABLE IF NOT EXISTS table1 LIKE table2 LOCATION '/spark/warehouse'" + val (target4, source4, fileFormat4, provider4, properties4, exists4) = + parser.parsePlan(v4).collect { + case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head assert(exists4) assert(target4.database.isEmpty) assert(target4.table == "table1") assert(source4.database.isEmpty) assert(source4.table == "table2") - assert(location4 == Some("/spark/warehouse")) - } - - test("load data") { - val v1 = "LOAD DATA INPATH 'path' INTO TABLE table1" - val (table, path, isLocal, isOverwrite, partition) = parser.parsePlan(v1).collect { - case LoadDataCommand(t, path, l, o, partition) => (t, path, l, o, partition) - }.head - assert(table.database.isEmpty) - assert(table.table == "table1") - assert(path == "path") - assert(!isLocal) - assert(!isOverwrite) - assert(partition.isEmpty) - - val v2 = "LOAD DATA LOCAL INPATH 'path' OVERWRITE INTO TABLE table1 PARTITION(c='1', d='2')" - val (table2, path2, isLocal2, isOverwrite2, partition2) = parser.parsePlan(v2).collect { - case LoadDataCommand(t, path, l, o, partition) => (t, path, l, o, partition) - }.head - assert(table2.database.isEmpty) - assert(table2.table == "table1") - assert(path2 == "path") - assert(isLocal2) - assert(isOverwrite2) - assert(partition2.nonEmpty) - assert(partition2.get.apply("c") == "1" && partition2.get.apply("d") == "2") + assert(fileFormat4.locationUri.map(_.toString) == Some("/spark/warehouse")) + assert(provider4.isEmpty) + + val v5 = "CREATE TABLE IF NOT EXISTS table1 LIKE table2 USING parquet" + val (target5, source5, fileFormat5, provider5, properties5, exists5) = + parser.parsePlan(v5).collect { + 
case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head + assert(exists5) + assert(target5.database.isEmpty) + assert(target5.table == "table1") + assert(source5.database.isEmpty) + assert(source5.table == "table2") + assert(fileFormat5.locationUri.isEmpty) + assert(provider5 == Some("parquet")) + + val v6 = "CREATE TABLE IF NOT EXISTS table1 LIKE table2 USING ORC" + val (target6, source6, fileFormat6, provider6, properties6, exists6) = + parser.parsePlan(v6).collect { + case CreateTableLikeCommand(t, s, f, p, pr, e) => (t, s, f, p, pr, e) + }.head + assert(exists6) + assert(target6.database.isEmpty) + assert(target6.table == "table1") + assert(source6.database.isEmpty) + assert(source6.table == "table2") + assert(fileFormat6.locationUri.isEmpty) + assert(provider6 == Some("ORC")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index fd1da2011f28e..dbf4b09403423 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -21,16 +21,18 @@ import java.io.{File, PrintWriter} import java.net.URI import java.util.Locale -import org.apache.hadoop.fs.Path -import org.scalatest.BeforeAndAfterEach +import org.apache.hadoop.fs.{Path, RawLocalFileSystem} +import org.apache.hadoop.fs.permission.{AclEntry, AclEntryScope, AclEntryType, AclStatus, FsAction, FsPermission} +import org.apache.spark.{SparkException, SparkFiles} import org.apache.spark.internal.config import org.apache.spark.internal.config.RDD_PARALLEL_LISTING_THRESHOLD import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchPartitionException, NoSuchTableException, TempTableAlreadyExistsException} +import 
org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, NoSuchDatabaseException, NoSuchPartitionException, NoSuchTableException, TempTableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} @@ -82,7 +84,7 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSparkSession { val tabName = "tbl" withTable(tabName) { val e = intercept[AnalysisException] { - sql(s"CREATE TABLE $tabName (i INT, j STRING)") + sql(s"CREATE TABLE $tabName (i INT, j STRING) STORED AS parquet") }.getMessage assert(e.contains("Hive support is required to CREATE Hive TABLE")) } @@ -110,13 +112,13 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSparkSession { import testImplicits._ withTable("t", "t1") { var e = intercept[AnalysisException] { - sql("CREATE TABLE t SELECT 1 as a, 1 as b") + sql("CREATE TABLE t STORED AS parquet SELECT 1 as a, 1 as b") }.getMessage assert(e.contains("Hive support is required to CREATE Hive TABLE (AS SELECT)")) spark.range(1).select('id as 'a, 'id as 'b).write.saveAsTable("t1") e = intercept[AnalysisException] { - sql("CREATE TABLE t SELECT a, b from t1") + sql("CREATE TABLE t STORED AS parquet SELECT a, b from t1") }.getMessage assert(e.contains("Hive support is required to CREATE Hive TABLE (AS SELECT)")) } @@ -150,9 +152,9 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSparkSession { Seq(3 -> "c").toDF("i", "j").write.mode("append").saveAsTable("t") checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Row(3, "c") :: Nil) - Seq("c" -> 3).toDF("i", 
"j").write.mode("append").saveAsTable("t") + Seq(3.5 -> 3).toDF("i", "j").write.mode("append").saveAsTable("t") checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Row(3, "c") - :: Row(null, "3") :: Nil) + :: Row(3, "3") :: Nil) Seq(4 -> "d").toDF("i", "j").write.saveAsTable("t1") @@ -168,10 +170,66 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSparkSession { assert(e.message.contains("It doesn't match the specified format")) } } + + test("throw exception if Create Table LIKE USING Hive built-in ORC in in-memory catalog") { + val catalog = spark.sessionState.catalog + withTable("s", "t") { + sql("CREATE TABLE s(a INT, b INT) USING parquet") + val source = catalog.getTableMetadata(TableIdentifier("s")) + assert(source.provider == Some("parquet")) + val e = intercept[AnalysisException] { + sql("CREATE TABLE t LIKE s USING org.apache.spark.sql.hive.orc") + }.getMessage + assert(e.contains("Hive built-in ORC data source must be used with Hive support enabled")) + } + } + + test("ALTER TABLE ALTER COLUMN with position is not supported") { + withTable("t") { + sql("CREATE TABLE t(i INT) USING parquet") + val e = intercept[AnalysisException] { + sql("ALTER TABLE t ALTER COLUMN i FIRST") + } + assert(e.message.contains("ALTER COLUMN ... 
FIRST | AFTER is only supported with v2 tables")) + } + } + + test("SPARK-25403 refresh the table after inserting data") { + withTable("t") { + val catalog = spark.sessionState.catalog + val table = QualifiedTableName(catalog.getCurrentDatabase, "t") + sql("CREATE TABLE t (a INT) USING parquet") + sql("INSERT INTO TABLE t VALUES (1)") + assert(catalog.getCachedTable(table) === null, "Table relation should be invalidated.") + assert(spark.table("t").count() === 1) + assert(catalog.getCachedTable(table) !== null, "Table relation should be cached.") + } + } + + test("SPARK-19784 refresh the table after altering the table location") { + withTable("t") { + withTempDir { dir => + val catalog = spark.sessionState.catalog + val table = QualifiedTableName(catalog.getCurrentDatabase, "t") + val p1 = s"${dir.getCanonicalPath}/p1" + val p2 = s"${dir.getCanonicalPath}/p2" + sql(s"CREATE TABLE t (a INT) USING parquet LOCATION '$p1'") + sql("INSERT INTO TABLE t VALUES (1)") + assert(catalog.getCachedTable(table) === null, "Table relation should be invalidated.") + spark.range(5).toDF("a").write.parquet(p2) + spark.sql(s"ALTER TABLE t SET LOCATION '$p2'") + assert(catalog.getCachedTable(table) === null, "Table relation should be invalidated.") + assert(spark.table("t").count() === 5) + assert(catalog.getCachedTable(table) !== null, "Table relation should be cached.") + } + } + } } abstract class DDLSuite extends QueryTest with SQLTestUtils { + protected val reversedProperties = Seq(PROP_OWNER) + protected def isUsingHiveMetastore: Boolean = { spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "hive" } @@ -315,7 +373,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { try { sql(s"CREATE DATABASE $dbName") val db1 = catalog.getDatabaseMetadata(dbName) - assert(db1 == CatalogDatabase( + assert(db1.copy(properties = db1.properties -- reversedProperties) == CatalogDatabase( dbName, "", getDBPath(dbName), @@ -338,7 +396,7 @@ abstract class DDLSuite extends QueryTest 
with SQLTestUtils { sql(s"CREATE DATABASE $dbName Location '$path'") val db1 = catalog.getDatabaseMetadata(dbNameWithoutBackTicks) val expPath = makeQualifiedPath(tmpDir.toString) - assert(db1 == CatalogDatabase( + assert(db1.copy(properties = db1.properties -- reversedProperties) == CatalogDatabase( dbNameWithoutBackTicks, "", expPath, @@ -361,7 +419,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val dbNameWithoutBackTicks = cleanIdentifier(dbName) sql(s"CREATE DATABASE $dbName") val db1 = catalog.getDatabaseMetadata(dbNameWithoutBackTicks) - assert(db1 == CatalogDatabase( + assert(db1.copy(properties = db1.properties -- reversedProperties) == CatalogDatabase( dbNameWithoutBackTicks, "", getDBPath(dbNameWithoutBackTicks), @@ -734,29 +792,55 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql(s"CREATE DATABASE $dbName") checkAnswer( - sql(s"DESCRIBE DATABASE EXTENDED $dbName"), + sql(s"DESCRIBE DATABASE EXTENDED $dbName").toDF("key", "value") + .where("key not like 'Owner%'"), // filter for consistency with in-memory catalog Row("Database Name", dbNameWithoutBackTicks) :: - Row("Description", "") :: + Row("Comment", "") :: Row("Location", CatalogUtils.URIToString(location)) :: Row("Properties", "") :: Nil) sql(s"ALTER DATABASE $dbName SET DBPROPERTIES ('a'='a', 'b'='b', 'c'='c')") checkAnswer( - sql(s"DESCRIBE DATABASE EXTENDED $dbName"), + sql(s"DESCRIBE DATABASE EXTENDED $dbName").toDF("key", "value") + .where("key not like 'Owner%'"), // filter for consistency with in-memory catalog Row("Database Name", dbNameWithoutBackTicks) :: - Row("Description", "") :: + Row("Comment", "") :: Row("Location", CatalogUtils.URIToString(location)) :: Row("Properties", "((a,a), (b,b), (c,c))") :: Nil) sql(s"ALTER DATABASE $dbName SET DBPROPERTIES ('d'='d')") checkAnswer( - sql(s"DESCRIBE DATABASE EXTENDED $dbName"), + sql(s"DESCRIBE DATABASE EXTENDED $dbName").toDF("key", "value") + .where("key not like 'Owner%'"), // filter for consistency 
with in-memory catalog Row("Database Name", dbNameWithoutBackTicks) :: - Row("Description", "") :: + Row("Comment", "") :: Row("Location", CatalogUtils.URIToString(location)) :: Row("Properties", "((a,a), (b,b), (c,c), (d,d))") :: Nil) + + withTempDir { tmpDir => + if (isUsingHiveMetastore) { + val e1 = intercept[AnalysisException] { + sql(s"ALTER DATABASE $dbName SET LOCATION '${tmpDir.toURI}'") + } + assert(e1.getMessage.contains("does not support altering database location")) + } else { + sql(s"ALTER DATABASE $dbName SET LOCATION '${tmpDir.toURI}'") + val uriInCatalog = catalog.getDatabaseMetadata(dbNameWithoutBackTicks).locationUri + assert("file" === uriInCatalog.getScheme) + assert(new Path(tmpDir.getPath).toUri.getPath === uriInCatalog.getPath) + } + + intercept[NoSuchDatabaseException] { + sql(s"ALTER DATABASE `db-not-exist` SET LOCATION '${tmpDir.toURI}'") + } + + val e3 = intercept[IllegalArgumentException] { + sql(s"ALTER DATABASE $dbName SET LOCATION ''") + } + assert(e3.getMessage.contains("Can not create a Path from an empty string")) + } } finally { catalog.reset() } @@ -1134,7 +1218,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } - protected def testRecoverPartitions() { + protected def testRecoverPartitions(): Unit = { val catalog = spark.sessionState.catalog // table to alter does not exist intercept[AnalysisException] { @@ -1372,7 +1456,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { // if (isUsingHiveMetastore) { // assert(storageFormat.properties.get("path") === expected) // } - assert(storageFormat.locationUri.map(_.getPath) === Some(expected.getPath)) + assert(storageFormat.locationUri === + Some(makeQualifiedPath(CatalogUtils.URIToString(expected)))) } // set table location sql("ALTER TABLE dbx.tab1 SET LOCATION '/path/to/your/lovely/heart'") @@ -1386,7 +1471,9 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { verifyLocation(new URI("/swanky/steak/place")) // set table partition 
location without explicitly specifying database sql("ALTER TABLE tab1 PARTITION (a='1', b='2') SET LOCATION 'vienna'") - verifyLocation(new URI("vienna"), Some(partSpec)) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tab1")) + val viennaPartPath = new Path(new Path(table.location), "vienna") + verifyLocation(CatalogUtils.stringToURI(viennaPartPath.toString), Some(partSpec)) // table to alter does not exist intercept[AnalysisException] { sql("ALTER TABLE dbx.does_not_exist SET LOCATION '/mister/spark'") @@ -1550,13 +1637,11 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { "PARTITION (a='2', b='6') LOCATION 'paris' PARTITION (a='3', b='7')") assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1, part2, part3)) assert(catalog.getPartition(tableIdent, part1).storage.locationUri.isDefined) - val partitionLocation = if (isUsingHiveMetastore) { - val tableLocation = catalog.getTableMetadata(tableIdent).storage.locationUri - assert(tableLocation.isDefined) - makeQualifiedPath(new Path(tableLocation.get.toString, "paris").toString) - } else { - new URI("paris") - } + + val tableLocation = catalog.getTableMetadata(tableIdent).storage.locationUri + assert(tableLocation.isDefined) + val partitionLocation = makeQualifiedPath( + new Path(tableLocation.get.toString, "paris").toString) assert(catalog.getPartition(tableIdent, part2).storage.locationUri == Option(partitionLocation)) assert(catalog.getPartition(tableIdent, part3).storage.locationUri.isDefined) @@ -1701,7 +1786,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { column.map(_.metadata).getOrElse(Metadata.empty) } // Ensure that change column will preserve other metadata fields. 
- sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 col1 INT COMMENT 'this is col1'") + sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 TYPE INT") + sql("ALTER TABLE dbx.tab1 CHANGE COLUMN col1 COMMENT 'this is col1'") assert(getMetadata("col1").getString("key") == "value") assert(getMetadata("col1").getString("comment") == "this is col1") } @@ -1929,6 +2015,79 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("SPARK-30312: truncate table - keep acl/permission") { + import testImplicits._ + val ignorePermissionAcl = Seq(true, false) + + ignorePermissionAcl.foreach { ignore => + withSQLConf( + "fs.file.impl" -> classOf[FakeLocalFsFileSystem].getName, + "fs.file.impl.disable.cache" -> "true", + SQLConf.TRUNCATE_TABLE_IGNORE_PERMISSION_ACL.key -> ignore.toString) { + withTable("tab1") { + sql("CREATE TABLE tab1 (col INT) USING parquet") + sql("INSERT INTO tab1 SELECT 1") + checkAnswer(spark.table("tab1"), Row(1)) + + val tablePath = new Path(spark.sessionState.catalog + .getTableMetadata(TableIdentifier("tab1")).storage.locationUri.get) + + val hadoopConf = spark.sessionState.newHadoopConf() + val fs = tablePath.getFileSystem(hadoopConf) + val fileStatus = fs.getFileStatus(tablePath); + + fs.setPermission(tablePath, new FsPermission("777")) + assert(fileStatus.getPermission().toString() == "rwxrwxrwx") + + // Set ACL to table path. 
+ val customAcl = new java.util.ArrayList[AclEntry]() + customAcl.add(new AclEntry.Builder() + .setName("test") + .setType(AclEntryType.USER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.READ).build()) + fs.setAcl(tablePath, customAcl) + assert(fs.getAclStatus(tablePath).getEntries().get(0) == customAcl.get(0)) + + sql("TRUNCATE TABLE tab1") + assert(spark.table("tab1").collect().isEmpty) + + val fileStatus2 = fs.getFileStatus(tablePath) + if (ignore) { + assert(fileStatus2.getPermission().toString() != "rwxrwxrwx") + } else { + assert(fileStatus2.getPermission().toString() == "rwxrwxrwx") + } + val aclEntries = fs.getAclStatus(tablePath).getEntries() + if (ignore) { + assert(aclEntries.size() == 0) + } else { + assert(aclEntries.size() == 4) + assert(aclEntries.get(0) == customAcl.get(0)) + + // Setting ACLs will also set user/group/other permissions + // as ACL entries. + val user = new AclEntry.Builder() + .setType(AclEntryType.USER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val group = new AclEntry.Builder() + .setType(AclEntryType.GROUP) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + val other = new AclEntry.Builder() + .setType(AclEntryType.OTHER) + .setScope(AclEntryScope.ACCESS) + .setPermission(FsAction.ALL).build() + assert(aclEntries.get(1) == user) + assert(aclEntries.get(2) == group) + assert(aclEntries.get(3) == other) + } + } + } + } + } + test("create temporary view with mismatched schema") { withTable("tab1") { spark.range(10).write.saveAsTable("tab1") @@ -2040,7 +2199,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { test("show functions") { withUserDefinedFunction("add_one" -> true) { - val numFunctions = FunctionRegistry.functionSet.size.toLong + val numFunctions = FunctionRegistry.functionSet.size.toLong + + FunctionsCommand.virtualOperators.size.toLong assert(sql("show functions").count() === numFunctions) assert(sql("show system functions").count() 
=== numFunctions) assert(sql("show all functions").count() === numFunctions) @@ -2064,7 +2224,9 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val message = intercept[AnalysisException] { sql(s"SHOW COLUMNS IN $db.showcolumn FROM ${db.toUpperCase(Locale.ROOT)}") }.getMessage - assert(message.contains("SHOW COLUMNS with conflicting databases")) + assert(message.contains( + s"SHOW COLUMNS with conflicting databases: " + + s"'${db.toUpperCase(Locale.ROOT)}' != '$db'")) } } } @@ -2138,7 +2300,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { spark.sessionState.catalog.refreshTable(TableIdentifier("t")) val table1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) - assert(table1.location == newDir) + assert(table1.location == makeQualifiedPath(newDir.toString)) assert(!newDirFile.exists) spark.sql("INSERT INTO TABLE t SELECT 'c', 1") @@ -2501,6 +2663,13 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { assert(table.location.toString.startsWith("file:/")) } + withTempDir { dir => + assert(!dir.getAbsolutePath.startsWith("file:/")) + spark.sql(s"ALTER TABLE t SET LOCATION '$dir'") + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) + assert(table.location.toString.startsWith("file:/")) + } + withTempDir { dir => assert(!dir.getAbsolutePath.startsWith("file:/")) // The parser does not recognize the backslashes on Windows as they are. 
@@ -2519,6 +2688,37 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("the qualified path of a partition is stored in the catalog") { + withTable("t") { + withTempDir { dir => + spark.sql( + s""" + |CREATE TABLE t(a STRING, b STRING) + |USING ${dataSource} PARTITIONED BY(b) LOCATION '$dir' + """.stripMargin) + spark.sql("INSERT INTO TABLE t PARTITION(b=1) SELECT 2") + val part = spark.sessionState.catalog.getPartition(TableIdentifier("t"), Map("b" -> "1")) + assert(part.storage.locationUri.contains( + makeQualifiedPath(new File(dir, "b=1").getAbsolutePath))) + assert(part.storage.locationUri.get.toString.startsWith("file:/")) + } + withTempDir { dir => + spark.sql(s"ALTER TABLE t PARTITION(b=1) SET LOCATION '$dir'") + + val part = spark.sessionState.catalog.getPartition(TableIdentifier("t"), Map("b" -> "1")) + assert(part.storage.locationUri.contains(makeQualifiedPath(dir.getAbsolutePath))) + assert(part.storage.locationUri.get.toString.startsWith("file:/")) + } + + withTempDir { dir => + spark.sql(s"ALTER TABLE t ADD PARTITION(b=2) LOCATION '$dir'") + val part = spark.sessionState.catalog.getPartition(TableIdentifier("t"), Map("b" -> "2")) + assert(part.storage.locationUri.contains(makeQualifiedPath(dir.getAbsolutePath))) + assert(part.storage.locationUri.get.toString.startsWith("file:/")) + } + } + } + protected def testAddColumn(provider: String): Unit = { withTable("t1") { sql(s"CREATE TABLE t1 (c1 int) USING $provider") @@ -2599,7 +2799,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { val e = intercept[AnalysisException] { sql("ALTER TABLE tmp_v ADD COLUMNS (c3 INT)") } - assert(e.message.contains("ALTER ADD COLUMNS does not support views")) + assert(e.message.contains("'tmp_v' is a view not a table")) } } @@ -2755,4 +2955,74 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } } + + test("Create Table LIKE USING provider") { + val catalog = spark.sessionState.catalog + withTable("s", "t1", "t2", 
"t3", "t4") { + sql("CREATE TABLE s(a INT, b INT) USING parquet") + val source = catalog.getTableMetadata(TableIdentifier("s")) + assert(source.provider == Some("parquet")) + + sql("CREATE TABLE t1 LIKE s USING orc") + val table1 = catalog.getTableMetadata(TableIdentifier("t1")) + assert(table1.provider == Some("orc")) + + sql("CREATE TABLE t2 LIKE s USING hive") + val table2 = catalog.getTableMetadata(TableIdentifier("t2")) + assert(table2.provider == Some("hive")) + + val e1 = intercept[ClassNotFoundException] { + sql("CREATE TABLE t3 LIKE s USING unknown") + }.getMessage + assert(e1.contains("Failed to find data source")) + + withGlobalTempView("src") { + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") + sql(s"CREATE TABLE t4 LIKE $globalTempDB.src USING parquet") + val table = catalog.getTableMetadata(TableIdentifier("t4")) + assert(table.provider == Some("parquet")) + } + } + } + + test("Add a directory when spark.sql.legacy.addDirectory.recursive.enabled set to true") { + val directoryToAdd = Utils.createTempDir("/tmp/spark/addDirectory/") + val testFile = File.createTempFile("testFile", "1", directoryToAdd) + spark.sql(s"ADD FILE $directoryToAdd") + assert(new File(SparkFiles.get(s"${directoryToAdd.getName}/${testFile.getName}")).exists()) + } + + test("Add a directory when spark.sql.legacy.addDirectory.recursive.enabled not set to true") { + withTempDir { testDir => + withSQLConf(SQLConf.LEGACY_ADD_DIRECTORY_USING_RECURSIVE.key -> "false") { + val msg = intercept[SparkException] { + spark.sql(s"ADD FILE $testDir") + }.getMessage + assert(msg.contains("is a directory and recursive is not turned on")) + } + } + } +} + +object FakeLocalFsFileSystem { + var aclStatus = new AclStatus.Builder().build() +} + +// A fake test local filesystem used to test ACL. It keeps a ACL status. If deletes +// a path of this filesystem, it will clean up the ACL status. 
Note that for test purpose, +// it has only one ACL status for all paths. +class FakeLocalFsFileSystem extends RawLocalFileSystem { + import FakeLocalFsFileSystem._ + + override def delete(f: Path, recursive: Boolean): Boolean = { + aclStatus = new AclStatus.Builder().build() + super.delete(f, recursive) + } + + override def getAclStatus(path: Path): AclStatus = aclStatus + + override def setAcl(path: Path, aclSpec: java.util.List[AclEntry]): Unit = { + aclStatus = new AclStatus.Builder().addEntries(aclSpec).build() + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index bba1dc0f697a1..d439e5b1cd651 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -18,57 +18,102 @@ package org.apache.spark.sql.execution.command import java.net.URI -import java.util.Locale +import java.util.{Collections, Locale} import org.mockito.ArgumentMatchers.any import org.mockito.Mockito.{mock, when} import org.mockito.invocation.InvocationOnMock import org.apache.spark.sql.{AnalysisException, SaveMode} -import org.apache.spark.sql.catalog.v2.{CatalogManager, CatalogNotFoundException, Identifier, TableCatalog} -import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.analysis.AnalysisTest -import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, Analyzer, CTESubstitution, EmptyFunctionRegistry, NoSuchTableException, ResolveCatalogs, ResolvedTable, ResolveInlineTables, ResolveSessionCatalog, UnresolvedAttribute, UnresolvedRelation, UnresolvedStar, UnresolvedSubqueryColumnAliases, 
UnresolvedV2Relation} +import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogStorageFormat, CatalogTable, CatalogTableType, InMemoryCatalog, SessionCatalog} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, EqualTo, Expression, InSubquery, IntegerLiteral, ListQuery, StringLiteral} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.catalyst.plans.logical.{CreateTableAsSelect, CreateV2Table, DropTable, LogicalPlan} -import org.apache.spark.sql.connector.InMemoryTableCatalog -import org.apache.spark.sql.execution.datasources.{CreateTable, DataSourceResolution} -import org.apache.spark.sql.internal.SQLConf.DEFAULT_V2_CATALOG -import org.apache.spark.sql.sources.v2.InMemoryTableProvider -import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructType} -import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.sql.catalyst.plans.logical.{AlterTable, Assignment, CreateTableAsSelect, CreateV2Table, DeleteAction, DeleteFromTable, DescribeRelation, DropTable, InsertAction, InsertIntoStatement, LocalRelation, LogicalPlan, MergeIntoTable, OneRowRelation, Project, ShowTableProperties, SubqueryAlias, UpdateAction, UpdateTable} +import org.apache.spark.sql.connector.FakeV2Provider +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, TableCapability, TableCatalog, TableChange, V1Table} +import org.apache.spark.sql.connector.catalog.TableChange.{UpdateColumnComment, UpdateColumnType} +import org.apache.spark.sql.execution.datasources.CreateTable +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{CharType, DoubleType, HIVE_TYPE_STRING, IntegerType, LongType, MetadataBuilder, StringType, StructField, StructType} class PlanResolutionSuite extends AnalysisTest { import CatalystSqlParser._ - private val 
v2Format = classOf[InMemoryTableProvider].getName + private val v2Format = classOf[FakeV2Provider].getName + + private val table: Table = { + val t = mock(classOf[Table]) + when(t.schema()).thenReturn(new StructType().add("i", "int").add("s", "string")) + t + } + + private val tableWithAcceptAnySchemaCapability: Table = { + val t = mock(classOf[Table]) + when(t.schema()).thenReturn(new StructType().add("i", "int")) + when(t.capabilities()).thenReturn(Collections.singleton(TableCapability.ACCEPT_ANY_SCHEMA)) + t + } + + private val v1Table: V1Table = { + val t = mock(classOf[CatalogTable]) + when(t.schema).thenReturn(new StructType().add("i", "int").add("s", "string")) + when(t.tableType).thenReturn(CatalogTableType.MANAGED) + V1Table(t) + } private val testCat: TableCatalog = { - val newCatalog = new InMemoryTableCatalog - newCatalog.initialize("testcat", CaseInsensitiveStringMap.empty()) + val newCatalog = mock(classOf[TableCatalog]) + when(newCatalog.loadTable(any())).thenAnswer((invocation: InvocationOnMock) => { + invocation.getArgument[Identifier](0).name match { + case "tab" => table + case "tab1" => table + case name => throw new NoSuchTableException(name) + } + }) + when(newCatalog.name()).thenReturn("testcat") newCatalog } - private val v2SessionCatalog = { - val newCatalog = new InMemoryTableCatalog - newCatalog.initialize("session", CaseInsensitiveStringMap.empty()) + private val v2SessionCatalog: TableCatalog = { + val newCatalog = mock(classOf[TableCatalog]) + when(newCatalog.loadTable(any())).thenAnswer((invocation: InvocationOnMock) => { + invocation.getArgument[Identifier](0).name match { + case "v1Table" => v1Table + case "v1Table1" => v1Table + case "v2Table" => table + case "v2Table1" => table + case "v2TableWithAcceptAnySchemaCapability" => tableWithAcceptAnySchemaCapability + case name => throw new NoSuchTableException(name) + } + }) + when(newCatalog.name()).thenReturn(CatalogManager.SESSION_CATALOG_NAME) newCatalog } + private val 
v1SessionCatalog: SessionCatalog = new SessionCatalog( + new InMemoryCatalog, + EmptyFunctionRegistry, + new SQLConf().copy(SQLConf.CASE_SENSITIVE -> true)) + v1SessionCatalog.createTempView("v", LocalRelation(Nil), false) + private val catalogManagerWithDefault = { val manager = mock(classOf[CatalogManager]) when(manager.catalog(any())).thenAnswer((invocation: InvocationOnMock) => { invocation.getArgument[String](0) match { case "testcat" => testCat - case "session" => + case CatalogManager.SESSION_CATALOG_NAME => v2SessionCatalog case name => throw new CatalogNotFoundException(s"No such catalog: $name") } }) - when(manager.defaultCatalog).thenReturn(Some(testCat)) - when(manager.v2SessionCatalog).thenCallRealMethod() + when(manager.currentCatalog).thenReturn(testCat) + when(manager.currentNamespace).thenReturn(Array.empty[String]) + when(manager.v1SessionCatalog).thenReturn(v1SessionCatalog) manager } @@ -78,26 +123,37 @@ class PlanResolutionSuite extends AnalysisTest { invocation.getArgument[String](0) match { case "testcat" => testCat - case "session" => - v2SessionCatalog case name => throw new CatalogNotFoundException(s"No such catalog: $name") } }) - when(manager.defaultCatalog).thenReturn(None) - when(manager.v2SessionCatalog).thenCallRealMethod() + when(manager.currentCatalog).thenReturn(v2SessionCatalog) + when(manager.v1SessionCatalog).thenReturn(v1SessionCatalog) manager } def parseAndResolve(query: String, withDefault: Boolean = false): LogicalPlan = { - val newConf = conf.copy() - newConf.setConfString(DEFAULT_V2_CATALOG.key, "testcat") val catalogManager = if (withDefault) { catalogManagerWithDefault } else { catalogManagerWithoutDefault } - DataSourceResolution(newConf, catalogManager).apply(parsePlan(query)) + val analyzer = new Analyzer(catalogManager, conf) + // TODO: run the analyzer directly. 
+ val rules = Seq( + CTESubstitution, + ResolveInlineTables(conf), + analyzer.ResolveRelations, + new ResolveCatalogs(catalogManager), + new ResolveSessionCatalog(catalogManager, conf, _ == Seq("v")), + analyzer.ResolveTables, + analyzer.ResolveReferences, + analyzer.ResolveSubqueryColumnAliases, + analyzer.ResolveReferences, + analyzer.ResolveAlterTableChanges) + rules.foldLeft(parsePlan(query)) { + case (plan, rule) => rule.apply(plan) + } } private def parseResolveCompare(query: String, expected: LogicalPlan): Unit = @@ -335,7 +391,7 @@ class PlanResolutionSuite extends AnalysisTest { assert(desc.schema.isEmpty) // will be populated later when the table is actually created assert(desc.comment.contains("This is the staging page view table")) assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewCatalogAndNamespace.isEmpty) assert(desc.viewQueryColumnNames.isEmpty) assert(desc.partitionColumnNames.isEmpty) assert(desc.provider.contains("parquet")) @@ -443,7 +499,7 @@ class PlanResolutionSuite extends AnalysisTest { parseAndResolve(sql) match { case create: CreateV2Table => - assert(create.catalog.name == "session") + assert(create.catalog.name == CatalogManager.SESSION_CATALOG_NAME) assert(create.tableName == Identifier.of(Array("mydb"), "page_view")) assert(create.tableSchema == new StructType() .add("id", LongType) @@ -547,7 +603,7 @@ class PlanResolutionSuite extends AnalysisTest { parseAndResolve(sql) match { case ctas: CreateTableAsSelect => - assert(ctas.catalog.name == "session") + assert(ctas.catalog.name == CatalogManager.SESSION_CATALOG_NAME) assert(ctas.tableName == Identifier.of(Array("mydb"), "page_view")) assert(ctas.properties == expectedProperties) assert(ctas.writeOptions.isEmpty) @@ -647,51 +703,813 @@ class PlanResolutionSuite extends AnalysisTest { // ALTER TABLE table_name SET TBLPROPERTIES ('comment' = new_comment); // ALTER TABLE table_name UNSET TBLPROPERTIES [IF EXISTS] ('comment', 'key'); test("alter 
table: alter table properties") { - val sql1_table = "ALTER TABLE table_name SET TBLPROPERTIES ('test' = 'test', " + - "'comment' = 'new_comment')" - val sql2_table = "ALTER TABLE table_name UNSET TBLPROPERTIES ('comment', 'test')" - val sql3_table = "ALTER TABLE table_name UNSET TBLPROPERTIES IF EXISTS ('comment', 'test')" - - val parsed1_table = parseAndResolve(sql1_table) - val parsed2_table = parseAndResolve(sql2_table) - val parsed3_table = parseAndResolve(sql3_table) + Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { + case (tblName, useV1Command) => + val sql1 = s"ALTER TABLE $tblName SET TBLPROPERTIES ('test' = 'test', " + + "'comment' = 'new_comment')" + val sql2 = s"ALTER TABLE $tblName UNSET TBLPROPERTIES ('comment', 'test')" + val sql3 = s"ALTER TABLE $tblName UNSET TBLPROPERTIES IF EXISTS ('comment', 'test')" + + val parsed1 = parseAndResolve(sql1) + val parsed2 = parseAndResolve(sql2) + val parsed3 = parseAndResolve(sql3) + + val tableIdent = TableIdentifier(tblName, None) + if (useV1Command) { + val expected1 = AlterTableSetPropertiesCommand( + tableIdent, Map("test" -> "test", "comment" -> "new_comment"), isView = false) + val expected2 = AlterTableUnsetPropertiesCommand( + tableIdent, Seq("comment", "test"), ifExists = false, isView = false) + val expected3 = AlterTableUnsetPropertiesCommand( + tableIdent, Seq("comment", "test"), ifExists = true, isView = false) + + comparePlans(parsed1, expected1) + comparePlans(parsed2, expected2) + comparePlans(parsed3, expected3) + } else { + parsed1 match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.setProperty("test", "test"), + TableChange.setProperty("comment", "new_comment"))) + case _ => fail("expect AlterTable") + } + + parsed2 match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.removeProperty("comment"), + TableChange.removeProperty("test"))) + case _ => 
fail("expect AlterTable") + } + + parsed3 match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.removeProperty("comment"), + TableChange.removeProperty("test"))) + case _ => fail("expect AlterTable") + } + } + } - val tableIdent = TableIdentifier("table_name", None) - val expected1_table = AlterTableSetPropertiesCommand( - tableIdent, Map("test" -> "test", "comment" -> "new_comment"), isView = false) - val expected2_table = AlterTableUnsetPropertiesCommand( - tableIdent, Seq("comment", "test"), ifExists = false, isView = false) - val expected3_table = AlterTableUnsetPropertiesCommand( - tableIdent, Seq("comment", "test"), ifExists = true, isView = false) + val sql4 = "ALTER TABLE non_exist SET TBLPROPERTIES ('test' = 'test')" + val sql5 = "ALTER TABLE non_exist UNSET TBLPROPERTIES ('test')" + val parsed4 = parseAndResolve(sql4) + val parsed5 = parseAndResolve(sql5) - comparePlans(parsed1_table, expected1_table) - comparePlans(parsed2_table, expected2_table) - comparePlans(parsed3_table, expected3_table) + // For non-existing tables, we convert it to v2 command with `UnresolvedV2Table` + parsed4 match { + case AlterTable(_, _, _: UnresolvedV2Relation, _) => // OK + case _ => fail("Expect AlterTable, but got:\n" + parsed4.treeString) + } + parsed5 match { + case AlterTable(_, _, _: UnresolvedV2Relation, _) => // OK + case _ => fail("Expect AlterTable, but got:\n" + parsed5.treeString) + } } test("support for other types in TBLPROPERTIES") { - val sql = - """ - |ALTER TABLE table_name - |SET TBLPROPERTIES ('a' = 1, 'b' = 0.1, 'c' = TRUE) - """.stripMargin + Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { + case (tblName, useV1Command) => + val sql = + s""" + |ALTER TABLE $tblName + |SET TBLPROPERTIES ('a' = 1, 'b' = 0.1, 'c' = TRUE) + """.stripMargin + val parsed = parseAndResolve(sql) + if (useV1Command) { + val expected = AlterTableSetPropertiesCommand( + TableIdentifier(tblName), + 
Map("a" -> "1", "b" -> "0.1", "c" -> "true"), + isView = false) + + comparePlans(parsed, expected) + } else { + parsed match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.setProperty("a", "1"), + TableChange.setProperty("b", "0.1"), + TableChange.setProperty("c", "true"))) + case _ => fail("Expect AlterTable, but got:\n" + parsed.treeString) + } + } + } + } + + test("alter table: set location") { + Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { + case (tblName, useV1Command) => + val sql = s"ALTER TABLE $tblName SET LOCATION 'new location'" + val parsed = parseAndResolve(sql) + if (useV1Command) { + val expected = AlterTableSetLocationCommand( + TableIdentifier(tblName, None), + None, + "new location") + comparePlans(parsed, expected) + } else { + parsed match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq(TableChange.setProperty("location", "new location"))) + case _ => fail("Expect AlterTable, but got:\n" + parsed.treeString) + } + } + } + } + + test("DESCRIBE relation") { + Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { + case (tblName, useV1Command) => + val sql1 = s"DESC TABLE $tblName" + val sql2 = s"DESC TABLE EXTENDED $tblName" + val parsed1 = parseAndResolve(sql1) + val parsed2 = parseAndResolve(sql2) + if (useV1Command) { + val expected1 = DescribeTableCommand(TableIdentifier(tblName, None), Map.empty, false) + val expected2 = DescribeTableCommand(TableIdentifier(tblName, None), Map.empty, true) + + comparePlans(parsed1, expected1) + comparePlans(parsed2, expected2) + } else { + parsed1 match { + case DescribeRelation(_: ResolvedTable, _, isExtended) => + assert(!isExtended) + case _ => fail("Expect DescribeTable, but got:\n" + parsed1.treeString) + } + + parsed2 match { + case DescribeRelation(_: ResolvedTable, _, isExtended) => + assert(isExtended) + case _ => fail("Expect DescribeTable, but got:\n" 
+ parsed2.treeString) + } + } + + val sql3 = s"DESC TABLE $tblName PARTITION(a=1)" + val parsed3 = parseAndResolve(sql3) + if (useV1Command) { + val expected3 = DescribeTableCommand( + TableIdentifier(tblName, None), Map("a" -> "1"), false) + comparePlans(parsed3, expected3) + } else { + parsed3 match { + case DescribeRelation(_: ResolvedTable, partitionSpec, isExtended) => + assert(!isExtended) + assert(partitionSpec == Map("a" -> "1")) + case _ => fail("Expect DescribeTable, but got:\n" + parsed2.treeString) + } + } + } + + // use v1 command to describe views. + val sql4 = "DESC TABLE v" + val parsed4 = parseAndResolve(sql4) + assert(parsed4.isInstanceOf[DescribeTableCommand]) + } + + test("DELETE FROM") { + Seq("v2Table", "testcat.tab").foreach { tblName => + val sql1 = s"DELETE FROM $tblName" + val sql2 = s"DELETE FROM $tblName where name='Robert'" + val sql3 = s"DELETE FROM $tblName AS t where t.name='Robert'" + val sql4 = + s""" + |WITH s(name) AS (SELECT 'Robert') + |DELETE FROM $tblName AS t WHERE t.name IN (SELECT s.name FROM s) + """.stripMargin + + val parsed1 = parseAndResolve(sql1) + val parsed2 = parseAndResolve(sql2) + val parsed3 = parseAndResolve(sql3) + val parsed4 = parseAndResolve(sql4) + + parsed1 match { + case DeleteFromTable(AsDataSourceV2Relation(_), None) => + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed1.treeString) + } + + parsed2 match { + case DeleteFromTable( + AsDataSourceV2Relation(_), + Some(EqualTo(name: UnresolvedAttribute, StringLiteral("Robert")))) => + assert(name.name == "name") + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed2.treeString) + } + + parsed3 match { + case DeleteFromTable( + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), + Some(EqualTo(name: UnresolvedAttribute, StringLiteral("Robert")))) => + assert(name.name == "t.name") + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed3.treeString) + } + + parsed4 match { + case DeleteFromTable( + 
SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), + Some(InSubquery(values, query))) => + assert(values.size == 1 && values.head.isInstanceOf[UnresolvedAttribute]) + assert(values.head.asInstanceOf[UnresolvedAttribute].name == "t.name") + query match { + case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), + UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), + _, _, _) => + assert(projects.size == 1 && projects.head.name == "s.name") + assert(outputColumnNames.size == 1 && outputColumnNames.head == "name") + case o => fail("Unexpected subquery: \n" + o.treeString) + } + + case _ => fail("Expect DeleteFromTable, but got:\n" + parsed4.treeString) + } + } + } + + test("UPDATE TABLE") { + Seq("v2Table", "testcat.tab").foreach { tblName => + val sql1 = s"UPDATE $tblName SET name='Robert', age=32" + val sql2 = s"UPDATE $tblName AS t SET name='Robert', age=32" + val sql3 = s"UPDATE $tblName AS t SET name='Robert', age=32 WHERE p=1" + val sql4 = + s""" + |WITH s(name) AS (SELECT 'Robert') + |UPDATE $tblName AS t + |SET t.age=32 + |WHERE t.name IN (SELECT s.name FROM s) + """.stripMargin + + val parsed1 = parseAndResolve(sql1) + val parsed2 = parseAndResolve(sql2) + val parsed3 = parseAndResolve(sql3) + val parsed4 = parseAndResolve(sql4) + + parsed1 match { + case UpdateTable( + AsDataSourceV2Relation(_), + Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), + Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), + None) => + assert(name.name == "name") + assert(age.name == "age") + + case _ => fail("Expect UpdateTable, but got:\n" + parsed1.treeString) + } + + parsed2 match { + case UpdateTable( + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), + Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), + Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), + None) => + assert(name.name == "name") + assert(age.name == "age") + + case _ 
=> fail("Expect UpdateTable, but got:\n" + parsed2.treeString) + } + + parsed3 match { + case UpdateTable( + SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), + Seq(Assignment(name: UnresolvedAttribute, StringLiteral("Robert")), + Assignment(age: UnresolvedAttribute, IntegerLiteral(32))), + Some(EqualTo(p: UnresolvedAttribute, IntegerLiteral(1)))) => + assert(name.name == "name") + assert(age.name == "age") + assert(p.name == "p") + + case _ => fail("Expect UpdateTable, but got:\n" + parsed3.treeString) + } + + parsed4 match { + case UpdateTable(SubqueryAlias(AliasIdentifier("t", Seq()), AsDataSourceV2Relation(_)), + Seq(Assignment(key: UnresolvedAttribute, IntegerLiteral(32))), + Some(InSubquery(values, query))) => + assert(key.name == "t.age") + assert(values.size == 1 && values.head.isInstanceOf[UnresolvedAttribute]) + assert(values.head.asInstanceOf[UnresolvedAttribute].name == "t.name") + query match { + case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), + UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), + _, _, _) => + assert(projects.size == 1 && projects.head.name == "s.name") + assert(outputColumnNames.size == 1 && outputColumnNames.head == "name") + case o => fail("Unexpected subquery: \n" + o.treeString) + } + + case _ => fail("Expect UpdateTable, but got:\n" + parsed4.treeString) + } + } + + val sql = "UPDATE non_existing SET id=1" val parsed = parseAndResolve(sql) - val expected = AlterTableSetPropertiesCommand( - TableIdentifier("table_name"), - Map("a" -> "1", "b" -> "0.1", "c" -> "true"), - isView = false) + parsed match { + case u: UpdateTable => + assert(u.table.isInstanceOf[UnresolvedRelation]) + case _ => fail("Expect UpdateTable, but got:\n" + parsed.treeString) + } + } - comparePlans(parsed, expected) + test("alter table: alter column") { + Seq("v1Table" -> true, "v2Table" -> false, "testcat.tab" -> false).foreach { + case (tblName, useV1Command) => + val sql1 = 
s"ALTER TABLE $tblName ALTER COLUMN i TYPE bigint" + val sql2 = s"ALTER TABLE $tblName ALTER COLUMN i COMMENT 'new comment'" + + val parsed1 = parseAndResolve(sql1) + val parsed2 = parseAndResolve(sql2) + + val tableIdent = TableIdentifier(tblName, None) + if (useV1Command) { + val oldColumn = StructField("i", IntegerType) + val newColumn = StructField("i", LongType) + val expected1 = AlterTableChangeColumnCommand( + tableIdent, "i", newColumn) + val expected2 = AlterTableChangeColumnCommand( + tableIdent, "i", oldColumn.withComment("new comment")) + + comparePlans(parsed1, expected1) + comparePlans(parsed2, expected2) + + val sql3 = s"ALTER TABLE $tblName ALTER COLUMN j COMMENT 'new comment'" + val e1 = intercept[AnalysisException] { + parseAndResolve(sql3) + } + assert(e1.getMessage.contains( + "ALTER COLUMN cannot find column j in v1 table. Available: i, s")) + + val sql4 = s"ALTER TABLE $tblName ALTER COLUMN a.b.c TYPE bigint" + val e2 = intercept[AnalysisException] { + parseAndResolve(sql4) + } + assert(e2.getMessage.contains( + "ALTER COLUMN with qualified column is only supported with v2 tables")) + + val sql5 = s"ALTER TABLE $tblName ALTER COLUMN i TYPE char(1)" + val builder = new MetadataBuilder + builder.putString(HIVE_TYPE_STRING, CharType(1).catalogString) + val newColumnWithCleanedType = StructField("i", StringType, true, builder.build()) + val expected5 = AlterTableChangeColumnCommand( + tableIdent, "i", newColumnWithCleanedType) + val parsed5 = parseAndResolve(sql5) + comparePlans(parsed5, expected5) + } else { + parsed1 match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.updateColumnType(Array("i"), LongType))) + case _ => fail("expect AlterTable") + } + + parsed2 match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes == Seq( + TableChange.updateColumnComment(Array("i"), "new comment"))) + case _ => fail("expect AlterTable") + } + } + } } - test("alter table: set 
location") { - val sql1 = "ALTER TABLE table_name SET LOCATION 'new location'" - val parsed1 = parseAndResolve(sql1) - val tableIdent = TableIdentifier("table_name", None) - val expected1 = AlterTableSetLocationCommand( - tableIdent, - None, - "new location") - comparePlans(parsed1, expected1) + test("alter table: alter column action is not specified") { + val e = intercept[AnalysisException] { + parseAndResolve("ALTER TABLE v1Table ALTER COLUMN i") + } + assert(e.getMessage.contains( + "ALTER TABLE table ALTER COLUMN requires a TYPE, a SET/DROP, a COMMENT, or a FIRST/AFTER")) + } + + test("alter table: alter column case sensitivity for v1 table") { + val tblName = "v1Table" + Seq(true, false).foreach { caseSensitive => + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + val sql = s"ALTER TABLE $tblName ALTER COLUMN I COMMENT 'new comment'" + if (caseSensitive) { + val e = intercept[AnalysisException] { + parseAndResolve(sql) + } + assert(e.getMessage.contains( + "ALTER COLUMN cannot find column I in v1 table. 
Available: i, s")) + } else { + val actual = parseAndResolve(sql) + val expected = AlterTableChangeColumnCommand( + TableIdentifier(tblName, None), + "I", + StructField("I", IntegerType).withComment("new comment")) + comparePlans(actual, expected) + } + } + } + } + + test("alter table: hive style change column") { + Seq("v2Table", "testcat.tab").foreach { tblName => + parseAndResolve(s"ALTER TABLE $tblName CHANGE COLUMN i i int COMMENT 'an index'") match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes.length == 1, "Should only have a comment change") + assert(changes.head.isInstanceOf[UpdateColumnComment], + s"Expected only a UpdateColumnComment change but got: ${changes.head}") + case _ => fail("expect AlterTable") + } + + parseAndResolve(s"ALTER TABLE $tblName CHANGE COLUMN i i long COMMENT 'an index'") match { + case AlterTable(_, _, _: DataSourceV2Relation, changes) => + assert(changes.length == 2, "Should have a comment change and type change") + assert(changes.exists(_.isInstanceOf[UpdateColumnComment]), + s"Expected UpdateColumnComment change but got: ${changes}") + assert(changes.exists(_.isInstanceOf[UpdateColumnType]), + s"Expected UpdateColumnType change but got: ${changes}") + case _ => fail("expect AlterTable") + } + } + } + + val DSV2ResolutionTests = { + val v2SessionCatalogTable = s"${CatalogManager.SESSION_CATALOG_NAME}.v2Table" + Seq( + ("ALTER TABLE testcat.tab ALTER COLUMN i TYPE bigint", false), + ("ALTER TABLE tab ALTER COLUMN i TYPE bigint", false), + (s"ALTER TABLE $v2SessionCatalogTable ALTER COLUMN i TYPE bigint", true), + ("INSERT INTO TABLE tab VALUES (1)", false), + ("INSERT INTO TABLE testcat.tab VALUES (1)", false), + (s"INSERT INTO TABLE $v2SessionCatalogTable VALUES (1)", true), + ("DESC TABLE tab", false), + ("DESC TABLE testcat.tab", false), + (s"DESC TABLE $v2SessionCatalogTable", true), + ("SHOW TBLPROPERTIES tab", false), + ("SHOW TBLPROPERTIES testcat.tab", false), + (s"SHOW TBLPROPERTIES 
$v2SessionCatalogTable", true), + ("SELECT * from tab", false), + ("SELECT * from testcat.tab", false), + (s"SELECT * from ${CatalogManager.SESSION_CATALOG_NAME}.v2Table", true) + ) + } + + DSV2ResolutionTests.foreach { case (sql, isSessionCatlog) => + test(s"Data source V2 relation resolution '$sql'") { + val parsed = parseAndResolve(sql, withDefault = true) + val catlogIdent = if (isSessionCatlog) v2SessionCatalog else testCat + val tableIdent = if (isSessionCatlog) "v2Table" else "tab" + parsed match { + case AlterTable(_, _, r: DataSourceV2Relation, _) => + assert(r.catalog.exists(_ == catlogIdent)) + assert(r.identifier.exists(_.name() == tableIdent)) + case Project(_, AsDataSourceV2Relation(r)) => + assert(r.catalog.exists(_ == catlogIdent)) + assert(r.identifier.exists(_.name() == tableIdent)) + case InsertIntoStatement(r: DataSourceV2Relation, _, _, _, _) => + assert(r.catalog.exists(_ == catlogIdent)) + assert(r.identifier.exists(_.name() == tableIdent)) + case DescribeRelation(r: ResolvedTable, _, _) => + assert(r.catalog == catlogIdent) + assert(r.identifier.name() == tableIdent) + case ShowTableProperties(r: ResolvedTable, _) => + assert(r.catalog == catlogIdent) + assert(r.identifier.name() == tableIdent) + case ShowTablePropertiesCommand(t: TableIdentifier, _) => + assert(t.identifier == tableIdent) + } + } + } + + test("MERGE INTO TABLE") { + def checkResolution( + target: LogicalPlan, + source: LogicalPlan, + mergeCondition: Expression, + deleteCondAttr: Option[AttributeReference], + updateCondAttr: Option[AttributeReference], + insertCondAttr: Option[AttributeReference], + updateAssigns: Seq[Assignment], + insertAssigns: Seq[Assignment], + starInUpdate: Boolean = false): Unit = { + val ti = target.output.find(_.name == "i").get.asInstanceOf[AttributeReference] + val ts = target.output.find(_.name == "s").get.asInstanceOf[AttributeReference] + val si = source.output.find(_.name == "i").get.asInstanceOf[AttributeReference] + val ss = 
source.output.find(_.name == "s").get.asInstanceOf[AttributeReference] + + mergeCondition match { + case EqualTo(l: AttributeReference, r: AttributeReference) => + assert(l.sameRef(ti) && r.sameRef(si)) + case other => fail("unexpected merge condition " + other) + } + + deleteCondAttr.foreach(a => assert(a.sameRef(ts))) + updateCondAttr.foreach(a => assert(a.sameRef(ts))) + insertCondAttr.foreach(a => assert(a.sameRef(ss))) + + if (starInUpdate) { + assert(updateAssigns.size == 2) + assert(updateAssigns(0).key.asInstanceOf[AttributeReference].sameRef(ti)) + assert(updateAssigns(0).value.asInstanceOf[AttributeReference].sameRef(si)) + assert(updateAssigns(1).key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(updateAssigns(1).value.asInstanceOf[AttributeReference].sameRef(ss)) + } else { + assert(updateAssigns.size == 1) + assert(updateAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(updateAssigns.head.value.asInstanceOf[AttributeReference].sameRef(ss)) + } + assert(insertAssigns.size == 2) + assert(insertAssigns(0).key.asInstanceOf[AttributeReference].sameRef(ti)) + assert(insertAssigns(0).value.asInstanceOf[AttributeReference].sameRef(si)) + assert(insertAssigns(1).key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(insertAssigns(1).value.asInstanceOf[AttributeReference].sameRef(ss)) + } + + Seq(("v2Table", "v2Table1"), ("testcat.tab", "testcat.tab1")).foreach { + case(target, source) => + // basic + val sql1 = + s""" + |MERGE INTO $target AS target + |USING $source AS source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED AND (source.s='insert') + | THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + parseAndResolve(sql1) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", 
Seq()), AsDataSourceV2Relation(source)), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), + updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + + // star + val sql2 = + s""" + |MERGE INTO $target AS target + |USING $source AS source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN MATCHED AND (target.s='update') THEN UPDATE SET * + |WHEN NOT MATCHED AND (source.s='insert') THEN INSERT * + """.stripMargin + parseAndResolve(sql2) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(source)), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, + StringLiteral("update"))), updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns, starInUpdate = true) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + + // no additional conditions + val sql3 = + s""" + |MERGE INTO $target AS target + |USING $source AS source + |ON target.i = source.i + |WHEN MATCHED THEN DELETE + |WHEN MATCHED THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + parseAndResolve(sql3) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", 
Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(source)), + mergeCondition, + Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(InsertAction(None, insertAssigns))) => + checkResolution(target, source, mergeCondition, None, None, None, + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + + // using subquery + val sql4 = + s""" + |MERGE INTO $target AS target + |USING (SELECT * FROM $source) AS source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED AND (source.s='insert') + | THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + parseAndResolve(sql4) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), source: Project), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), + updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + + // cte + val sql5 = + s""" + |WITH source(i, s) AS + | (SELECT * FROM $source) + |MERGE INTO $target AS target + |USING source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED AND (source.s='insert') + |THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + parseAndResolve(sql5) match { + case 
MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(target)), + SubqueryAlias(AliasIdentifier("source", Seq()), source: Project), + mergeCondition, + Seq(DeleteAction(Some(EqualTo(dl: AttributeReference, StringLiteral("delete")))), + UpdateAction(Some(EqualTo(ul: AttributeReference, StringLiteral("update"))), + updateAssigns)), + Seq(InsertAction(Some(EqualTo(il: AttributeReference, StringLiteral("insert"))), + insertAssigns))) => + assert(source.output.map(_.name) == Seq("i", "s")) + checkResolution(target, source, mergeCondition, Some(dl), Some(ul), Some(il), + updateAssigns, insertAssigns) + + case other => fail("Expect MergeIntoTable, but got:\n" + other.treeString) + } + } + + // no aliases + Seq(("v2Table", "v2Table1"), + ("testcat.tab", "testcat.tab1")).foreach { pair => + + val target = pair._1 + val source = pair._2 + + val sql1 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED THEN DELETE + |WHEN MATCHED THEN UPDATE SET s = 1 + |WHEN NOT MATCHED AND (s = 'a') THEN INSERT (i) values (i) + """.stripMargin + + parseAndResolve(sql1) match { + case MergeIntoTable( + AsDataSourceV2Relation(target), + AsDataSourceV2Relation(source), + _, + Seq(DeleteAction(None), UpdateAction(None, updateAssigns)), + Seq(InsertAction( + Some(EqualTo(il: AttributeReference, StringLiteral("a"))), + insertAssigns))) => + val ti = target.output.find(_.name == "i").get + val ts = target.output.find(_.name == "s").get + val si = source.output.find(_.name == "i").get + val ss = source.output.find(_.name == "s").get + + // INSERT condition is resolved with source table only, so column `s` is not ambiguous. + assert(il.sameRef(ss)) + assert(updateAssigns.size == 1) + // UPDATE key is resolved with target table only, so column `s` is not ambiguous. 
+ assert(updateAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ts)) + assert(insertAssigns.size == 1) + // INSERT key is resolved with target table only, so column `i` is not ambiguous. + assert(insertAssigns.head.key.asInstanceOf[AttributeReference].sameRef(ti)) + // INSERT value is resolved with source table only, so column `i` is not ambiguous. + assert(insertAssigns.head.value.asInstanceOf[AttributeReference].sameRef(si)) + + case p => fail("Expect MergeIntoTable, but got:\n" + p.treeString) + } + + val sql2 = + s""" + |MERGE INTO $target + |USING $source + |ON i = 1 + |WHEN MATCHED THEN DELETE + """.stripMargin + // merge condition is resolved with both target and source tables, and we can't + // resolve column `i` as it's ambiguous. + val e2 = intercept[AnalysisException](parseAndResolve(sql2)) + assert(e2.message.contains("Reference 'i' is ambiguous")) + + val sql3 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED AND (s='delete') THEN DELETE + """.stripMargin + // delete condition is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. + val e3 = intercept[AnalysisException](parseAndResolve(sql3)) + assert(e3.message.contains("Reference 's' is ambiguous")) + + val sql4 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED AND (s = 'a') THEN UPDATE SET i = 1 + """.stripMargin + // update condition is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. + val e4 = intercept[AnalysisException](parseAndResolve(sql4)) + assert(e4.message.contains("Reference 's' is ambiguous")) + + val sql5 = + s""" + |MERGE INTO $target + |USING $source + |ON 1 = 1 + |WHEN MATCHED THEN UPDATE SET s = s + """.stripMargin + // update value is resolved with both target and source tables, and we can't + // resolve column `s` as it's ambiguous. 
+ val e5 = intercept[AnalysisException](parseAndResolve(sql5)) + assert(e5.message.contains("Reference 's' is ambiguous")) + } + + val sql6 = + s""" + |MERGE INTO non_exist_target + |USING non_exist_source + |ON target.i = source.i + |WHEN MATCHED THEN DELETE + |WHEN MATCHED THEN UPDATE SET * + |WHEN NOT MATCHED THEN INSERT * + """.stripMargin + val parsed = parseAndResolve(sql6) + parsed match { + case u: MergeIntoTable => + assert(u.targetTable.isInstanceOf[UnresolvedRelation]) + assert(u.sourceTable.isInstanceOf[UnresolvedRelation]) + case _ => fail("Expect MergeIntoTable, but got:\n" + parsed.treeString) + } + } + + test("MERGE INTO TABLE - skip resolution on v2 tables that accept any schema") { + val sql = + s""" + |MERGE INTO v2TableWithAcceptAnySchemaCapability AS target + |USING v2Table AS source + |ON target.i = source.i + |WHEN MATCHED AND (target.s='delete') THEN DELETE + |WHEN MATCHED AND (target.s='update') THEN UPDATE SET target.s = source.s + |WHEN NOT MATCHED AND (target.s='insert') + | THEN INSERT (target.i, target.s) values (source.i, source.s) + """.stripMargin + + parseAndResolve(sql) match { + case MergeIntoTable( + SubqueryAlias(AliasIdentifier("target", Seq()), AsDataSourceV2Relation(_)), + SubqueryAlias(AliasIdentifier("source", Seq()), AsDataSourceV2Relation(_)), + EqualTo(l: UnresolvedAttribute, r: UnresolvedAttribute), + Seq( + DeleteAction(Some(EqualTo(dl: UnresolvedAttribute, StringLiteral("delete")))), + UpdateAction( + Some(EqualTo(ul: UnresolvedAttribute, StringLiteral("update"))), + updateAssigns)), + Seq( + InsertAction( + Some(EqualTo(il: UnresolvedAttribute, StringLiteral("insert"))), + insertAssigns))) => + assert(l.name == "target.i" && r.name == "source.i") + assert(dl.name == "target.s") + assert(ul.name == "target.s") + assert(il.name == "target.s") + assert(updateAssigns.size == 1) + assert(updateAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.s") + 
assert(updateAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.s") + assert(insertAssigns.size == 2) + assert(insertAssigns.head.key.asInstanceOf[UnresolvedAttribute].name == "target.i") + assert(insertAssigns.head.value.asInstanceOf[UnresolvedAttribute].name == "source.i") + + case l => fail("Expected unresolved MergeIntoTable, but got:\n" + l.treeString) + } } + // TODO: add tests for more commands. } + +object AsDataSourceV2Relation { + def unapply(plan: LogicalPlan): Option[DataSourceV2Relation] = plan match { + case SubqueryAlias(_, r: DataSourceV2Relation) => Some(r) + case _ => None + } +} + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala index a44a94aaa4f94..b76db70494cf8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategySuite.scala @@ -222,7 +222,7 @@ class DataSourceStrategySuite extends PlanTest with SharedSparkSession { test("SPARK-26865 DataSourceV2Strategy should push normalized filters") { val attrInt = 'cint.int assertResult(Seq(IsNotNull(attrInt))) { - DataSourceStrategy.normalizeFilters(Seq(IsNotNull(attrInt.withName("CiNt"))), Seq(attrInt)) + DataSourceStrategy.normalizeExprs(Seq(IsNotNull(attrInt.withName("CiNt"))), Seq(attrInt)) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala index 4b086e830e456..553773e2555cf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileIndexSuite.scala @@ -352,6 +352,26 @@ class FileIndexSuite extends SharedSparkSession { 
"driver side must not be negative")) } + test ("SPARK-29537: throw exception when user defined a wrong base path") { + withTempDir { dir => + val partitionDirectory = new File(dir, "a=foo") + partitionDirectory.mkdir() + val file = new File(partitionDirectory, "text.txt") + stringToFile(file, "text") + val path = new Path(dir.getCanonicalPath) + val wrongBasePath = new File(dir, "unknown") + // basePath must be a directory + wrongBasePath.mkdir() + val parameters = Map("basePath" -> wrongBasePath.getCanonicalPath) + val fileIndex = new InMemoryFileIndex(spark, Seq(path), parameters, None) + val msg = intercept[IllegalArgumentException] { + // trigger inferPartitioning() + fileIndex.partitionSpec() + }.getMessage + assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path") + } + } + test("refresh for InMemoryFileIndex with FileStatusCache") { withTempDir { dir => val fileStatusCache = FileStatusCache.getOrCreate(spark) @@ -416,6 +436,35 @@ class FileIndexSuite extends SharedSparkSession { } } + test("Add an option to ignore block locations when listing file") { + withTempDir { dir => + val partitionDirectory = new File(dir, "a=foo") + partitionDirectory.mkdir() + for (i <- 1 to 8) { + val file = new File(partitionDirectory, i + ".txt") + stringToFile(file, "text") + } + val path = new Path(dir.getCanonicalPath) + val fileIndex = new InMemoryFileIndex(spark, Seq(path), Map.empty, None) + withSQLConf(SQLConf.IGNORE_DATA_LOCALITY.key -> "false", + "fs.file.impl" -> classOf[SpecialBlockLocationFileSystem].getName) { + val withBlockLocations = fileIndex. + listLeafFiles(Seq(new Path(partitionDirectory.getPath))) + + withSQLConf(SQLConf.IGNORE_DATA_LOCALITY.key -> "true") { + val withoutBlockLocations = fileIndex. 
+ listLeafFiles(Seq(new Path(partitionDirectory.getPath))) + + assert(withBlockLocations.size == withoutBlockLocations.size) + assert(withBlockLocations.forall(b => b.isInstanceOf[LocatedFileStatus] && + b.asInstanceOf[LocatedFileStatus].getBlockLocations.nonEmpty)) + assert(withoutBlockLocations.forall(b => b.isInstanceOf[FileStatus] && + !b.isInstanceOf[LocatedFileStatus])) + assert(withoutBlockLocations.forall(withBlockLocations.contains)) + } + } + } + } } object DeletionRaceFileSystem { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala index fa8111407665a..812305ba24403 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala @@ -31,12 +31,13 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionSet, PredicateHelper} import org.apache.spark.sql.catalyst.util -import org.apache.spark.sql.execution.{DataSourceScanExec, SparkPlan} +import org.apache.spark.sql.execution.{DataSourceScanExec, FileSourceScanExec, SparkPlan} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{IntegerType, StructType} +import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.util.Utils class FileSourceStrategySuite extends QueryTest with SharedSparkSession with PredicateHelper { @@ -497,6 +498,36 @@ class FileSourceStrategySuite extends QueryTest with SharedSparkSession 
with Pre } } + test("SPARK-29768: Column pruning through non-deterministic expressions") { + withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "parquet") { + withTempPath { path => + spark.range(10) + .selectExpr("id as key", "id * 3 as s1", "id * 5 as s2") + .write.format("parquet").save(path.getAbsolutePath) + val df1 = spark.read.parquet(path.getAbsolutePath) + val df2 = df1.selectExpr("key", "rand()").where("key > 5") + val plan = df2.queryExecution.sparkPlan + val scan = plan.collect { case scan: FileSourceScanExec => scan } + assert(scan.size === 1) + assert(scan.head.requiredSchema == StructType(StructField("key", LongType) :: Nil)) + } + } + + withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") { + withTempPath { path => + spark.range(10) + .selectExpr("id as key", "id * 3 as s1", "id * 5 as s2") + .write.format("parquet").save(path.getAbsolutePath) + val df1 = spark.read.parquet(path.getAbsolutePath) + val df2 = df1.selectExpr("key", "rand()").where("key > 5") + val plan = df2.queryExecution.optimizedPlan + val scan = plan.collect { case r: DataSourceV2ScanRelation => r } + assert(scan.size === 1) + assert(scan.head.scan.readSchema() == StructType(StructField("key", LongType) :: Nil)) + } + } + } + // Helpers for checking the arguments passed to the FileFormat. 
protected val checkPartitionSchema = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala index d5502ba5737c0..5256043289d5e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/ReadSchemaSuite.scala @@ -99,13 +99,13 @@ class OrcReadSchemaSuite override val format: String = "orc" - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() originalConf = spark.conf.get(SQLConf.ORC_VECTORIZED_READER_ENABLED) spark.conf.set(SQLConf.ORC_VECTORIZED_READER_ENABLED.key, "false") } - override def afterAll() { + override def afterAll(): Unit = { spark.conf.set(SQLConf.ORC_VECTORIZED_READER_ENABLED.key, originalConf) super.afterAll() } @@ -124,13 +124,13 @@ class VectorizedOrcReadSchemaSuite override val format: String = "orc" - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() originalConf = spark.conf.get(SQLConf.ORC_VECTORIZED_READER_ENABLED) spark.conf.set(SQLConf.ORC_VECTORIZED_READER_ENABLED.key, "true") } - override def afterAll() { + override def afterAll(): Unit = { spark.conf.set(SQLConf.ORC_VECTORIZED_READER_ENABLED.key, originalConf) super.afterAll() } @@ -165,13 +165,13 @@ class ParquetReadSchemaSuite override val format: String = "parquet" - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() originalConf = spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED) spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "false") } - override def afterAll() { + override def afterAll(): Unit = { spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, originalConf) super.afterAll() } @@ -187,13 +187,13 @@ class VectorizedParquetReadSchemaSuite override val format: String = "parquet" - override def beforeAll() { 
+ override def beforeAll(): Unit = { super.beforeAll() originalConf = spark.conf.get(SQLConf.PARQUET_VECTORIZED_READER_ENABLED) spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true") } - override def afterAll() { + override def afterAll(): Unit = { spark.conf.set(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, originalConf) super.afterAll() } @@ -209,13 +209,13 @@ class MergedParquetReadSchemaSuite override val format: String = "parquet" - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() originalConf = spark.conf.get(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED) spark.conf.set(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key, "true") } - override def afterAll() { + override def afterAll(): Unit = { spark.conf.set(SQLConf.PARQUET_SCHEMA_MERGING_ENABLED.key, originalConf) super.afterAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala index bb3cec579016e..a3d4905e82cee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/SchemaPruningSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.{DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.SchemaPruningTest import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.FileSourceScanExec +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -34,7 +35,8 @@ abstract class SchemaPruningSuite extends QueryTest with FileBasedDataSourceTest with SchemaPruningTest - with SharedSparkSession { + with SharedSparkSession + with AdaptiveSparkPlanHelper { case class FullName(first: String, middle: String, last: 
String) case class Company(name: String, address: String) case class Employer(id: Int, company: Company) @@ -90,6 +92,36 @@ abstract class SchemaPruningSuite briefContacts.map { case BriefContact(id, name, address) => BriefContactWithDataPartitionColumn(id, name, address, 2) } + testSchemaPruning("select only top-level fields") { + val query = sql("select address from contacts") + checkScan(query, "struct") + checkAnswer(query.orderBy("id"), + Row("123 Main Street") :: + Row("321 Wall Street") :: + Row("567 Maple Drive") :: + Row("6242 Ash Street") :: + Nil) + } + + testSchemaPruning("select a single complex field with disabled nested schema pruning") { + withSQLConf(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key -> "false") { + val query = sql("select name.middle from contacts") + checkScan(query, "struct>") + checkAnswer(query.orderBy("id"), Row("X.") :: Row("Y.") :: Row(null) :: Row(null) :: Nil) + } + } + + testSchemaPruning("select only input_file_name()") { + val query = sql("select input_file_name() from contacts") + checkScan(query, "struct<>") + } + + testSchemaPruning("select only expressions without references") { + val query = sql("select count(*) from contacts") + checkScan(query, "struct<>") + checkAnswer(query, Row(4)) + } + testSchemaPruning("select a single complex field") { val query = sql("select name.middle from contacts") checkScan(query, "struct>") @@ -269,7 +301,7 @@ abstract class SchemaPruningSuite checkAnswer(query, Row("Y.", 1) :: Row("X.", 1) :: Row(null, 2) :: Row(null, 2) :: Nil) } - protected def testSchemaPruning(testName: String)(testThunk: => Unit) { + protected def testSchemaPruning(testName: String)(testThunk: => Unit): Unit = { test(s"Spark vectorized reader - without partition data column - $testName") { withSQLConf(vectorizedReaderEnabledKey -> "true") { withContacts(testThunk) @@ -293,7 +325,7 @@ abstract class SchemaPruningSuite } } - private def withContacts(testThunk: => Unit) { + private def withContacts(testThunk: => Unit): 
Unit = { withTempPath { dir => val path = dir.getCanonicalPath @@ -315,7 +347,7 @@ abstract class SchemaPruningSuite } } - private def withContactsWithDataPartitionColumn(testThunk: => Unit) { + private def withContactsWithDataPartitionColumn(testThunk: => Unit): Unit = { withTempPath { dir => val path = dir.getCanonicalPath @@ -378,10 +410,24 @@ abstract class SchemaPruningSuite checkAnswer(query.orderBy("id"), Row(1) :: Nil) } + testMixedCaseQueryPruning("subquery filter with different-case column names") { + withTempView("temp") { + val spark = this.spark + import spark.implicits._ + + val df = Seq(2).toDF("col2") + df.createOrReplaceTempView("temp") + + val query = sql("select id from mixedcase where Col2.b IN (select col2 from temp)") + checkScan(query, "struct>") + checkAnswer(query.orderBy("id"), Row(1) :: Nil) + } + } + // Tests schema pruning for a query whose column and field names are exactly the same as the table // schema's column and field names. N.B. this implies that `testThunk` should pass using either a // case-sensitive or case-insensitive query parser - private def testExactCaseQueryPruning(testName: String)(testThunk: => Unit) { + private def testExactCaseQueryPruning(testName: String)(testThunk: => Unit): Unit = { test(s"Case-sensitive parser - mixed-case schema - $testName") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withMixedCaseData(testThunk) @@ -392,7 +438,7 @@ abstract class SchemaPruningSuite // Tests schema pruning for a query whose column and field names may differ in case from the table // schema's column and field names - private def testMixedCaseQueryPruning(testName: String)(testThunk: => Unit) { + private def testMixedCaseQueryPruning(testName: String)(testThunk: => Unit): Unit = { test(s"Case-insensitive parser - mixed-case schema - $testName") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { withMixedCaseData(testThunk) @@ -401,7 +447,7 @@ abstract class SchemaPruningSuite } // Tests given test function with 
Spark vectorized reader and non-vectorized reader. - private def withMixedCaseData(testThunk: => Unit) { + private def withMixedCaseData(testThunk: => Unit): Unit = { withDataSourceTable(mixedCaseData, "mixedcase") { testThunk } @@ -424,7 +470,7 @@ abstract class SchemaPruningSuite protected def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = { val fileSourceScanSchemata = - df.queryExecution.executedPlan.collect { + collect(df.queryExecution.executedPlan) { case scan: FileSourceScanExec => scan.requiredSchema } assert(fileSourceScanSchemata.size === expectedSchemaCatalogStrings.size, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala index 70ec9bbf4819d..2cd142f913072 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/binaryfile/BinaryFileFormatSuite.scala @@ -352,15 +352,15 @@ class BinaryFileFormatSuite extends QueryTest with SharedSparkSession { .select(CONTENT) } val expected = Seq(Row(content)) - QueryTest.checkAnswer(readContent(), expected) + checkAnswer(readContent(), expected) withSQLConf(SOURCES_BINARY_FILE_MAX_LENGTH.key -> content.length.toString) { - QueryTest.checkAnswer(readContent(), expected) + checkAnswer(readContent(), expected) } // Disable read. If the implementation attempts to read, the exception would be different. 
file.setReadable(false) val caught = intercept[SparkException] { withSQLConf(SOURCES_BINARY_FILE_MAX_LENGTH.key -> (content.length - 1).toString) { - QueryTest.checkAnswer(readContent(), expected) + checkAnswer(readContent(), expected) } } assert(caught.getMessage.contains("exceeds the max length allowed")) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala index e41e81af508f1..e2abb39c986a7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala @@ -23,6 +23,7 @@ import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Column, Dataset, Row} import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ /** @@ -42,8 +43,6 @@ import org.apache.spark.sql.types._ object CSVBenchmark extends SqlBasedBenchmark { import spark.implicits._ - private def toNoop(ds: Dataset[_]): Unit = ds.write.format("noop").save() - private def quotedValuesBenchmark(rowsNum: Int, numIters: Int): Unit = { val benchmark = new Benchmark(s"Parsing quoted values", rowsNum, output = output) @@ -59,7 +58,7 @@ object CSVBenchmark extends SqlBasedBenchmark { val ds = spark.read.option("header", true).schema(schema).csv(path.getAbsolutePath) benchmark.addCase(s"One quoted string", numIters) { _ => - toNoop(ds) + ds.noop() } benchmark.run() @@ -84,14 +83,14 @@ object CSVBenchmark extends SqlBasedBenchmark { val ds = spark.read.schema(schema).csv(path.getAbsolutePath) benchmark.addCase(s"Select $colsNum columns", numIters) { _ => - toNoop(ds.select("*")) + ds.select("*").noop() } val cols100 = columnNames.take(100).map(Column(_)) benchmark.addCase(s"Select 100 columns", numIters) { 
_ => - toNoop(ds.select(cols100: _*)) + ds.select(cols100: _*).noop() } benchmark.addCase(s"Select one column", numIters) { _ => - toNoop(ds.select($"col1")) + ds.select($"col1").noop() } benchmark.addCase(s"count()", numIters) { _ => ds.count() @@ -101,7 +100,7 @@ object CSVBenchmark extends SqlBasedBenchmark { (1 until colsNum).map(i => StructField(s"col$i", IntegerType))) val dsErr1 = spark.read.schema(schemaErr1).csv(path.getAbsolutePath) benchmark.addCase(s"Select 100 columns, one bad input field", numIters) { _ => - toNoop(dsErr1.select(cols100: _*)) + dsErr1.select(cols100: _*).noop() } val badRecColName = "badRecord" @@ -110,7 +109,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("columnNameOfCorruptRecord", badRecColName) .csv(path.getAbsolutePath) benchmark.addCase(s"Select 100 columns, corrupt record field", numIters) { _ => - toNoop(dsErr2.select((Column(badRecColName) +: cols100): _*)) + dsErr2.select((Column(badRecColName) +: cols100): _*).noop() } benchmark.run() @@ -167,11 +166,11 @@ object CSVBenchmark extends SqlBasedBenchmark { val writeBench = new Benchmark("Write dates and timestamps", rowsNum, output = output) writeBench.addCase(s"Create a dataset of timestamps", numIters) { _ => - toNoop(timestamps) + timestamps.noop() } writeBench.addCase("to_csv(timestamp)", numIters) { _ => - toNoop(timestamps.select(to_csv(struct($"timestamp")))) + timestamps.select(to_csv(struct($"timestamp"))).noop() } writeBench.addCase("write timestamps to files", numIters) { _ => @@ -179,11 +178,11 @@ object CSVBenchmark extends SqlBasedBenchmark { } writeBench.addCase("Create a dataset of dates", numIters) { _ => - toNoop(dates) + dates.noop() } writeBench.addCase("to_csv(date)", numIters) { _ => - toNoop(dates.select(to_csv(struct($"date")))) + dates.select(to_csv(struct($"date"))).noop() } writeBench.addCase("write dates to files", numIters) { _ => @@ -196,7 +195,7 @@ object CSVBenchmark extends SqlBasedBenchmark { val tsSchema = new 
StructType().add("timestamp", TimestampType) readBench.addCase("read timestamp text from files", numIters) { _ => - toNoop(spark.read.text(timestampDir)) + spark.read.text(timestampDir).noop() } readBench.addCase("read timestamps from files", numIters) { _ => @@ -204,7 +203,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", true) .schema(tsSchema) .csv(timestampDir) - toNoop(ds) + ds.noop() } readBench.addCase("infer timestamps from files", numIters) { _ => @@ -212,13 +211,13 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", true) .option("inferSchema", true) .csv(timestampDir) - toNoop(ds) + ds.noop() } val dateSchema = new StructType().add("date", DateType) readBench.addCase("read date text from files", numIters) { _ => - toNoop(spark.read.text(dateDir)) + spark.read.text(dateDir).noop() } readBench.addCase("read date from files", numIters) { _ => @@ -226,7 +225,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", true) .schema(dateSchema) .csv(dateDir) - toNoop(ds) + ds.noop() } readBench.addCase("infer date from files", numIters) { _ => @@ -234,7 +233,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", true) .option("inferSchema", true) .csv(dateDir) - toNoop(ds) + ds.noop() } def timestampStr: Dataset[String] = { @@ -244,7 +243,7 @@ object CSVBenchmark extends SqlBasedBenchmark { } readBench.addCase("timestamp strings", numIters) { _ => - toNoop(timestampStr) + timestampStr.noop() } readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ => @@ -252,7 +251,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", false) .schema(tsSchema) .csv(timestampStr) - toNoop(ds) + ds.noop() } readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ => @@ -260,7 +259,7 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", false) .option("inferSchema", true) .csv(timestampStr) - toNoop(ds) + ds.noop() } def dateStr: Dataset[String] = { @@ 
-270,7 +269,7 @@ object CSVBenchmark extends SqlBasedBenchmark { } readBench.addCase("date strings", numIters) { _ => - toNoop(dateStr) + dateStr.noop() } readBench.addCase("parse dates from Dataset[String]", numIters) { _ => @@ -278,23 +277,67 @@ object CSVBenchmark extends SqlBasedBenchmark { .option("header", false) .schema(dateSchema) .csv(dateStr) - toNoop(ds) + ds.noop() } readBench.addCase("from_csv(timestamp)", numIters) { _ => val ds = timestampStr.select(from_csv($"timestamp", tsSchema, Map.empty[String, String])) - toNoop(ds) + ds.noop() } readBench.addCase("from_csv(date)", numIters) { _ => val ds = dateStr.select(from_csv($"date", dateSchema, Map.empty[String, String])) - toNoop(ds) + ds.noop() } readBench.run() } } + private def filtersPushdownBenchmark(rowsNum: Int, numIters: Int): Unit = { + val benchmark = new Benchmark(s"Filters pushdown", rowsNum, output = output) + val colsNum = 100 + val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", TimestampType)) + val schema = StructType(StructField("key", IntegerType) +: fields) + def columns(): Seq[Column] = { + val ts = Seq.tabulate(colsNum) { i => + lit(Instant.ofEpochSecond(i * 12345678)).as(s"col$i") + } + ($"id" % 1000).as("key") +: ts + } + withTempPath { path => + spark.range(rowsNum).select(columns(): _*) + .write.option("header", true) + .csv(path.getAbsolutePath) + def readback = { + spark.read + .option("header", true) + .schema(schema) + .csv(path.getAbsolutePath) + } + + benchmark.addCase(s"w/o filters", numIters) { _ => + readback.noop() + } + + def withFilter(configEnabled: Boolean): Unit = { + withSQLConf(SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> configEnabled.toString()) { + readback.filter($"key" === 0).noop() + } + } + + benchmark.addCase(s"pushdown disabled", numIters) { _ => + withFilter(configEnabled = false) + } + + benchmark.addCase(s"w/ filters", numIters) { _ => + withFilter(configEnabled = true) + } + + benchmark.run() + } + } + override def 
runBenchmarkSuite(mainArgs: Array[String]): Unit = { runBenchmark("Benchmark to measure CSV read/write performance") { val numIters = 3 @@ -302,6 +345,7 @@ object CSVBenchmark extends SqlBasedBenchmark { multiColumnsBenchmark(rowsNum = 1000 * 1000, numIters) countBenchmark(rowsNum = 10 * 1000 * 1000, numIters) datetimeBenchmark(rowsNum = 10 * 1000 * 1000, numIters) + filtersPushdownBenchmark(rowsNum = 100 * 1000, numIters) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 5afd019c11a16..0be0e1e3da3dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -32,17 +32,15 @@ import com.univocity.parsers.common.TextParsingException import org.apache.commons.lang3.time.FastDateFormat import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec -import org.apache.log4j.{AppenderSkeleton, LogManager} -import org.apache.log4j.spi.LoggingEvent -import org.apache.spark.{SparkException, TestUtils} -import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} +import org.apache.spark.{SparkConf, SparkException, TestUtils} +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ -class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { +abstract class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { import testImplicits._ private val carsFile = "test-data/cars.csv" @@ -50,6 +48,8 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { private val carsFile8859 = 
"test-data/cars_iso-8859-1.csv" private val carsTsvFile = "test-data/cars.tsv" private val carsAltFile = "test-data/cars-alternative.csv" + private val carsMultiCharDelimitedFile = "test-data/cars-multichar-delim.csv" + private val carsMultiCharCrazyDelimitedFile = "test-data/cars-multichar-delim-crazy.csv" private val carsUnbalancedQuotesFile = "test-data/cars-unbalanced-quotes.csv" private val carsNullFile = "test-data/cars-null.csv" private val carsEmptyValueFile = "test-data/cars-empty-value.csv" @@ -66,6 +66,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { private val unescapedQuotesFile = "test-data/unescaped-quotes.csv" private val valueMalformedFile = "test-data/value-malformed.csv" private val badAfterGoodFile = "test-data/bad_after_good.csv" + private val malformedRowFile = "test-data/malformedRow.csv" /** Verifies data and schema. */ private def verifyCars( @@ -187,6 +188,49 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { verifyCars(cars, withHeader = true) } + test("test with tab delimiter and double quote") { + val cars = spark.read + .options(Map("quote" -> "\"", "delimiter" -> """\t""", "header" -> "true")) + .csv(testFile(carsTsvFile)) + + verifyCars(cars, numFields = 6, withHeader = true, checkHeader = false) + } + + test("SPARK-24540: test with multiple character delimiter (comma space)") { + val cars = spark.read + .options(Map("quote" -> "\'", "delimiter" -> ", ", "header" -> "true")) + .csv(testFile(carsMultiCharDelimitedFile)) + + verifyCars(cars, withHeader = true) + } + + test("SPARK-24540: test with multiple (crazy) character delimiter") { + val cars = spark.read + .options(Map("quote" -> "\'", "delimiter" -> """_/-\\_""", "header" -> "true")) + .csv(testFile(carsMultiCharCrazyDelimitedFile)) + + verifyCars(cars, withHeader = true) + + // check all the other columns, besides year (which is covered by verifyCars) + val otherCols = cars.select("make", "model", "comment", 
"blank").collect() + val expectedOtherColVals = Seq( + ("Tesla", "S", "No comment", null), + ("Ford", "E350", "Go get one now they are going fast", null), + ("Chevy", "Volt", null, null) + ) + + expectedOtherColVals.zipWithIndex.foreach { case (values, index) => + val actualRow = otherCols(index) + values match { + case (make, model, comment, blank) => + assert(make == actualRow.getString(0)) + assert(model == actualRow.getString(1)) + assert(comment == actualRow.getString(2)) + assert(blank == actualRow.getString(3)) + } + } + } + test("parse unescaped quotes with maxCharsPerColumn") { val rows = spark.read .format("csv") @@ -819,8 +863,8 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { .load(testFile(simpleSparseFile)) assert( - df.schema.fields.map(field => field.dataType).deep == - Array(IntegerType, IntegerType, IntegerType, IntegerType).deep) + df.schema.fields.map(field => field.dataType).sameElements( + Array(IntegerType, IntegerType, IntegerType, IntegerType))) } test("old csv data source name works") { @@ -1138,7 +1182,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { .schema(schemaWithCorrField1) .csv(testFile(valueMalformedFile)) checkAnswer(df2, - Row(0, null, "0,2013-111-11 12:13:14") :: + Row(0, null, "0,2013-111_11 12:13:14") :: Row(1, java.sql.Date.valueOf("1983-08-04"), null) :: Nil) @@ -1155,7 +1199,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { .schema(schemaWithCorrField2) .csv(testFile(valueMalformedFile)) checkAnswer(df3, - Row(0, "0,2013-111-11 12:13:14", null) :: + Row(0, "0,2013-111_11 12:13:14", null) :: Row(1, null, java.sql.Date.valueOf("1983-08-04")) :: Nil) @@ -1391,7 +1435,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { assert(df.filter($"_corrupt_record".isNull).count() == 1) checkAnswer( df.select(columnNameOfCorruptRecord), - Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil + Row("0,2013-111_11 
12:13:14") :: Row(null) :: Nil ) } @@ -1717,24 +1761,17 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { } test("SPARK-23786: warning should be printed if CSV header doesn't conform to schema") { - class TestAppender extends AppenderSkeleton { - var events = new java.util.ArrayList[LoggingEvent] - override def close(): Unit = {} - override def requiresLayout: Boolean = false - protected def append(event: LoggingEvent): Unit = events.add(event) - } - - val testAppender1 = new TestAppender + val testAppender1 = new LogAppender("CSV header matches to schema") withLogAppender(testAppender1) { val ds = Seq("columnA,columnB", "1.0,1000.0").toDS() val ischema = new StructType().add("columnB", DoubleType).add("columnA", DoubleType) spark.read.schema(ischema).option("header", true).option("enforceSchema", true).csv(ds) } - assert(testAppender1.events.asScala + assert(testAppender1.loggingEvents .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema"))) - val testAppender2 = new TestAppender + val testAppender2 = new LogAppender("CSV header matches to schema w/ enforceSchema") withLogAppender(testAppender2) { withTempPath { path => val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) @@ -1749,7 +1786,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { .collect() } } - assert(testAppender2.events.asScala + assert(testAppender2.loggingEvents .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema"))) } @@ -2027,15 +2064,6 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { } } - test("do not produce empty files for empty partitions") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark.emptyDataset[String].write.csv(path) - val files = new File(path).listFiles() - assert(!files.exists(_.getName.endsWith("csv"))) - } - } - test("Do not reuse last good value for bad input field") { val 
schema = StructType( StructField("col1", StringType) :: @@ -2065,7 +2093,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { Seq("csv", "").foreach { reader => withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> reader) { withTempPath { path => - val df = Seq(("0", "2013-111-11")).toDF("a", "b") + val df = Seq(("0", "2013-111_11")).toDF("a", "b") df.write .option("header", "true") .csv(path.getAbsolutePath) @@ -2081,7 +2109,7 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { .option("columnNameOfCorruptRecord", columnNameOfCorruptRecord) .schema(schemaWithCorrField) .csv(path.getAbsoluteFile.toString) - checkAnswer(readDF, Row(0, null, "0,2013-111-11") :: Nil) + checkAnswer(readDF, Row(0, null, "0,2013-111_11") :: Nil) } } } @@ -2109,4 +2137,189 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData { "expect the TextParsingException truncate the error content to be 1000 length.") } } + + test("SPARK-29101 test count with DROPMALFORMED mode") { + Seq((true, 4), (false, 3)).foreach { case (csvColumnPruning, expectedCount) => + withSQLConf(SQLConf.CSV_PARSER_COLUMN_PRUNING.key -> csvColumnPruning.toString) { + val count = spark.read + .option("header", "true") + .option("mode", "DROPMALFORMED") + .csv(testFile(malformedRowFile)) + .count() + assert(expectedCount == count) + } + } + } + + test("parse timestamp in microsecond precision") { + withTempPath { path => + val t = "2019-11-14 20:35:30.123456" + Seq(t).toDF("t").write.text(path.getAbsolutePath) + val readback = spark.read + .schema("t timestamp") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS") + .csv(path.getAbsolutePath) + checkAnswer(readback, Row(Timestamp.valueOf(t))) + } + } + + test("Roundtrip in reading and writing timestamps in microsecond precision") { + withTempPath { path => + val timestamp = Timestamp.valueOf("2019-11-18 11:56:00.123456") + Seq(timestamp).toDF("t") + .write + .option("timestampFormat", "yyyy-MM-dd 
HH:mm:ss.SSSSSS") + .csv(path.getAbsolutePath) + val readback = spark.read + .schema("t timestamp") + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.SSSSSS") + .csv(path.getAbsolutePath) + checkAnswer(readback, Row(timestamp)) + } + } + + test("return correct results when data columns overlap with partition columns") { + withTempPath { path => + val tablePath = new File(s"${path.getCanonicalPath}/cOl3=c/cOl1=a/cOl5=e") + + val inputDF = Seq((1, 2, 3, 4, 5)).toDF("cOl1", "cOl2", "cOl3", "cOl4", "cOl5") + inputDF.write + .option("header", "true") + .csv(tablePath.getCanonicalPath) + + val resultDF = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(path.getCanonicalPath) + .select("CoL1", "Col2", "CoL5", "CoL3") + checkAnswer(resultDF, Row("a", 2, "e", "c")) + } + } + + test("filters push down") { + Seq(true, false).foreach { filterPushdown => + Seq(true, false).foreach { columnPruning => + withSQLConf( + SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> filterPushdown.toString, + SQLConf.CSV_PARSER_COLUMN_PRUNING.key -> columnPruning.toString) { + + withTempPath { path => + val t = "2019-12-17 00:01:02" + Seq( + "c0,c1,c2", + "abc,1,2019-11-14 20:35:30", + s"def,2,$t").toDF("data") + .repartition(1) + .write.text(path.getAbsolutePath) + Seq(true, false).foreach { multiLine => + Seq("PERMISSIVE", "DROPMALFORMED", "FAILFAST").foreach { mode => + val readback = spark.read + .option("mode", mode) + .option("header", true) + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") + .option("multiLine", multiLine) + .schema("c0 string, c1 integer, c2 timestamp") + .csv(path.getAbsolutePath) + .where($"c1" === 2) + .select($"c2") + // count() pushes empty schema. This checks handling of a filter + // which refers to not existed field. 
+ assert(readback.count() === 1) + checkAnswer(readback, Row(Timestamp.valueOf(t))) + } + } + } + } + } + } + } + + test("filters push down - malformed input in PERMISSIVE mode") { + val invalidTs = "2019-123_14 20:35:30" + val invalidRow = s"0,$invalidTs,999" + val validTs = "2019-12-14 20:35:30" + Seq(true, false).foreach { filterPushdown => + withSQLConf(SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> filterPushdown.toString) { + withTempPath { path => + Seq( + "c0,c1,c2", + invalidRow, + s"1,$validTs,999").toDF("data") + .repartition(1) + .write.text(path.getAbsolutePath) + def checkReadback(condition: Column, expected: Seq[Row]): Unit = { + val readback = spark.read + .option("mode", "PERMISSIVE") + .option("columnNameOfCorruptRecord", "c3") + .option("header", true) + .option("timestampFormat", "yyyy-MM-dd HH:mm:ss") + .schema("c0 integer, c1 timestamp, c2 integer, c3 string") + .csv(path.getAbsolutePath) + .where(condition) + .select($"c0", $"c1", $"c3") + checkAnswer(readback, expected) + } + + checkReadback( + condition = $"c2" === 999, + expected = Seq(Row(0, null, invalidRow), Row(1, Timestamp.valueOf(validTs), null))) + checkReadback( + condition = $"c2" === 999 && $"c1" > "1970-01-01 00:00:00", + expected = Seq(Row(1, Timestamp.valueOf(validTs), null))) + } + } + } + } + + test("SPARK-30530: apply filters to malformed rows") { + withSQLConf(SQLConf.CSV_FILTER_PUSHDOWN_ENABLED.key -> "true") { + withTempPath { path => + Seq( + "100.0,1.0,", + "200.0,,", + "300.0,3.0,", + "1.0,4.0,", + ",4.0,", + "500.0,,", + ",6.0,", + "-500.0,50.5").toDF("data") + .repartition(1) + .write.text(path.getAbsolutePath) + val schema = new StructType().add("floats", FloatType).add("more_floats", FloatType) + val readback = spark.read + .schema(schema) + .csv(path.getAbsolutePath) + .filter("floats is null") + checkAnswer(readback, Seq(Row(null, 4.0), Row(null, 6.0))) + } + } + } + + test("SPARK-30810: parses and convert a CSV Dataset having different column from 'value'") { + val 
ds = spark.range(2).selectExpr("concat('a,b,', id) AS `a.text`").as[String] + val csv = spark.read.option("header", true).option("inferSchema", true).csv(ds) + assert(csv.schema.fieldNames === Seq("a", "b", "0")) + checkAnswer(csv, Row("a", "b", 1)) + } +} + +class CSVv1Suite extends CSVSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "csv") +} + +class CSVv2Suite extends CSVSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") +} + +class CSVLegacyTimeParserSuite extends CSVSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_ENABLED, true) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala index f486e603e2552..bcecaccc8cc89 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala @@ -39,7 +39,7 @@ import org.apache.spark.sql.types._ * }}} */ -object JSONBenchmark extends SqlBasedBenchmark { +object JsonBenchmark extends SqlBasedBenchmark { import spark.implicits._ private def prepareDataInfo(benchmark: Benchmark): Unit = { @@ -48,10 +48,6 @@ object JSONBenchmark extends SqlBasedBenchmark { // scalastyle:on println } - private def run(ds: Dataset[_]): Unit = { - ds.write.format("noop").save() - } - def schemaInferring(rowsNum: Int, numIters: Int): Unit = { val benchmark = new Benchmark("JSON schema inferring", rowsNum, output = output) @@ -219,11 +215,11 @@ object JSONBenchmark extends SqlBasedBenchmark { benchmark.addCase(s"Select $colsNum columns", numIters) { _ => val ds = in.select("*") - run(ds) + ds.noop() } benchmark.addCase(s"Select 1 column", numIters) { _ => val ds = 
in.select($"col1") - run(ds) + ds.noop() } benchmark.run() @@ -244,7 +240,7 @@ object JSONBenchmark extends SqlBasedBenchmark { benchmark.addCase("Short column without encoding", numIters) { _ => val ds = spark.read.schema(shortSchema).json(shortColumnPath) - run(ds) + ds.noop() } benchmark.addCase("Short column with UTF-8", numIters) { _ => @@ -252,12 +248,12 @@ object JSONBenchmark extends SqlBasedBenchmark { .option("encoding", "UTF-8") .schema(shortSchema) .json(shortColumnPath) - run(ds) + ds.noop() } benchmark.addCase("Wide column without encoding", numIters) { _ => val ds = spark.read.schema(wideSchema).json(wideColumnPath) - run(ds) + ds.noop() } benchmark.addCase("Wide column with UTF-8", numIters) { _ => @@ -265,7 +261,7 @@ object JSONBenchmark extends SqlBasedBenchmark { .option("encoding", "UTF-8") .schema(wideSchema) .json(wideColumnPath) - run(ds) + ds.noop() } benchmark.run() @@ -280,23 +276,23 @@ object JSONBenchmark extends SqlBasedBenchmark { val in = spark.range(0, rows, 1, 1).map(_ => """{"a":1}""") benchmark.addCase("Text read", iters) { _ => - run(in) + in.noop() } benchmark.addCase("from_json", iters) { _ => val schema = new StructType().add("a", IntegerType) val from_json_ds = in.select(from_json('value, schema)) - run(from_json_ds) + from_json_ds.noop() } benchmark.addCase("json_tuple", iters) { _ => val json_tuple_ds = in.select(json_tuple($"value", "a")) - run(json_tuple_ds) + json_tuple_ds.noop() } benchmark.addCase("get_json_object", iters) { _ => val get_json_object_ds = in.select(get_json_object($"value", "$.a")) - run(get_json_object_ds) + get_json_object_ds.noop() } benchmark.run() @@ -310,7 +306,7 @@ object JSONBenchmark extends SqlBasedBenchmark { val in = spark.range(0, rows, 1, 1).map(_ => """{"a":1}""") benchmark.addCase("Text read", iters) { _ => - run(in) + in.noop() } benchmark.addCase("schema inferring", iters) { _ => @@ -322,7 +318,7 @@ object JSONBenchmark extends SqlBasedBenchmark { val ds = spark.read .schema(schema) 
.json(in) - run(ds) + ds.noop() } benchmark.run() @@ -343,7 +339,7 @@ object JSONBenchmark extends SqlBasedBenchmark { val ds = spark.read .format("text") .load(path.getAbsolutePath) - run(ds) + ds.noop() } benchmark.addCase("Schema inferring", iters) { _ => @@ -360,7 +356,7 @@ object JSONBenchmark extends SqlBasedBenchmark { .schema(schema) .option("multiLine", false) .json(path.getAbsolutePath) - run(ds) + ds.noop() } benchmark.addCase("Parsing with UTF-8", iters) { _ => @@ -370,7 +366,7 @@ object JSONBenchmark extends SqlBasedBenchmark { .option("charset", "UTF-8") .json(path.getAbsolutePath) - run(ds) + ds.noop() } benchmark.run() @@ -397,11 +393,11 @@ object JSONBenchmark extends SqlBasedBenchmark { val writeBench = new Benchmark("Write dates and timestamps", rowsNum, output = output) writeBench.addCase(s"Create a dataset of timestamps", numIters) { _ => - run(timestamps) + timestamps.noop() } writeBench.addCase("to_json(timestamp)", numIters) { _ => - run(timestamps.select(to_json(struct($"timestamp")))) + timestamps.select(to_json(struct($"timestamp"))).noop() } writeBench.addCase("write timestamps to files", numIters) { _ => @@ -409,11 +405,11 @@ object JSONBenchmark extends SqlBasedBenchmark { } writeBench.addCase("Create a dataset of dates", numIters) { _ => - run(dates) + dates.noop() } writeBench.addCase("to_json(date)", numIters) { _ => - run(dates.select(to_json(struct($"date")))) + dates.select(to_json(struct($"date"))).noop() } writeBench.addCase("write dates to files", numIters) { _ => @@ -426,25 +422,25 @@ object JSONBenchmark extends SqlBasedBenchmark { val tsSchema = new StructType().add("timestamp", TimestampType) readBench.addCase("read timestamp text from files", numIters) { _ => - run(spark.read.text(timestampDir)) + spark.read.text(timestampDir).noop() } readBench.addCase("read timestamps from files", numIters) { _ => - run(spark.read.schema(tsSchema).json(timestampDir)) + spark.read.schema(tsSchema).json(timestampDir).noop() } 
readBench.addCase("infer timestamps from files", numIters) { _ => - run(spark.read.json(timestampDir)) + spark.read.json(timestampDir).noop() } val dateSchema = new StructType().add("date", DateType) readBench.addCase("read date text from files", numIters) { _ => - run(spark.read.text(dateDir)) + spark.read.text(dateDir).noop() } readBench.addCase("read date from files", numIters) { _ => - run(spark.read.schema(dateSchema).json(dateDir)) + spark.read.schema(dateSchema).json(dateDir).noop() } def timestampStr: Dataset[String] = { @@ -454,15 +450,15 @@ object JSONBenchmark extends SqlBasedBenchmark { } readBench.addCase("timestamp strings", numIters) { _ => - run(timestampStr) + timestampStr.noop() } readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ => - run(spark.read.schema(tsSchema).json(timestampStr)) + spark.read.schema(tsSchema).json(timestampStr).noop() } readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ => - run(spark.read.json(timestampStr)) + spark.read.json(timestampStr).noop() } def dateStr: Dataset[String] = { @@ -472,7 +468,7 @@ object JSONBenchmark extends SqlBasedBenchmark { } readBench.addCase("date strings", numIters) { _ => - run(dateStr) + dateStr.noop() } readBench.addCase("parse dates from Dataset[String]", numIters) { _ => @@ -480,17 +476,17 @@ object JSONBenchmark extends SqlBasedBenchmark { .option("header", false) .schema(dateSchema) .json(dateStr) - run(ds) + ds.noop() } readBench.addCase("from_json(timestamp)", numIters) { _ => val ds = timestampStr.select(from_json($"timestamp", tsSchema, Map.empty[String, String])) - run(ds) + ds.noop() } readBench.addCase("from_json(date)", numIters) { _ => val ds = dateStr.select(from_json($"date", dateSchema, Map.empty[String, String])) - run(ds) + ds.noop() } readBench.run() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index bafb6769af69c..7592809d7c85b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -103,7 +103,7 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSparkSession { } // The following two tests are not really working - need to look into Jackson's - // JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. + // JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS. ignore("allowNonNumericNumbers off") { val str = """{"age": NaN}""" val df = spark.read.json(Seq(str).toDS()) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index 2998e673bd45c..7abe818a29d9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec -import org.apache.spark.{SparkException, TestUtils} +import org.apache.spark.{SparkConf, SparkException, TestUtils} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{functions => F, _} import org.apache.spark.sql.catalyst.json._ @@ -45,11 +45,11 @@ class TestFileFilter extends PathFilter { override def accept(path: Path): Boolean = path.getParent.getName != "p=2" } -class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { +abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { import testImplicits._ test("Type promotion") { - def checkTypePromotion(expected: Any, actual: Any) { + def 
checkTypePromotion(expected: Any, actual: Any): Unit = { assert(expected.getClass == actual.getClass, s"Failed to promote ${actual.getClass} to ${expected.getClass}.") assert(expected == actual, @@ -92,7 +92,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { checkTypePromotion( Decimal(longNumber), enforceCorrectType(longNumber, DecimalType.SYSTEM_DEFAULT)) - val doubleNumber: Double = 1.7976931348623157E308d + val doubleNumber: Double = 1.7976931348623157d checkTypePromotion(doubleNumber.toDouble, enforceCorrectType(doubleNumber, DoubleType)) checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber * 1000L)), @@ -128,7 +128,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { } test("Get compatible type") { - def checkDataType(t1: DataType, t2: DataType, expected: DataType) { + def checkDataType(t1: DataType, t2: DataType, expected: DataType): Unit = { var actual = JsonInferSchema.compatibleType(t1, t2) assert(actual == expected, s"Expected $expected as the most general data type for $t1 and $t2, found $actual") @@ -284,7 +284,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, null, @@ -624,7 +624,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, null, @@ -656,7 +656,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable"), Row("92233720368547758070", "true", - "1.7976931348623157E308", + "1.7976931348623157", "10", "21474836470", null, @@ -768,7 +768,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { val expectedSchema = 
StructType( StructField("bigInteger", DecimalType(20, 0), true) :: StructField("boolean", BooleanType, true) :: - StructField("double", DecimalType(17, -292), true) :: + StructField("double", DecimalType(17, 16), true) :: StructField("integer", LongType, true) :: StructField("long", LongType, true) :: StructField("null", StringType, true) :: @@ -782,7 +782,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable"), Row(BigDecimal("92233720368547758070"), true, - BigDecimal("1.7976931348623157E308"), + BigDecimal("1.7976931348623157"), 10, 21474836470L, null, @@ -875,7 +875,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTableSQL"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, null, @@ -908,7 +908,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable1"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, null, @@ -925,7 +925,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from jsonTable2"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, null, @@ -1274,7 +1274,7 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { sql("select * from primitiveTable"), Row(new java.math.BigDecimal("92233720368547758070"), true, - 1.7976931348623157E308, + 1.7976931348623157, 10, 21474836470L, "this is a simple string.") @@ -2436,23 +2436,24 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { } } - test("SPARK-25040: empty strings should be disallowed") { - def failedOnEmptyString(dataType: DataType): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - 
.option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - val errMessage = intercept[SparkException] { - df.collect() - }.getMessage - assert(errMessage.contains( - s"Failed to parse an empty string for data type ${dataType.catalogString}")) - } - def emptyString(dataType: DataType, expected: Any): Unit = { - val df = spark.read.schema(s"a ${dataType.catalogString}") - .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) - checkAnswer(df, Row(expected) :: Nil) - } + private def failedOnEmptyString(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + val errMessage = intercept[SparkException] { + df.collect() + }.getMessage + assert(errMessage.contains( + s"Failed to parse an empty string for data type ${dataType.catalogString}")) + } + private def emptyString(dataType: DataType, expected: Any): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(expected) :: Nil) + } + + test("SPARK-25040: empty strings should be disallowed") { failedOnEmptyString(BooleanType) failedOnEmptyString(ByteType) failedOnEmptyString(ShortType) @@ -2471,12 +2472,33 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8)) } - test("do not produce empty files for empty partitions") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark.emptyDataset[String].write.json(path) - val files = new File(path).listFiles() - assert(!files.exists(_.getName.endsWith("json"))) + test("SPARK-25040: allowing empty strings when legacy config is enabled") { + def emptyStringAsNull(dataType: DataType): Unit = { + val df = spark.read.schema(s"a ${dataType.catalogString}") + .option("mode", "FAILFAST").json(Seq("""{"a":""}""").toDS) + checkAnswer(df, Row(null) :: Nil) + } + + // Legacy mode prior to Spark 
3.0.0 + withSQLConf(SQLConf.LEGACY_ALLOW_EMPTY_STRING_IN_JSON.key -> "true") { + emptyStringAsNull(BooleanType) + emptyStringAsNull(ByteType) + emptyStringAsNull(ShortType) + emptyStringAsNull(IntegerType) + emptyStringAsNull(LongType) + + failedOnEmptyString(FloatType) + failedOnEmptyString(DoubleType) + failedOnEmptyString(TimestampType) + failedOnEmptyString(DateType) + + emptyStringAsNull(DecimalType.SYSTEM_DEFAULT) + emptyStringAsNull(ArrayType(IntegerType)) + emptyStringAsNull(MapType(StringType, IntegerType, true)) + emptyStringAsNull(StructType(StructField("f1", IntegerType, true) :: Nil)) + + emptyString(StringType, "") + emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8)) } } @@ -2536,3 +2558,24 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData { } } } + +class JsonV1Suite extends JsonSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "json") +} + +class JsonV2Suite extends JsonSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") +} + +class JsonLegacyTimeParserSuite extends JsonSuite { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.LEGACY_TIME_PARSER_ENABLED, true) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index 17503330bfd5c..5c35ee03fb271 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala @@ -28,7 +28,7 @@ private[json] trait TestJsonData { "integer":10, "long":21474836470, "bigInteger":92233720368547758070, - "double":1.7976931348623157E308, + "double":1.7976931348623157, "boolean":true, "null":null }""" :: Nil))(Encoders.STRING) @@ -87,7 +87,7 @@ 
private[json] trait TestJsonData { "arrayOfInteger":[1, 2147483647, -2147483648], "arrayOfLong":[21474836470, 9223372036854775807, -9223372036854775808], "arrayOfBigInteger":[922337203685477580700, -922337203685477580800], - "arrayOfDouble":[1.2, 1.7976931348623157E308, 4.9E-324, 2.2250738585072014E-308], + "arrayOfDouble":[1.2, 1.7976931348623157, 4.9E-324, 2.2250738585072014E-308], "arrayOfBoolean":[true, false, true], "arrayOfNull":[null, null, null, null], "arrayOfStruct":[{"field1": true, "field2": "str1"}, {"field1": false}, {"field3": null}], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/noop/NoopSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/noop/NoopSuite.scala index c5a03cb8ef6d3..b4073bedf5597 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/noop/NoopSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/noop/NoopSuite.scala @@ -32,6 +32,7 @@ class NoopSuite extends SharedSparkSession { } .write .format("noop") + .mode("append") .save() assert(accum.value == numElems) } @@ -54,7 +55,7 @@ class NoopSuite extends SharedSparkSession { accum.add(1) x } - .write.format("noop").save() + .write.mode("append").format("noop").save() assert(accum.value == numElems) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala index 5d21ee698f4e6..ea839b8e1ef10 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala @@ -169,6 +169,8 @@ abstract class OrcPartitionDiscoveryTest extends OrcTest { } class OrcPartitionDiscoverySuite extends OrcPartitionDiscoveryTest with SharedSparkSession { + override protected 
def sparkConf: SparkConf = super.sparkConf.set(SQLConf.USE_V1_SOURCE_LIST, "") + test("read partitioned table - partition key included in orc file") { withTempDir { base => for { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index edc1822887f9f..b8bf4b16fe53c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -194,7 +194,9 @@ abstract class OrcQueryTest extends OrcTest { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("ZLIB" === reader.getCompressionKind.name) + } } // `compression` overrides `orc.compress`. 
@@ -209,7 +211,9 @@ abstract class OrcQueryTest extends OrcTest { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("ZLIB" === reader.getCompressionKind.name) + } } } @@ -225,7 +229,9 @@ abstract class OrcQueryTest extends OrcTest { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("ZLIB" === reader.getCompressionKind.name) + } } withTempPath { file => @@ -238,7 +244,9 @@ abstract class OrcQueryTest extends OrcTest { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("SNAPPY" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("SNAPPY" === reader.getCompressionKind.name) + } } withTempPath { file => @@ -251,7 +259,9 @@ abstract class OrcQueryTest extends OrcTest { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("NONE" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("NONE" === reader.getCompressionKind.name) + } } } @@ -635,7 +645,9 @@ class OrcQuerySuite extends OrcQueryTest with SharedSparkSession { val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) val conf = OrcFile.readerOptions(new Configuration()) - assert("LZO" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + 
Utils.tryWithResource(OrcFile.createReader(orcFilePath, conf)) { reader => + assert("LZO" === reader.getCompressionKind.name) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 0d904a09c07e8..1e27593584786 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -60,7 +60,7 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { .createOrReplaceTempView("orc_temp_table") } - protected def testBloomFilterCreation(bloomFilterKind: Kind) { + protected def testBloomFilterCreation(bloomFilterKind: Kind): Unit = { val tableName = "bloomFilter" withTempDir { dir => @@ -120,7 +120,8 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } - protected def testSelectiveDictionaryEncoding(isSelective: Boolean, isHive23: Boolean = false) { + protected def testSelectiveDictionaryEncoding(isSelective: Boolean, + isHive23: Boolean = false): Unit = { val tableName = "orcTable" withTempDir { dir => @@ -345,7 +346,9 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { } } - test("SPARK-23340 Empty float/double array columns raise EOFException") { + // SPARK-28885 String value is not allowed to be stored as numeric type with + // ANSI store assignment policy. 
+ ignore("SPARK-23340 Empty float/double array columns raise EOFException") { Seq(Seq(Array.empty[Float]).toDF(), Seq(Array.empty[Double]).toDF()).foreach { df => withTempPath { path => df.write.format("orc").save(path.getCanonicalPath) @@ -372,9 +375,10 @@ abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { val orcFilePath = new Path(partFiles.head.getAbsolutePath) val readerOptions = OrcFile.readerOptions(new Configuration()) - val reader = OrcFile.createReader(orcFilePath, readerOptions) - val version = UTF_8.decode(reader.getMetadataValue(SPARK_VERSION_METADATA_KEY)).toString - assert(version === SPARK_VERSION_SHORT) + Utils.tryWithResource(OrcFile.createReader(orcFilePath, readerOptions)) { reader => + val version = UTF_8.decode(reader.getMetadataValue(SPARK_VERSION_METADATA_KEY)).toString + assert(version === SPARK_VERSION_SHORT) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala index adbd93dcb4fe8..388744bd0fd6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala @@ -27,9 +27,9 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Predicate} import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, FileBasedDataSourceTest} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.datasources.v2.orc.OrcTable +import org.apache.spark.sql.execution.datasources.FileBasedDataSourceTest +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation +import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION @@ -119,17 +119,14 @@ abstract class OrcTest extends QueryTest with FileBasedDataSourceTest with Befor query.queryExecution.optimizedPlan match { case PhysicalOperation(_, filters, - DataSourceV2Relation(orcTable: OrcTable, _, options)) => + DataSourceV2ScanRelation(_, o: OrcScan, _)) => assert(filters.nonEmpty, "No filter is analyzed from the given query") - val scanBuilder = orcTable.newScanBuilder(options) - scanBuilder.pushFilters(filters.flatMap(DataSourceStrategy.translateFilter).toArray) - val pushedFilters = scanBuilder.pushedFilters() if (noneSupported) { - assert(pushedFilters.isEmpty, "Unsupported filters should not show in pushed filters") + assert(o.pushedFilters.isEmpty, "Unsupported filters should not show in pushed filters") } else { - assert(pushedFilters.nonEmpty, "No filter is pushed down") - val maybeFilter = OrcFilters.createFilter(query.schema, pushedFilters) - assert(maybeFilter.isEmpty, s"Couldn't generate filter predicate for $pushedFilters") + assert(o.pushedFilters.nonEmpty, "No filter is pushed down") + val maybeFilter = OrcFilters.createFilter(query.schema, o.pushedFilters) + assert(maybeFilter.isEmpty, s"Couldn't generate filter predicate for ${o.pushedFilters}") } case _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala index b626edf5dc28e..6c9bd32913178 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcV2SchemaPruningSuite.scala @@ -17,14 +17,15 @@ package org.apache.spark.sql.execution.datasources.orc import org.apache.spark.SparkConf -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} import 
org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.SchemaPruningSuite import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan import org.apache.spark.sql.internal.SQLConf -class OrcV2SchemaPruningSuite extends SchemaPruningSuite { +class OrcV2SchemaPruningSuite extends SchemaPruningSuite with AdaptiveSparkPlanHelper { override protected val dataSourceName: String = "orc" override protected val vectorizedReaderEnabledKey: String = SQLConf.ORC_VECTORIZED_READER_ENABLED.key @@ -36,7 +37,7 @@ class OrcV2SchemaPruningSuite extends SchemaPruningSuite { override def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = { val fileSourceScanSchemata = - df.queryExecution.executedPlan.collect { + collect(df.queryExecution.executedPlan) { case BatchScanExec(_, scan: OrcScan) => scan.readDataSchema } assert(fileSourceScanSchemata.size === expectedSchemaCatalogStrings.size, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 9671866fe1535..4e0c1c2dbe601 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -33,9 +33,8 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.optimizer.InferFiltersFromConstraints import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.orc.OrcFilters -import 
org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetTable +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation +import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType @@ -1391,6 +1390,27 @@ abstract class ParquetFilterSuite extends QueryTest with ParquetTest with Shared } } } + + test("SPARK-30826: case insensitivity of StringStartsWith attribute") { + import testImplicits._ + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + withTable("t1") { + withTempPath { dir => + val path = dir.toURI.toString + Seq("42").toDF("COL").write.parquet(path) + spark.sql( + s""" + |CREATE TABLE t1 (col STRING) + |USING parquet + |OPTIONS (path '$path') + """.stripMargin) + checkAnswer( + spark.sql("SELECT * FROM t1 WHERE col LIKE '4%'"), + Row("42")) + } + } + } + } } class ParquetV1FilterSuite extends ParquetFilterSuite { @@ -1484,12 +1504,10 @@ class ParquetV2FilterSuite extends ParquetFilterSuite { query.queryExecution.optimizedPlan.collectFirst { case PhysicalOperation(_, filters, - DataSourceV2Relation(parquetTable: ParquetTable, _, options)) => + DataSourceV2ScanRelation(_, scan: ParquetScan, _)) => assert(filters.nonEmpty, "No filter is analyzed from the given query") - val scanBuilder = parquetTable.newScanBuilder(options) val sourceFilters = filters.flatMap(DataSourceStrategy.translateFilter).toArray - scanBuilder.pushFilters(sourceFilters) - val pushedFilters = scanBuilder.pushedFilters() + val pushedFilters = scan.pushedFilters assert(pushedFilters.nonEmpty, "No filter is pushed down") val schema = new SparkToParquetSchemaConverter(conf).convert(df.schema) val parquetFilters = createParquetFilters(schema) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index 026ba5deffdfd..1550b3bbb6242 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -66,7 +66,7 @@ private[parquet] class TestGroupWriteSupport(schema: MessageType) extends WriteS new WriteContext(schema, new java.util.HashMap[String, String]()) } - override def write(record: Group) { + override def write(record: Group): Unit = { groupWriter.write(record) } } @@ -204,6 +204,42 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession } } + testStandardAndLegacyModes("array of struct") { + val data = (1 to 4).map { i => + Tuple1( + Seq( + Tuple1(s"1st_val_$i"), + Tuple1(s"2nd_val_$i") + ) + ) + } + withParquetDataFrame(data) { df => + // Structs are converted to `Row`s + checkAnswer(df, data.map { case Tuple1(array) => + Row(array.map(struct => Row(struct.productIterator.toSeq: _*))) + }) + } + } + + testStandardAndLegacyModes("array of nested struct") { + val data = (1 to 4).map { i => + Tuple1( + Seq( + Tuple1( + Tuple1(s"1st_val_$i")), + Tuple1( + Tuple1(s"2nd_val_$i")) + ) + ) + } + withParquetDataFrame(data) { df => + // Structs are converted to `Row`s + checkAnswer(df, data.map { case Tuple1(array) => + Row(array.map { case Tuple1(Tuple1(str)) => Row(Row(str))}) + }) + } + } + testStandardAndLegacyModes("nested struct with array of array as field") { val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) withParquetDataFrame(data) { df => @@ -214,9 +250,34 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession } } + testStandardAndLegacyModes("nested map with struct as key type") { + val data = (1 to 4).map { i => + Tuple1( + Map( + (i, s"kA_$i") 
-> s"vA_$i", + (i, s"kB_$i") -> s"vB_$i" + ) + ) + } + withParquetDataFrame(data) { df => + // Structs are converted to `Row`s + checkAnswer(df, data.map { case Tuple1(m) => + Row(m.map { case (k, v) => Row(k.productIterator.toSeq: _*) -> v }) + }) + } + } + testStandardAndLegacyModes("nested map with struct as value type") { - val data = (1 to 4).map(i => Tuple1(Map(i -> ((i, s"val_$i"))))) + val data = (1 to 4).map { i => + Tuple1( + Map( + s"kA_$i" -> ((i, s"vA_$i")), + s"kB_$i" -> ((i, s"vB_$i")) + ) + ) + } withParquetDataFrame(data) { df => + // Structs are converted to `Row`s checkAnswer(df, data.map { case Tuple1(m) => Row(m.mapValues(struct => Row(struct.productIterator.toSeq: _*))) }) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 1ded34f24e436..649a46f190580 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.File +import java.time.ZoneOffset import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} @@ -145,8 +146,8 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS impalaFileData.map { ts => DateTimeUtils.toJavaTimestamp(DateTimeUtils.convertTz( DateTimeUtils.fromJavaTimestamp(ts), - DateTimeUtils.TimeZoneUTC, - DateTimeUtils.getTimeZone(conf.sessionLocalTimeZone))) + ZoneOffset.UTC, + DateTimeUtils.getZoneId(conf.sessionLocalTimeZone))) } } val fullExpectations = (ts ++ impalaExpectations).map(_.toString).sorted.toArray diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala index 0a85e3cdeaf1d..e63929470ce5f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetPartitionDiscoverySuite.scala @@ -58,7 +58,7 @@ abstract class ParquetPartitionDiscoverySuite val defaultPartitionName = ExternalCatalogUtils.DEFAULT_PARTITION_NAME val timeZoneId = ZoneId.systemDefault() - val df = DateFormatter() + val df = DateFormatter(timeZoneId) val tf = TimestampFormatter(timestampPartitionPattern, timeZoneId) protected override def beforeAll(): Unit = { @@ -215,14 +215,14 @@ abstract class ParquetPartitionDiscoverySuite check("file://path/a=10", Some { PartitionValues( - ArrayBuffer("a"), - ArrayBuffer(Literal.create(10, IntegerType))) + Seq("a"), + Seq(Literal.create(10, IntegerType))) }) check("file://path/a=10/b=hello/c=1.5", Some { PartitionValues( - ArrayBuffer("a", "b", "c"), - ArrayBuffer( + Seq("a", "b", "c"), + Seq( Literal.create(10, IntegerType), Literal.create("hello", StringType), Literal.create(1.5, DoubleType))) @@ -230,8 +230,8 @@ abstract class ParquetPartitionDiscoverySuite check("file://path/a=10/b_hello/c=1.5", Some { PartitionValues( - ArrayBuffer("c"), - ArrayBuffer(Literal.create(1.5, DoubleType))) + Seq("c"), + Seq(Literal.create(1.5, DoubleType))) }) check("file:///", None) @@ -272,8 +272,8 @@ abstract class ParquetPartitionDiscoverySuite assert(partitionSpec2 == Option(PartitionValues( - ArrayBuffer("a"), - ArrayBuffer(Literal.create(10, IntegerType))))) + Seq("a"), + Seq(Literal.create(10, IntegerType))))) } test("parse partitions") { @@ -1281,7 +1281,7 @@ class ParquetV2PartitionDiscoverySuite extends ParquetPartitionDiscoverySuite { (1 to 
10).map(i => (i, i.toString)).toDF("a", "b").write.parquet(dir.getCanonicalPath) val queryExecution = spark.read.parquet(dir.getCanonicalPath).queryExecution queryExecution.analyzed.collectFirst { - case DataSourceV2Relation(fileTable: FileTable, _, _) => + case DataSourceV2Relation(fileTable: FileTable, _, _, _, _) => assert(fileTable.fileIndex.partitionSpec() === PartitionSpec.emptySpec) }.getOrElse { fail(s"Expecting a matching DataSourceV2Relation, but got:\n$queryExecution") diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 88b94281d88ee..917aaba2669ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -141,30 +141,12 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS } } - test("SPARK-10634 timestamp written and read as INT64 - TIMESTAMP_MILLIS") { - val data = (1 to 10).map(i => Row(i, new java.sql.Timestamp(i))) - val schema = StructType(List(StructField("d", IntegerType, false), - StructField("time", TimestampType, false)).toArray) - withSQLConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS.key -> "true") { - withTempPath { file => - val df = spark.createDataFrame(sparkContext.parallelize(data), schema) - df.write.parquet(file.getCanonicalPath) - ("true" :: "false" :: Nil).foreach { vectorized => - withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized) { - val df2 = spark.read.parquet(file.getCanonicalPath) - checkAnswer(df2, df.collect().toSeq) - } - } - } - } - } - test("SPARK-10634 timestamp written and read as INT64 - truncation") { withTable("ts") { sql("create table ts (c1 int, c2 timestamp) using parquet") - sql("insert into ts values (1, '2016-01-01 10:11:12.123456')") + 
sql("insert into ts values (1, timestamp'2016-01-01 10:11:12.123456')") sql("insert into ts values (2, null)") - sql("insert into ts values (3, '1965-01-01 10:11:12.123456')") + sql("insert into ts values (3, timestamp'1965-01-01 10:11:12.123456')") val expected = Seq( (1, "2016-01-01 10:11:12.123456"), (2, null), @@ -172,45 +154,6 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS .toDS().select('_1, $"_2".cast("timestamp")) checkAnswer(sql("select * from ts"), expected) } - - // The microsecond portion is truncated when written as TIMESTAMP_MILLIS. - withTable("ts") { - withSQLConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS.key -> "true") { - sql("create table ts (c1 int, c2 timestamp) using parquet") - sql("insert into ts values (1, '2016-01-01 10:11:12.123456')") - sql("insert into ts values (2, null)") - sql("insert into ts values (3, '1965-01-01 10:11:12.125456')") - sql("insert into ts values (4, '1965-01-01 10:11:12.125')") - sql("insert into ts values (5, '1965-01-01 10:11:12.1')") - sql("insert into ts values (6, '1965-01-01 10:11:12.123456789')") - sql("insert into ts values (7, '0001-01-01 00:00:00.000000')") - val expected = Seq( - (1, "2016-01-01 10:11:12.123"), - (2, null), - (3, "1965-01-01 10:11:12.125"), - (4, "1965-01-01 10:11:12.125"), - (5, "1965-01-01 10:11:12.1"), - (6, "1965-01-01 10:11:12.123"), - (7, "0001-01-01 00:00:00.000")) - .toDS().select('_1, $"_2".cast("timestamp")) - checkAnswer(sql("select * from ts"), expected) - - // Read timestamps that were encoded as TIMESTAMP_MILLIS annotated as INT64 - // with PARQUET_INT64_AS_TIMESTAMP_MILLIS set to false. 
- withSQLConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS.key -> "false") { - val expected = Seq( - (1, "2016-01-01 10:11:12.123"), - (2, null), - (3, "1965-01-01 10:11:12.125"), - (4, "1965-01-01 10:11:12.125"), - (5, "1965-01-01 10:11:12.1"), - (6, "1965-01-01 10:11:12.123"), - (7, "0001-01-01 00:00:00.000")) - .toDS().select('_1, $"_2".cast("timestamp")) - checkAnswer(sql("select * from ts"), expected) - } - } - } } test("SPARK-10365 timestamp written and read as INT64 - TIMESTAMP_MICROS") { @@ -391,7 +334,7 @@ abstract class ParquetQuerySuite extends QueryTest with ParquetTest with SharedS withTempPath { dir => val basePath = dir.getCanonicalPath val schema = StructType(Array(StructField("name", DecimalType(10, 5), false))) - val rowRDD = sparkContext.parallelize(Array(Row(Decimal("67123.45")))) + val rowRDD = sparkContext.parallelize(Seq(Row(Decimal("67123.45")))) val df = spark.createDataFrame(rowRDD, schema) df.write.parquet(basePath) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala index 309507d4ddd84..c64e95078e916 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaPruningSuite.scala @@ -20,12 +20,13 @@ package org.apache.spark.sql.execution.datasources.parquet import org.apache.spark.SparkConf import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.SchemaPruningSuite import org.apache.spark.sql.execution.datasources.v2.BatchScanExec import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan import org.apache.spark.sql.internal.SQLConf -abstract 
class ParquetSchemaPruningSuite extends SchemaPruningSuite { +abstract class ParquetSchemaPruningSuite extends SchemaPruningSuite with AdaptiveSparkPlanHelper { override protected val dataSourceName: String = "parquet" override protected val vectorizedReaderEnabledKey: String = SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key @@ -48,7 +49,7 @@ class ParquetV2SchemaPruningSuite extends ParquetSchemaPruningSuite { override def checkScanSchemata(df: DataFrame, expectedSchemaCatalogStrings: String*): Unit = { val fileSourceScanSchemata = - df.queryExecution.executedPlan.collect { + collect(df.queryExecution.executedPlan) { case scan: BatchScanExec => scan.scan.asInstanceOf[ParquetScan].readDataSchema } assert(fileSourceScanSchemata.size === expectedSchemaCatalogStrings.size, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala index 62a779528cec1..539ff0d0e905c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala @@ -233,13 +233,4 @@ class TextSuite extends QueryTest with SharedSparkSession { assert(data(3) == Row("\"doh\"")) assert(data.length == 4) } - - test("do not produce empty files for empty partitions") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark.emptyDataset[String].write.text(path) - val files = new File(path).listFiles() - assert(!files.exists(_.getName.endsWith("txt"))) - } - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala index ad0dfadacca15..8f001e0e4d668 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/FileTableSuite.scala @@ -21,10 +21,10 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs.FileStatus import org.apache.spark.sql.{QueryTest, SparkSession} +import org.apache.spark.sql.connector.read.ScanBuilder +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, WriteBuilder} import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.text.TextFileFormat -import org.apache.spark.sql.sources.v2.reader.ScanBuilder -import org.apache.spark.sql.sources.v2.writer.WriteBuilder import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -44,7 +44,7 @@ class DummyFileTable( override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = null - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = null + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = null override def supportsDataType(dataType: DataType): Boolean = dataType == StringType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala index 275bc339b3b5b..c399a011f9073 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/v2/V2SessionCatalogSuite.scala @@ -24,17 +24,15 @@ import scala.collection.JavaConverters._ import org.scalatest.BeforeAndAfter -import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalog.v2.{Catalogs, Identifier, NamespaceChange, TableChange} import org.apache.spark.sql.catalyst.analysis.{NamespaceAlreadyExistsException, NoSuchNamespaceException, 
NoSuchTableException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.parser.CatalystSqlParser -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.connector.catalog.{CatalogV2Util, Identifier, NamespaceChange, SupportsNamespaces, TableChange} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, StringType, StructField, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap -class V2SessionCatalogBaseSuite extends SparkFunSuite with SharedSparkSession with BeforeAndAfter { +abstract class V2SessionCatalogBaseSuite extends SharedSparkSession with BeforeAndAfter { val emptyProps: util.Map[String, String] = Collections.emptyMap[String, String] val schema: StructType = new StructType() @@ -46,7 +44,7 @@ class V2SessionCatalogBaseSuite extends SparkFunSuite with SharedSparkSession wi val testIdent: Identifier = Identifier.of(testNs, "test_table") def newCatalog(): V2SessionCatalog = { - val newCatalog = new V2SessionCatalog(spark.sessionState) + val newCatalog = new V2SessionCatalog(spark.sessionState.catalog, spark.sessionState.conf) newCatalog.initialize("test", CaseInsensitiveStringMap.empty()) newCatalog } @@ -54,11 +52,10 @@ class V2SessionCatalogBaseSuite extends SparkFunSuite with SharedSparkSession wi class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ override protected def beforeAll(): Unit = { super.beforeAll() - // TODO: when there is a public API for v2 catalogs, use that instead val catalog = newCatalog() catalog.createNamespace(Array("db"), emptyProps) catalog.createNamespace(Array("db2"), emptyProps) @@ -82,16 +79,6 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { private val testIdentNew = Identifier.of(testNs, "test_table_new") - test("Catalogs can 
load the catalog") { - val catalog = newCatalog() - - val conf = new SQLConf - conf.setConfString("spark.sql.catalog.test", catalog.getClass.getName) - - val loaded = Catalogs.load("test", conf) - assert(loaded.getClass == catalog.getClass) - } - test("listTables") { val catalog = newCatalog() val ident1 = Identifier.of(Array("ns"), "test_table_1") @@ -404,7 +391,7 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { assert(updated.schema == expectedSchema) } - test("alterTable: update column data type and nullability") { + test("alterTable: update column nullability") { val catalog = newCatalog() val originalSchema = new StructType() @@ -415,27 +402,12 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { assert(table.schema == originalSchema) val updated = catalog.alterTable(testIdent, - TableChange.updateColumnType(Array("id"), LongType, true)) + TableChange.updateColumnNullability(Array("id"), true)) - val expectedSchema = new StructType().add("id", LongType).add("data", StringType) + val expectedSchema = new StructType().add("id", IntegerType).add("data", StringType) assert(updated.schema == expectedSchema) } - test("alterTable: update optional column to required fails") { - val catalog = newCatalog() - - val table = catalog.createTable(testIdent, schema, Array.empty, emptyProps) - - assert(table.schema == schema) - - val exc = intercept[IllegalArgumentException] { - catalog.alterTable(testIdent, TableChange.updateColumnType(Array("id"), LongType, false)) - } - - assert(exc.getMessage.contains("Cannot change optional column to required")) - assert(exc.getMessage.contains("id")) - } - test("alterTable: update missing column fails") { val catalog = newCatalog() @@ -763,13 +735,14 @@ class V2SessionCatalogTableSuite extends V2SessionCatalogBaseSuite { class V2SessionCatalogNamespaceSuite extends V2SessionCatalogBaseSuite { - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ + import 
org.apache.spark.sql.connector.catalog.CatalogV2Implicits._ def checkMetadata( expected: scala.collection.Map[String, String], actual: scala.collection.Map[String, String]): Unit = { // remove location and comment that are automatically added by HMS unless they are expected - val toRemove = V2SessionCatalog.RESERVED_PROPERTIES.filter(expected.contains) + val toRemove = + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.filter(expected.contains) assert(expected -- toRemove === actual) } @@ -1022,31 +995,18 @@ class V2SessionCatalogNamespaceSuite extends V2SessionCatalogBaseSuite { assert(exc.getMessage.contains(testNs.quoted)) } - test("alterNamespace: fail to remove location") { + test("alterNamespace: fail to remove reserved properties") { val catalog = newCatalog() catalog.createNamespace(testNs, emptyProps) - val exc = intercept[UnsupportedOperationException] { - catalog.alterNamespace(testNs, NamespaceChange.removeProperty("location")) - } + CatalogV2Util.NAMESPACE_RESERVED_PROPERTIES.foreach { p => + val exc = intercept[UnsupportedOperationException] { + catalog.alterNamespace(testNs, NamespaceChange.removeProperty(p)) + } + assert(exc.getMessage.contains(s"Cannot remove reserved property: $p")) - assert(exc.getMessage.contains("Cannot remove reserved property: location")) - - catalog.dropNamespace(testNs) - } - - test("alterNamespace: fail to remove comment") { - val catalog = newCatalog() - - catalog.createNamespace(testNs, Map("comment" -> "test db").asJava) - - val exc = intercept[UnsupportedOperationException] { - catalog.alterNamespace(testNs, NamespaceChange.removeProperty("comment")) } - - assert(exc.getMessage.contains("Cannot remove reserved property: comment")) - catalog.dropNamespace(testNs) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala index 7a8da7e7669a4..4cb845b2487d6 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/debug/DebuggingSuite.scala @@ -19,12 +19,33 @@ package org.apache.spark.sql.execution.debug import java.io.ByteArrayOutputStream +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext +import org.apache.spark.sql.execution.{CodegenSupport, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.test.SQLTestData.TestData +import org.apache.spark.sql.types.StructType class DebuggingSuite extends SharedSparkSession { + + var originalValue: String = _ + // With on AQE, the WholeStageCodegenExec is added when running QueryStageExec. + override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + test("DataFrame.debug()") { testData.debug() } @@ -46,7 +67,7 @@ class DebuggingSuite extends SharedSparkSession { val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) assert(res.length == 2) - assert(res.forall{ case (subtree, code) => + assert(res.forall{ case (subtree, code, _) => subtree.contains("Range") && code.contains("Object[]")}) } @@ -65,7 +86,7 @@ class DebuggingSuite extends SharedSparkSession { """== BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, false])), [id=#x] == |Tuples output: 0 | id LongType: {} - |== WholeStageCodegen == + |== WholeStageCodegen (1) == |Tuples output: 10 | id 
LongType: {java.lang.Long} |== Range (0, 10, step=1, splits=2) == @@ -90,4 +111,41 @@ class DebuggingSuite extends SharedSparkSession { | id LongType: {} |""".stripMargin)) } + + case class DummyCodeGeneratorPlan(useInnerClass: Boolean) + extends CodegenSupport with LeafExecNode { + override def output: Seq[Attribute] = StructType.fromDDL("d int").toAttributes + override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(spark.sparkContext.emptyRDD[InternalRow]) + override protected def doExecute(): RDD[InternalRow] = sys.error("Not used") + override protected def doProduce(ctx: CodegenContext): String = { + if (useInnerClass) { + val innerClassName = ctx.freshName("innerClass") + ctx.addInnerClass( + s""" + |public class $innerClassName { + | public $innerClassName() {} + |} + """.stripMargin) + } + "" + } + } + + test("Prints bytecode statistics in debugCodegen") { + Seq(true, false).foreach { useInnerClass => + val plan = WholeStageCodegenExec(DummyCodeGeneratorPlan(useInnerClass))(codegenStageId = 0) + + val genCodes = codegenStringSeq(plan) + assert(genCodes.length == 1) + val (_, _, codeStats) = genCodes.head + val expectedNumInnerClasses = if (useInnerClass) 1 else 0 + assert(codeStats.maxMethodCodeSize > 0 && codeStats.maxConstPoolSize > 0 && + codeStats.numInnerClasses == expectedNumInnerClasses) + + val debugCodegenStr = codegenString(plan) + assert(debugCodegenStr.contains("maxMethodCodeSize:")) + assert(debugCodegenStr.contains("maxConstantPoolSize:")) + assert(debugCodegenStr.contains("numInnerClasses:")) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilderSuite.scala new file mode 100644 index 0000000000000..5f3d750e8f271 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLEventFilterBuilderSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.history + +import java.util.Properties + +import org.apache.spark.SparkFunSuite +import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution.{SparkPlanInfo, SQLExecution} +import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} +import org.apache.spark.status.ListenerEventsTestHelper + +class SQLEventFilterBuilderSuite extends SparkFunSuite { + import ListenerEventsTestHelper._ + + override protected def beforeEach(): Unit = { + ListenerEventsTestHelper.reset() + } + + test("track live SQL executions") { + var time = 0L + + val listener = new SQLEventFilterBuilder + + listener.onOtherEvent(SparkListenerLogStart("TestSparkVersion")) + + // Start the application. + time += 1 + listener.onApplicationStart(SparkListenerApplicationStart( + "name", + Some("id"), + time, + "user", + Some("attempt"), + None)) + + // Start a couple of executors. 
+ time += 1 + val execIds = Array("1", "2") + execIds.foreach { id => + listener.onExecutorAdded(createExecutorAddedEvent(id, time)) + } + + // Start SQL Execution + listener.onOtherEvent(SparkListenerSQLExecutionStart(1, "desc1", "details1", "plan", + new SparkPlanInfo("node", "str", Seq.empty, Map.empty, Seq.empty), time)) + + time += 1 + + // job 1, 2: coupled with SQL execution 1, finished + val jobProp = createJobProps() + val jobPropWithSqlExecution = new Properties(jobProp) + jobPropWithSqlExecution.setProperty(SQLExecution.EXECUTION_ID_KEY, "1") + val jobInfoForJob1 = pushJobEventsWithoutJobEnd(listener, 1, jobPropWithSqlExecution, + execIds, time) + listener.onJobEnd(SparkListenerJobEnd(1, time, JobSucceeded)) + + val jobInfoForJob2 = pushJobEventsWithoutJobEnd(listener, 2, jobPropWithSqlExecution, + execIds, time) + listener.onJobEnd(SparkListenerJobEnd(2, time, JobSucceeded)) + + // job 3: not coupled with SQL execution 1, finished + pushJobEventsWithoutJobEnd(listener, 3, jobProp, execIds, time) + listener.onJobEnd(SparkListenerJobEnd(3, time, JobSucceeded)) + + // job 4: not coupled with SQL execution 1, not finished + pushJobEventsWithoutJobEnd(listener, 4, jobProp, execIds, time) + listener.onJobEnd(SparkListenerJobEnd(4, time, JobSucceeded)) + + assert(listener.liveSQLExecutions === Set(1)) + + // only SQL executions related jobs are tracked + assert(listener.liveJobs === Set(1, 2)) + assert(listener.liveStages === + (jobInfoForJob1.stageIds ++ jobInfoForJob2.stageIds).toSet) + assert(listener.liveTasks === + (jobInfoForJob1.stageToTaskIds.values.flatten ++ + jobInfoForJob2.stageToTaskIds.values.flatten).toSet) + assert(listener.liveRDDs === + (jobInfoForJob1.stageToRddIds.values.flatten ++ + jobInfoForJob2.stageToRddIds.values.flatten).toSet) + + // End SQL execution + listener.onOtherEvent(SparkListenerSQLExecutionEnd(1, 0)) + + assert(listener.liveSQLExecutions.isEmpty) + assert(listener.liveJobs.isEmpty) + assert(listener.liveStages.isEmpty) + 
assert(listener.liveTasks.isEmpty) + assert(listener.liveRDDs.isEmpty) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLLiveEntitiesEventFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLLiveEntitiesEventFilterSuite.scala new file mode 100644 index 0000000000000..46fdaba413c6e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/history/SQLLiveEntitiesEventFilterSuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.history + +import org.apache.spark.{SparkFunSuite, Success, TaskState} +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution.ui.{SparkListenerDriverAccumUpdates, SparkListenerSQLAdaptiveExecutionUpdate, SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart} +import org.apache.spark.status.ListenerEventsTestHelper.{createRddsWithId, createStage, createTasks} + +class SQLLiveEntitiesEventFilterSuite extends SparkFunSuite { + test("filter in events for jobs related to live SQL execution") { + // assume finished job 1 with stage 1, task (1, 2), rdds (1, 2) and finished sql execution id 1 + // live job 2 with stages 2, tasks (3, 4), rdds (3, 4) and job 2 belongs to the live + // sql execution id 2 + + val liveSQLExecutions = Set(2L) + val liveJobs = Set(2) + val liveStages = Set(2, 3) + val liveTasks = Set(3L, 4L, 5L, 6L) + val liveRDDs = Set(3, 4, 5, 6) + val liveExecutors: Set[String] = Set("1", "2") + + val filter = new SQLLiveEntitiesEventFilter(liveSQLExecutions, liveJobs, liveStages, liveTasks, + liveRDDs) + val acceptFn = filter.acceptFn().lift + + // Verifying with finished SQL execution 1 + assert(Some(false) === acceptFn(SparkListenerSQLExecutionStart(1, "description1", "details1", + "plan", null, 0))) + assert(Some(false) === acceptFn(SparkListenerSQLExecutionEnd(1, 0))) + assert(Some(false) === acceptFn(SparkListenerSQLAdaptiveExecutionUpdate(1, "plan", null))) + assert(Some(false) === acceptFn(SparkListenerDriverAccumUpdates(1, Seq.empty))) + + // Verifying with finished job 1 + val rddsForStage1 = createRddsWithId(1 to 2) + val stage1 = createStage(1, rddsForStage1, Nil) + val tasksForStage1 = createTasks(Seq(1L, 2L), liveExecutors.toArray, 0) + tasksForStage1.foreach { task => task.markFinished(TaskState.FINISHED, 5) } + + val jobStartEventForJob1 = SparkListenerJobStart(1, 0, Seq(stage1)) + val jobEndEventForJob1 = 
SparkListenerJobEnd(1, 0, JobSucceeded) + val stageSubmittedEventsForJob1 = SparkListenerStageSubmitted(stage1) + val stageCompletedEventsForJob1 = SparkListenerStageCompleted(stage1) + val unpersistRDDEventsForJob1 = (1 to 2).map(SparkListenerUnpersistRDD) + + // job events for finished job should be considered as "don't know" + assert(None === acceptFn(jobStartEventForJob1)) + assert(None === acceptFn(jobEndEventForJob1)) + + // stage events for finished job should be considered as "don't know" + assert(None === acceptFn(stageSubmittedEventsForJob1)) + assert(None === acceptFn(stageCompletedEventsForJob1)) + unpersistRDDEventsForJob1.foreach { event => + assert(None === acceptFn(event)) + } + + val taskSpeculativeTaskSubmittedEvent = SparkListenerSpeculativeTaskSubmitted(stage1.stageId, + stageAttemptId = 1) + assert(None === acceptFn(taskSpeculativeTaskSubmittedEvent)) + + // task events for finished job should be considered as "don't know" + tasksForStage1.foreach { task => + val taskStartEvent = SparkListenerTaskStart(stage1.stageId, 0, task) + assert(None === acceptFn(taskStartEvent)) + + val taskGettingResultEvent = SparkListenerTaskGettingResult(task) + assert(None === acceptFn(taskGettingResultEvent)) + + val taskEndEvent = SparkListenerTaskEnd(stage1.stageId, 0, "taskType", + Success, task, new ExecutorMetrics, null) + assert(None === acceptFn(taskEndEvent)) + } + + // Verifying with live SQL execution 2 + assert(Some(true) === acceptFn(SparkListenerSQLExecutionStart(2, "description2", "details2", + "plan", null, 0))) + assert(Some(true) === acceptFn(SparkListenerSQLExecutionEnd(2, 0))) + assert(Some(true) === acceptFn(SparkListenerSQLAdaptiveExecutionUpdate(2, "plan", null))) + assert(Some(true) === acceptFn(SparkListenerDriverAccumUpdates(2, Seq.empty))) + + // Verifying with live job 2 + val rddsForStage2 = createRddsWithId(3 to 4) + val stage2 = createStage(2, rddsForStage2, Nil) + val tasksForStage2 = createTasks(Seq(3L, 4L), liveExecutors.toArray, 
0) + tasksForStage2.foreach { task => task.markFinished(TaskState.FINISHED, 5) } + + val jobStartEventForJob2 = SparkListenerJobStart(2, 0, Seq(stage2)) + val stageSubmittedEventsForJob2 = SparkListenerStageSubmitted(stage2) + val stageCompletedEventsForJob2 = SparkListenerStageCompleted(stage2) + val unpersistRDDEventsForJob2 = rddsForStage2.map { rdd => SparkListenerUnpersistRDD(rdd.id) } + + // job events for live job should be accepted + assert(Some(true) === acceptFn(jobStartEventForJob2)) + + // stage events for live job should be accepted + assert(Some(true) === acceptFn(stageSubmittedEventsForJob2)) + assert(Some(true) === acceptFn(stageCompletedEventsForJob2)) + unpersistRDDEventsForJob2.foreach { event => + assert(Some(true) === acceptFn(event)) + } + + val taskSpeculativeTaskSubmittedEvent2 = SparkListenerSpeculativeTaskSubmitted(stage2.stageId, + stageAttemptId = 1) + assert(Some(true) === acceptFn(taskSpeculativeTaskSubmittedEvent2)) + + // task events for live job should be accepted + tasksForStage2.foreach { task => + val taskStartEvent = SparkListenerTaskStart(stage2.stageId, 0, task) + assert(Some(true) === acceptFn(taskStartEvent)) + + val taskGettingResultEvent = SparkListenerTaskGettingResult(task) + assert(Some(true) === acceptFn(taskGettingResultEvent)) + + val taskEndEvent = SparkListenerTaskEnd(stage2.stageId, 0, "taskType", + Success, task, new ExecutorMetrics, null) + assert(Some(true) === acceptFn(taskEndEvent)) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 91cb919479bfa..5ce758e1e4eb8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession} import 
org.apache.spark.sql.catalyst.expressions.{BitwiseAnd, BitwiseOr, Cast, Literal, ShiftLeft} import org.apache.spark.sql.catalyst.plans.logical.BROADCAST import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.functions._ @@ -38,7 +39,7 @@ import org.apache.spark.sql.types.{LongType, ShortType} * unsafe map in [[org.apache.spark.sql.execution.joins.UnsafeHashedRelation]] is not triggered * without serializing the hashed relation, which does not happen in local mode. */ -class BroadcastJoinSuite extends QueryTest with SQLTestUtils { +class BroadcastJoinSuite extends QueryTest with SQLTestUtils with AdaptiveSparkPlanHelper { import testImplicits._ protected var spark: SparkSession = null @@ -122,7 +123,7 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { val df2 = Seq((1, "1"), (2, "2")).toDF("key", "value") df2.cache() val df3 = df1.join(broadcast(df2), Seq("key"), "inner") - val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + val numBroadCastHashJoin = collect(df3.queryExecution.executedPlan) { case b: BroadcastHashJoinExec => b }.size assert(numBroadCastHashJoin === 1) @@ -140,13 +141,13 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { broadcast(df2).cache() val df3 = df1.join(df2, Seq("key"), "inner") - val numCachedPlan = df3.queryExecution.executedPlan.collect { + val numCachedPlan = collect(df3.queryExecution.executedPlan) { case i: InMemoryTableScanExec => i }.size // df2 should be cached. assert(numCachedPlan === 1) - val numBroadCastHashJoin = df3.queryExecution.executedPlan.collect { + val numBroadCastHashJoin = collect(df3.queryExecution.executedPlan) { case b: BroadcastHashJoinExec => b }.size // df2 should not be broadcasted. 
@@ -272,7 +273,6 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { } test("Shouldn't change broadcast join buildSide if user clearly specified") { - withTempView("t1", "t2") { Seq((1, "4"), (2, "2")).toDF("key", "value").createTempView("t1") Seq((1, "1"), (2, "12.3"), (2, "123")).toDF("key", "value").createTempView("t2") @@ -378,7 +378,7 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { private val bl = BroadcastNestedLoopJoinExec.toString private def assertJoinBuildSide(sqlStr: String, joinMethod: String, buildSide: BuildSide): Any = { - val executedPlan = sql(sqlStr).queryExecution.executedPlan + val executedPlan = stripAQEPlan(sql(sqlStr).queryExecution.executedPlan) executedPlan match { case b: BroadcastNestedLoopJoinExec => assert(b.getClass.getSimpleName === joinMethod) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index e7f1c42d7d7c5..7d09577075d5d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.execution.{FilterExec, RangeExec, SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.execution.aggregate.HashAggregateExec +import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -35,6 +36,19 @@ import org.apache.spark.util.{AccumulatorContext, JsonProtocol} class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { import testImplicits._ + var originalValue: String = _ + // With AQE on/off, the metric 
info is different. + override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + /** * Generates a `DataFrame` by filling randomly generated bytes for hash collision. */ @@ -83,9 +97,10 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { // TODO: update metrics in generated operators val ds = spark.range(10).filter('id < 5) testSparkPlanMetricsWithPredicates(ds.toDF(), 1, Map( - 0L -> (("WholeStageCodegen", Map( - "duration total (min, med, max)" -> {_.toString.matches(timingMetricPattern)}))) - ), true) + 0L -> (("WholeStageCodegen (1)", Map( + "duration total (min, med, max (stageId (attemptId): taskId))" -> { + _.toString.matches(timingMetricPattern) + })))), true) } test("Aggregate metrics") { @@ -95,9 +110,11 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df = testData2.groupBy().count() // 2 partitions val expected1 = Seq( Map("number of output rows" -> 2L, - "avg hash probe bucket list iters (min, med, max)" -> "\n(1, 1, 1)"), + "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + aggregateMetricsPattern), Map("number of output rows" -> 1L, - "avg hash probe bucket list iters (min, med, max)" -> "\n(1, 1, 1)")) + "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + aggregateMetricsPattern)) val shuffleExpected1 = Map( "records read" -> 2L, "local blocks read" -> 2L, @@ -113,9 +130,12 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df2 = testData2.groupBy('a).count() val expected2 = Seq( Map("number of output rows" -> 4L, - "avg hash probe bucket list iters (min, med, max)" -> "\n(1, 1, 1)"), + "avg hash probe bucket list iters 
(min, med, max (stageId (attemptId): taskId))" -> + aggregateMetricsPattern), Map("number of output rows" -> 3L, - "avg hash probe bucket list iters (min, med, max)" -> "\n(1, 1, 1)")) + "avg hash probe bucket list iters (min, med, max (stageId (attemptId): taskId))" -> + aggregateMetricsPattern)) + val shuffleExpected2 = Map( "records read" -> 4L, "local blocks read" -> 4L, @@ -161,9 +181,12 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { } val metrics = getSparkPlanMetrics(df, 1, nodeIds, enableWholeStage).get nodeIds.foreach { nodeId => - val probes = metrics(nodeId)._2("avg hash probe bucket list iters (min, med, max)") - probes.toString.stripPrefix("\n(").stripSuffix(")").split(", ").foreach { probe => - assert(probe.toDouble > 1.0) + val probes = metrics(nodeId)._2("avg hash probe bucket list iters (min, med, max (stageId" + + " (attemptId): taskId))") + // Extract min, med, max from the string and strip off everthing else. + val index = probes.toString.stripPrefix("\n(").stripSuffix(")").indexOf(" (", 0) + probes.toString.stripPrefix("\n(").stripSuffix(")").slice(0, index).split(", ").foreach { + probe => assert(probe.toDouble > 1.0) } } } @@ -208,9 +231,15 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { val df = Seq(1, 3, 2).toDF("id").sort('id) testSparkPlanMetricsWithPredicates(df, 2, Map( 0L -> (("Sort", Map( - "sort time total (min, med, max)" -> {_.toString.matches(timingMetricPattern)}, - "peak memory total (min, med, max)" -> {_.toString.matches(sizeMetricPattern)}, - "spill size total (min, med, max)" -> {_.toString.matches(sizeMetricPattern)}))) + "sort time total (min, med, max (stageId (attemptId): taskId))" -> { + _.toString.matches(timingMetricPattern) + }, + "peak memory total (min, med, max (stageId (attemptId): taskId))" -> { + _.toString.matches(sizeMetricPattern) + }, + "spill size total (min, med, max (stageId (attemptId): taskId))" -> { + _.toString.matches(sizeMetricPattern) 
+ }))) )) } @@ -388,7 +417,7 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { // Assume the execution plan is // PhysicalRDD(nodeId = 0) data.write.format("json").save(file.getAbsolutePath) - sparkContext.listenerBus.waitUntilEmpty(10000) + sparkContext.listenerBus.waitUntilEmpty() val executionIds = currentExecutionIds().diff(previousExecutionIds) assert(executionIds.size === 1) val executionId = executionIds.head @@ -598,4 +627,29 @@ class SQLMetricsSuite extends SharedSparkSession with SQLMetricsTestUtils { Map(1L -> (("InMemoryTableScan", Map.empty))) ) } + + test("SPARK-28332: SQLMetric merge should handle -1 properly") { + def checkSparkPlanMetrics(plan: SparkPlan, expected: Map[String, Long]): Unit = { + expected.foreach { case (metricName: String, metricValue: Long) => + assert(plan.metrics.contains(metricName), s"The query plan should have metric $metricName") + val actualMetric = plan.metrics.get(metricName).get + assert(actualMetric.value == metricValue, + s"The query plan metric $metricName did not match, " + + s"expected:$metricValue, actual:${actualMetric.value}") + } + } + + val df = testData.join(testData2.filter('b === 0), $"key" === $"a", "left_outer") + df.collect() + val plan = df.queryExecution.executedPlan + + val exchanges = plan.collect { + case s: ShuffleExchangeExec => s + } + + assert(exchanges.size == 2, "The query plan should have two shuffle exchanges") + + checkSparkPlanMetrics(exchanges(0), Map("dataSize" -> 3200, "shuffleRecordsWritten" -> 100)) + checkSparkPlanMetrics(exchanges(1), Map("dataSize" -> 0, "shuffleRecordsWritten" -> 0)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala index 8f26c04307adc..0c1148f7b82e4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala @@ -41,16 +41,28 @@ trait SQLMetricsTestUtils extends SQLTestUtils { protected def statusStore: SQLAppStatusStore = spark.sharedState.statusStore - // Pattern of size SQLMetric value, e.g. "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)" + // Pattern of size SQLMetric value, e.g. "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB (stage 0 + // (attempt 0): task 4))" OR "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)" protected val sizeMetricPattern = { val bytes = "([0-9]+(\\.[0-9]+)?) (EiB|PiB|TiB|GiB|MiB|KiB|B)" - s"\\n$bytes \\($bytes, $bytes, $bytes\\)" + val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" + s"\\n$bytes \\($bytes, $bytes, $bytes( $maxMetrics)?\\)" } - // Pattern of timing SQLMetric value, e.g. "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)" + // Pattern of timing SQLMetric value, e.g. "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms (stage 3 (attempt + // 0): task 217))" OR "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)" protected val timingMetricPattern = { val duration = "([0-9]+(\\.[0-9]+)?) (ms|s|m|h)" - s"\\n$duration \\($duration, $duration, $duration\\)" + val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" + s"\\n$duration \\($duration, $duration, $duration( $maxMetrics)?\\)" + } + + // Pattern of size SQLMetric value for Aggregate tests. 
+ // e.g "\n(1, 1, 0.9 (stage 1 (attempt 0): task 8)) OR "\n(1, 1, 0.9 )" + protected val aggregateMetricsPattern = { + val iters = "([0-9]+(\\.[0-9]+)?)" + val maxMetrics = "\\(stage ([0-9])+ \\(attempt ([0-9])+\\)\\: task ([0-9])+\\)" + s"\\n\\($iters, $iters, $iters( $maxMetrics)?\\)" } /** @@ -86,7 +98,7 @@ trait SQLMetricsTestUtils extends SQLTestUtils { } val totalNumBytesMetric = executedNode.metrics.find( - _.name == "written output total (min, med, max)").get + _.name == "written output total (min, med, max (stageId (attemptId): taskId))").get val totalNumBytes = metrics(totalNumBytesMetric.accumulatorId).replaceAll(",", "") .split(" ").head.trim.toDouble assert(totalNumBytes > 0) @@ -115,29 +127,31 @@ trait SQLMetricsTestUtils extends SQLTestUtils { provider: String, dataFormat: String, tableName: String): Unit = { - withTempPath { dir => - spark.sql( - s""" - |CREATE TABLE $tableName(a int, b int) - |USING $provider - |PARTITIONED BY(a) - |LOCATION '${dir.toURI}' - """.stripMargin) - val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) - assert(table.location == makeQualifiedPath(dir.getAbsolutePath)) - - val df = spark.range(start = 0, end = 40, step = 1, numPartitions = 1) - .selectExpr("id a", "id b") - - // 40 files, 80 rows, 40 dynamic partitions. - verifyWriteDataMetrics(Seq(40, 40, 80)) { - df.union(df).repartition(2, $"a") - .write - .format(dataFormat) - .mode("overwrite") - .insertInto(tableName) + withTable(tableName) { + withTempPath { dir => + spark.sql( + s""" + |CREATE TABLE $tableName(a int, b int) + |USING $provider + |PARTITIONED BY(a) + |LOCATION '${dir.toURI}' + """.stripMargin) + val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier(tableName)) + assert(table.location == makeQualifiedPath(dir.getAbsolutePath)) + + val df = spark.range(start = 0, end = 40, step = 1, numPartitions = 1) + .selectExpr("id a", "id b") + + // 40 files, 80 rows, 40 dynamic partitions. 
+ verifyWriteDataMetrics(Seq(40, 40, 80)) { + df.union(df).repartition(2, $"a") + .write + .format(dataFormat) + .mode("overwrite") + .insertInto(tableName) + } + assert(TestUtils.recursiveList(dir).count(_.getName.startsWith("part-")) == 40) } - assert(TestUtils.recursiveList(dir).count(_.getName.startsWith("part-")) == 40) } } @@ -203,7 +217,9 @@ trait SQLMetricsTestUtils extends SQLTestUtils { expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = { val expectedMetricsPredicates = expectedMetrics.mapValues { case (nodeName, nodeMetrics) => (nodeName, nodeMetrics.mapValues(expectedMetricValue => - (actualMetricValue: Any) => expectedMetricValue.toString === actualMetricValue)) + (actualMetricValue: Any) => { + actualMetricValue.toString.matches(expectedMetricValue.toString) + })) } testSparkPlanMetricsWithPredicates(df, expectedNumOfJobs, expectedMetricsPredicates) } @@ -230,7 +246,8 @@ trait SQLMetricsTestUtils extends SQLTestUtils { val (actualNodeName, actualMetricsMap) = actualMetrics(nodeId) assert(expectedNodeName === actualNodeName) for ((metricName, metricPredicate) <- expectedMetricsPredicatesMap) { - assert(metricPredicate(actualMetricsMap(metricName))) + assert(metricPredicate(actualMetricsMap(metricName)), + s"$nodeId / '$metricName' (= ${actualMetricsMap(metricName)}) did not match predicate.") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index d26989b00a651..5fe3d6a71167e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -24,10 +24,13 @@ import org.apache.spark.api.python.{PythonEvalType, PythonFunction} import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.expressions.{And, 
AttributeReference, GreaterThan, In} import org.apache.spark.sql.execution.{FilterExec, InputAdapter, SparkPlanTest, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{BooleanType, DoubleType} -class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSparkSession { +class BatchEvalPythonExecSuite extends SparkPlanTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits.newProductEncoder import testImplicits.localSeqToDatasetHolder @@ -95,7 +98,7 @@ class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSparkSession { val df = Seq(("Hello", 4)).toDF("a", "b") val df2 = Seq(("Hello", 4)).toDF("c", "d") val joinDF = df.crossJoin(df2).where("dummyPythonUDF(a, c) == dummyPythonUDF(d, c)") - val qualifiedPlanNodes = joinDF.queryExecution.executedPlan.collect { + val qualifiedPlanNodes = collect(joinDF.queryExecution.executedPlan) { case b: BatchEvalPythonExec => b } assert(qualifiedPlanNodes.size == 1) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala index d02014c0dee54..61c9782bd175d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/PythonForeachWriterSuite.scala @@ -21,8 +21,8 @@ import scala.collection.mutable.ArrayBuffer import org.mockito.Mockito.when import org.scalatest.concurrent.Eventually -import org.scalatest.mockito.MockitoSugar import org.scalatest.time.SpanSugar._ +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.memory.{TaskMemoryManager, TestMemoryManager} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala index 1ec9986328429..06077c94b66fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/RowQueueSuite.scala @@ -33,7 +33,7 @@ class RowQueueSuite extends SparkFunSuite with EncryptionFunSuite { test("in-memory queue") { val page = MemoryBlock.fromLongArray(new Array[Long](1<<10)) val queue = new InMemoryRowQueue(page, 1) { - override def close() {} + override def close(): Unit = {} } val row = new UnsafeRow(1) row.pointTo(new Array[Byte](16), 16) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProviderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProviderSuite.scala index ef88598fcb11b..6440e69e2ec23 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProviderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamProviderSuite.scala @@ -24,12 +24,12 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream} import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.functions._ -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset, SparkDataStream} import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala index e1284ea03267e..5c66fc52592b3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/sources/TextSocketStreamSuite.scala @@ -29,12 +29,12 @@ import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.connector.read.streaming.{Offset, SparkDataStream} import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset, SparkDataStream} import org.apache.spark.sql.streaming.{StreamingQueryException, StreamTest} import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -42,7 +42,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap class TextSocketStreamSuite extends StreamTest with SharedSparkSession { - override def afterEach() { + override def afterEach(): Unit = { sqlContext.streams.active.foreach(_.stop()) if (serverThread != null) { serverThread.interrupt() @@ -194,13 +194,12 @@ class TextSocketStreamSuite extends StreamTest with SharedSparkSession { } test("user-specified schema given") { - val provider = new TextSocketSourceProvider val userSpecifiedSchema = StructType( StructField("name", StringType) :: StructField("area", StringType) :: Nil) val params = Map("host" -> "localhost", "port" -> "1234") val exception = intercept[UnsupportedOperationException] { - provider.getTable(new CaseInsensitiveStringMap(params.asJava), userSpecifiedSchema) + 
spark.readStream.schema(userSpecifiedSchema).format("socket").options(params).load() } assert(exception.getMessage.contains( "TextSocketSourceProvider source does not support user-specified schema")) @@ -318,7 +317,7 @@ class TextSocketStreamSuite extends StreamTest with SharedSparkSession { for (i <- 0 until numRecords / 2) { r.next() offsets.append(r.getOffset().asInstanceOf[ContinuousRecordPartitionOffset].offset) - data.append(r.get().get(0, DataTypes.StringType).asInstanceOf[String].toInt) + data.append(r.get().getString(0).toInt) // commit the offsets in the middle and validate if processing continues if (i == 2) { commitOffset(t.partitionId, i + 1) @@ -381,7 +380,10 @@ class TextSocketStreamSuite extends StreamTest with SharedSparkSession { val r = readerFactory.createReader(t).asInstanceOf[TextSocketContinuousPartitionReader] for (_ <- 0 until numRecords / 2) { r.next() - assert(r.get().get(0, TextSocketReader.SCHEMA_TIMESTAMP).isInstanceOf[(_, _)]) + assert(r.get().numFields === 2) + // just try to read columns one by one - it would throw error if the row is corrupted + r.get().getString(0) + r.get().getLong(1) } case _ => throw new IllegalStateException("Unexpected task type") } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala index a84d107f2cbc0..488879938339d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/StateStoreSuite.scala @@ -406,7 +406,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] var latestStoreVersion = 0 - def generateStoreVersions() { + def generateStoreVersions(): Unit = { for (i <- 1 to 20) { val store = StateStore.get(storeProviderId, keySchema, valueSchema, None, latestStoreVersion, storeConf, hadoopConf) @@ -586,7 
+586,8 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] query.processAllAvailable() require(query.lastProgress != null) // at least one batch processed after start val loadedProvidersMethod = - PrivateMethod[mutable.HashMap[StateStoreProviderId, StateStoreProvider]]('loadedProviders) + PrivateMethod[mutable.HashMap[StateStoreProviderId, StateStoreProvider]]( + Symbol("loadedProviders")) val loadedProvidersMap = StateStore invokePrivate loadedProvidersMethod() val loadedProviders = loadedProvidersMap.synchronized { loadedProvidersMap.values.toSeq } query.stop() @@ -781,7 +782,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] provider: HDFSBackedStateStoreProvider, version: Long, isSnapshot: Boolean): Boolean = { - val method = PrivateMethod[Path]('baseDir) + val method = PrivateMethod[Path](Symbol("baseDir")) val basePath = provider invokePrivate method() val fileName = if (isSnapshot) s"$version.snapshot" else s"$version.delta" val filePath = new File(basePath.toString, fileName) @@ -789,7 +790,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] } def deleteFilesEarlierThanVersion(provider: HDFSBackedStateStoreProvider, version: Long): Unit = { - val method = PrivateMethod[Path]('baseDir) + val method = PrivateMethod[Path](Symbol("baseDir")) val basePath = provider invokePrivate method() for (version <- 0 until version.toInt) { for (isSnapshot <- Seq(false, true)) { @@ -804,7 +805,7 @@ class StateStoreSuite extends StateStoreSuiteBase[HDFSBackedStateStoreProvider] provider: HDFSBackedStateStoreProvider, version: Long, isSnapshot: Boolean): Unit = { - val method = PrivateMethod[Path]('baseDir) + val method = PrivateMethod[Path](Symbol("baseDir")) val basePath = provider invokePrivate method() val fileName = if (isSnapshot) s"$version.snapshot" else s"$version.delta" val filePath = new File(basePath.toString, fileName) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala index c0216a2ef3e61..ce1eabeb932fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/state/SymmetricHashJoinStateManagerSuite.scala @@ -38,9 +38,14 @@ class SymmetricHashJoinStateManagerSuite extends StreamTest with BeforeAndAfter spark.streams.stateStoreCoordinator // initialize the lazy coordinator } + SymmetricHashJoinStateManager.supportedVersions.foreach { version => + test(s"StreamingJoinStateManager V${version} - all operations") { + testAllOperations(version) + } + } - test("SymmetricHashJoinStateManager - all operations") { - withJoinStateManager(inputValueAttribs, joinKeyExprs) { manager => + private def testAllOperations(stateFormatVersion: Int): Unit = { + withJoinStateManager(inputValueAttribs, joinKeyExprs, stateFormatVersion) { manager => implicit val mgr = manager assert(get(20) === Seq.empty) // initially empty @@ -123,7 +128,8 @@ class SymmetricHashJoinStateManagerSuite extends StreamTest with BeforeAndAfter def toValueInt(inputValueRow: UnsafeRow): Int = inputValueRow.getInt(0) def append(key: Int, value: Int)(implicit manager: SymmetricHashJoinStateManager): Unit = { - manager.append(toJoinKeyRow(key), toInputValue(value)) + // we only put matched = false for simplicity - StreamingJoinSuite will test the functionality + manager.append(toJoinKeyRow(key), toInputValue(value), matched = false) } def get(key: Int)(implicit manager: SymmetricHashJoinStateManager): Seq[Int] = { @@ -156,13 +162,15 @@ class SymmetricHashJoinStateManagerSuite extends StreamTest with BeforeAndAfter def withJoinStateManager( inputValueAttribs: Seq[Attribute], - joinKeyExprs: Seq[Expression])(f: 
SymmetricHashJoinStateManager => Unit): Unit = { + joinKeyExprs: Seq[Expression], + stateFormatVersion: Int)(f: SymmetricHashJoinStateManager => Unit): Unit = { withTempDir { file => val storeConf = new StateStoreConf() val stateInfo = StatefulOperatorStateInfo(file.getAbsolutePath, UUID.randomUUID, 0, 0, 5) val manager = new SymmetricHashJoinStateManager( - LeftSide, inputValueAttribs, joinKeyExprs, Some(stateInfo), storeConf, new Configuration) + LeftSide, inputValueAttribs, joinKeyExprs, Some(stateInfo), storeConf, new Configuration, + partitionId = 0, stateFormatVersion) try { f(manager) } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala index 9e42056c19a0c..298afa880c930 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/AllExecutionsPageSuite.scala @@ -73,7 +73,7 @@ class AllExecutionsPageSuite extends SharedSparkSession with BeforeAndAfter { map.put("failed.sort", Array("duration")) when(request.getParameterMap()).thenReturn(map) val html = renderSQLPage(request, tab, statusStore).toString().toLowerCase(Locale.ROOT) - assert(!html.contains("IllegalArgumentException")) + assert(!html.contains("illegalargumentexception")) assert(html.contains("duration")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala new file mode 100644 index 0000000000000..c09ff51ecaff2 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/MetricsAggregationBenchmark.scala @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.ui + +import java.util.Properties +import java.util.concurrent.atomic.AtomicInteger + +import scala.collection.mutable +import scala.concurrent.duration._ + +import org.apache.spark.{SparkConf, TaskState} +import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.executor.ExecutorMetrics +import org.apache.spark.internal.config.Status._ +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.scheduler._ +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.metric.SQLMetricInfo +import org.apache.spark.status.ElementTrackingStore +import org.apache.spark.util.{AccumulatorMetadata, LongAccumulator, Utils} +import org.apache.spark.util.kvstore.InMemoryStore + +/** + * Benchmark for metrics aggregation in the SQL listener. + * {{{ + * To run this benchmark: + * 1. without sbt: bin/spark-submit --class --jars + * 2. build/sbt "core/test:runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "core/test:runMain " + * Results will be written to "benchmarks/MetricsAggregationBenchmark-results.txt". 
+ * }}} + */ +object MetricsAggregationBenchmark extends BenchmarkBase { + + private def metricTrackingBenchmark( + timer: Benchmark.Timer, + numMetrics: Int, + numTasks: Int, + numStages: Int): Measurements = { + val conf = new SparkConf() + .set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + .set(ASYNC_TRACKING_ENABLED, false) + val kvstore = new ElementTrackingStore(new InMemoryStore(), conf) + val listener = new SQLAppStatusListener(conf, kvstore, live = true) + val store = new SQLAppStatusStore(kvstore, Some(listener)) + + val metrics = (0 until numMetrics).map { i => + new SQLMetricInfo(s"metric$i", i.toLong, "average") + } + + val planInfo = new SparkPlanInfo( + getClass().getName(), + getClass().getName(), + Nil, + Map.empty, + metrics) + + val idgen = new AtomicInteger() + val executionId = idgen.incrementAndGet() + val executionStart = SparkListenerSQLExecutionStart( + executionId, + getClass().getName(), + getClass().getName(), + getClass().getName(), + planInfo, + System.currentTimeMillis()) + + val executionEnd = SparkListenerSQLExecutionEnd(executionId, System.currentTimeMillis()) + + val properties = new Properties() + properties.setProperty(SQLExecution.EXECUTION_ID_KEY, executionId.toString) + + timer.startTiming() + listener.onOtherEvent(executionStart) + + val taskEventsTime = (0 until numStages).map { _ => + val stageInfo = new StageInfo(idgen.incrementAndGet(), 0, getClass().getName(), + numTasks, Nil, Nil, getClass().getName(), + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) + + val jobId = idgen.incrementAndGet() + val jobStart = SparkListenerJobStart( + jobId = jobId, + time = System.currentTimeMillis(), + stageInfos = Seq(stageInfo), + properties) + + val stageStart = SparkListenerStageSubmitted(stageInfo) + + val taskOffset = idgen.incrementAndGet().toLong + val taskEvents = (0 until numTasks).map { i => + val info = new TaskInfo( + taskId = taskOffset + i.toLong, + index = i, + attemptNumber = 0, + // The following fields are not 
used. + launchTime = 0, + executorId = "", + host = "", + taskLocality = null, + speculative = false) + info.markFinished(TaskState.FINISHED, 1L) + + val accumulables = (0 until numMetrics).map { mid => + val acc = new LongAccumulator + acc.metadata = AccumulatorMetadata(mid, None, false) + acc.toInfo(Some(i.toLong), None) + } + + info.setAccumulables(accumulables) + + val start = SparkListenerTaskStart(stageInfo.stageId, stageInfo.attemptNumber, info) + val end = SparkListenerTaskEnd(stageInfo.stageId, stageInfo.attemptNumber, + taskType = "", + reason = null, + info, + new ExecutorMetrics(), + null) + + (start, end) + } + + val jobEnd = SparkListenerJobEnd( + jobId = jobId, + time = System.currentTimeMillis(), + JobSucceeded) + + listener.onJobStart(jobStart) + listener.onStageSubmitted(stageStart) + + val (_, _taskEventsTime) = Utils.timeTakenMs { + taskEvents.foreach { case (start, end) => + listener.onTaskStart(start) + listener.onTaskEnd(end) + } + } + + listener.onJobEnd(jobEnd) + _taskEventsTime + } + + val (_, aggTime) = Utils.timeTakenMs { + listener.onOtherEvent(executionEnd) + val metrics = store.executionMetrics(executionId) + assert(metrics.size == numMetrics, s"${metrics.size} != $numMetrics") + } + + timer.stopTiming() + kvstore.close() + + Measurements(taskEventsTime, aggTime) + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + val metricCount = 50 + val taskCount = 100000 + val stageCounts = Seq(1, 2, 3) + + val benchmark = new Benchmark( + s"metrics aggregation ($metricCount metrics, $taskCount tasks per stage)", 1, + warmupTime = 0.seconds, output = output) + + // Run this outside the measurement code so that classes are loaded and JIT is triggered, + // otherwise the first run tends to be much slower than others. Also because this benchmark is a + // bit weird and doesn't really map to what the Benchmark class expects, so it's a bit harder + // to use warmupTime and friends effectively. 
+ stageCounts.foreach { count => + metricTrackingBenchmark(new Benchmark.Timer(-1), metricCount, taskCount, count) + } + + val measurements = mutable.HashMap[Int, Seq[Measurements]]() + + stageCounts.foreach { count => + benchmark.addTimerCase(s"$count stage(s)") { timer => + val m = metricTrackingBenchmark(timer, metricCount, taskCount, count) + val all = measurements.getOrElse(count, Nil) + measurements(count) = all ++ Seq(m) + } + } + + benchmark.run() + + benchmark.out.printf("Stage Count Stage Proc. Time Aggreg. Time\n") + stageCounts.foreach { count => + val data = measurements(count) + val eventsTimes = data.flatMap(_.taskEventsTimes) + val aggTimes = data.map(_.aggregationTime) + + val msg = " %d %d %d\n".format( + count, + eventsTimes.sum / eventsTimes.size, + aggTimes.sum / aggTimes.size) + benchmark.out.printf(msg) + } + } + + /** + * Finer-grained measurements of how long it takes to run some parts of the benchmark. This is + * collected by the benchmark method, so this collection slightly affects the overall benchmark + * results, but this data helps with seeing where the time is going, since this benchmark is + * triggering a whole lot of code in the listener class. 
+ */ + case class Measurements( + taskEventsTimes: Seq[Long], + aggregationTime: Long) +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 90966d2efec23..d18a35c3110f9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.executor.ExecutorMetrics import org.apache.spark.internal.config import org.apache.spark.internal.config.Status._ import org.apache.spark.rdd.RDD +import org.apache.spark.resource.ResourceProfile import org.apache.spark.scheduler._ import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -38,6 +39,8 @@ import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlanInfo, SQLExecution} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.functions.count +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.UI_RETAINED_EXECUTIONS import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.status.ElementTrackingStore @@ -79,12 +82,13 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils private def createStageInfo(stageId: Int, attemptId: Int): StageInfo = { new StageInfo(stageId = stageId, attemptId = attemptId, + numTasks = 8, // The following fields are not used in tests name = "", - numTasks = 0, rddInfos = Nil, parentIds = Nil, - details = "") + details = "", + resourceProfileId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) } private def createTaskInfo( @@ -94,8 +98,8 @@ class SQLAppStatusListenerSuite 
extends SharedSparkSession with JsonTestUtils val info = new TaskInfo( taskId = taskId, attemptNumber = attemptNumber, + index = taskId.toInt, // The following fields are not used in tests - index = 0, launchTime = 0, executorId = "", host = "", @@ -190,6 +194,8 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils ), createProperties(executionId))) listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 0))) + listener.onTaskStart(SparkListenerTaskStart(0, 0, createTaskInfo(0, 0))) + listener.onTaskStart(SparkListenerTaskStart(0, 0, createTaskInfo(1, 0))) assert(statusStore.executionMetrics(executionId).isEmpty) @@ -217,6 +223,8 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils // Retrying a stage should reset the metrics listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 1))) + listener.onTaskStart(SparkListenerTaskStart(0, 1, createTaskInfo(0, 0))) + listener.onTaskStart(SparkListenerTaskStart(0, 1, createTaskInfo(1, 0))) listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) @@ -260,6 +268,8 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils // Summit a new stage listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(1, 0))) + listener.onTaskStart(SparkListenerTaskStart(1, 0, createTaskInfo(0, 0))) + listener.onTaskStart(SparkListenerTaskStart(1, 0, createTaskInfo(1, 0))) listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) @@ -480,7 +490,7 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils // At the beginning of this test case, there should be no live data in the listener. 
assert(listener.noLiveData()) spark.sparkContext.parallelize(1 to 10).foreach(i => ()) - spark.sparkContext.listenerBus.waitUntilEmpty(10000) + spark.sparkContext.listenerBus.waitUntilEmpty() // Listener should ignore the non-SQL stages, as the stage data are only removed when SQL // execution ends, which will not be triggered for non-SQL jobs. assert(listener.noLiveData()) @@ -490,15 +500,15 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils val statusStore = spark.sharedState.statusStore val oldCount = statusStore.executionsList().size - val expectedAccumValue = 12345 - val expectedAccumValue2 = 54321 + val expectedAccumValue = 12345L + val expectedAccumValue2 = 54321L val physicalPlan = MyPlan(sqlContext.sparkContext, expectedAccumValue, expectedAccumValue2) val dummyQueryExecution = new QueryExecution(spark, LocalRelation()) { override lazy val sparkPlan = physicalPlan override lazy val executedPlan = physicalPlan } - SQLExecution.withNewExecutionId(spark, dummyQueryExecution) { + SQLExecution.withNewExecutionId(dummyQueryExecution) { physicalPlan.execute().collect() } @@ -517,8 +527,10 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils val metrics = statusStore.executionMetrics(execId) val driverMetric = physicalPlan.metrics("dummy") val driverMetric2 = physicalPlan.metrics("dummy2") - val expectedValue = SQLMetrics.stringValue(driverMetric.metricType, Seq(expectedAccumValue)) - val expectedValue2 = SQLMetrics.stringValue(driverMetric2.metricType, Seq(expectedAccumValue2)) + val expectedValue = SQLMetrics.stringValue(driverMetric.metricType, + Array(expectedAccumValue), Array.empty[Long]) + val expectedValue2 = SQLMetrics.stringValue(driverMetric2.metricType, + Array(expectedAccumValue2), Array.empty[Long]) assert(metrics.contains(driverMetric.id)) assert(metrics(driverMetric.id) === expectedValue) @@ -609,6 +621,15 @@ class SQLAppStatusListenerSuite extends SharedSparkSession with JsonTestUtils 
assert(statusStore.executionsCount === 2) assert(statusStore.execution(2) === None) } + + test("SPARK-29894 test Codegen Stage Id in SparkPlanInfo") { + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + // with AQE on, the WholeStageCodegen rule is applied when running QueryStageExec. + val df = createTestDataFrame.select(count("*")) + val sparkPlanInfo = SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan) + assert(sparkPlanInfo.nodeName === "WholeStageCodegen (2)") + } + } } @@ -673,7 +694,7 @@ class SQLAppStatusListenerMemoryLeakSuite extends SparkFunSuite { case e: SparkException => // This is expected for a failed job } } - sc.listenerBus.waitUntilEmpty(10000) + sc.listenerBus.waitUntilEmpty() val statusStore = spark.sharedState.statusStore assert(statusStore.executionsCount() <= 50) assert(statusStore.planGraphCount() <= 50) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 758780c80b284..37d028d6a713f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -636,30 +636,24 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(column.arrayData().elementsAppended == 0) } - testVector("CalendarInterval APIs", 4, CalendarIntervalType) { + testVector("CalendarInterval APIs", 5, CalendarIntervalType) { column => val reference = mutable.ArrayBuffer.empty[CalendarInterval] val months = column.getChild(0) - val microseconds = column.getChild(1) + val days = column.getChild(1) + val microseconds = column.getChild(2) assert(months.dataType() == IntegerType) + assert(days.dataType() == IntegerType) assert(microseconds.dataType() == LongType) - months.putInt(0, 1) - microseconds.putLong(0, 100) - reference += new CalendarInterval(1, 100) - - 
months.putInt(1, 0) - microseconds.putLong(1, 2000) - reference += new CalendarInterval(0, 2000) - - column.putNull(2) - assert(column.getInterval(2) == null) - reference += null - - months.putInt(3, 20) - microseconds.putLong(3, 0) - reference += new CalendarInterval(20, 0) + Seq(new CalendarInterval(1, 10, 100), + new CalendarInterval(0, 0, 2000), + new CalendarInterval(20, 0, 0), + new CalendarInterval(0, 200, 0)).zipWithIndex.foreach { case (v, i) => + column.putInterval(i, v) + reference += v + } reference.zipWithIndex.foreach { case (v, i) => val errMsg = "VectorType=" + column.getClass.getSimpleName @@ -1067,7 +1061,8 @@ class ColumnarBatchSuite extends SparkFunSuite { } } - private def compareStruct(fields: Seq[StructField], r1: InternalRow, r2: Row, seed: Long) { + private def compareStruct(fields: Seq[StructField], r1: InternalRow, r2: Row, + seed: Long): Unit = { fields.zipWithIndex.foreach { case (field: StructField, ordinal: Int) => assert(r1.isNullAt(ordinal) == r2.isNullAt(ordinal), "Seed = " + seed) if (!r1.isNullAt(ordinal)) { @@ -1159,7 +1154,7 @@ class ColumnarBatchSuite extends SparkFunSuite { * This test generates a random schema data, serializes it to column batches and verifies the * results. */ - def testRandomRows(flatSchema: Boolean, numFields: Int) { + def testRandomRows(flatSchema: Boolean, numFields: Int): Unit = { // TODO: Figure out why StringType doesn't work on jenkins. 
val types = Array( BooleanType, ByteType, FloatType, DoubleType, IntegerType, LongType, ShortType, @@ -1310,7 +1305,7 @@ class ColumnarBatchSuite extends SparkFunSuite { Decimal("1234.23456"), DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("2015-01-01")), DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf("2015-01-01 23:50:59.123")), - new CalendarInterval(1, 0), + new CalendarInterval(1, 0, 0), new GenericArrayData(Array(1, 2, 3, 4, null)), new GenericInternalRow(Array[Any](5.asInstanceOf[Any], 10)), mapBuilder.build() @@ -1331,7 +1326,7 @@ class ColumnarBatchSuite extends SparkFunSuite { Decimal("0.01000"), DateTimeUtils.fromJavaDate(java.sql.Date.valueOf("1875-12-12")), DateTimeUtils.fromJavaTimestamp(java.sql.Timestamp.valueOf("1880-01-05 12:45:21.321")), - new CalendarInterval(-10, -100), + new CalendarInterval(-10, -50, -100), new GenericArrayData(Array(5, 10, -100)), new GenericInternalRow(Array[Any](20.asInstanceOf[Any], null)), mapBuilder.build() @@ -1423,8 +1418,8 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(columns(10).isNullAt(2)) assert(columns(11).dataType() == CalendarIntervalType) - assert(columns(11).getInterval(0) == new CalendarInterval(1, 0)) - assert(columns(11).getInterval(1) == new CalendarInterval(-10, -100)) + assert(columns(11).getInterval(0) == new CalendarInterval(1, 0, 0)) + assert(columns(11).getInterval(1) == new CalendarInterval(-10, -50, -100)) assert(columns(11).isNullAt(2)) assert(columns(12).dataType() == ArrayType(IntegerType)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala index d885348f3774a..46d0c64592a00 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/ExecutorSideSQLConfSuite.scala @@ -17,8 +17,17 @@ package org.apache.spark.sql.internal -import 
org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{AnalysisException, SparkSession} +import java.util.UUID + +import org.scalatest.Assertions._ + +import org.apache.spark.{SparkException, SparkFunSuite, TaskContext} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlan} import org.apache.spark.sql.execution.debug.codegenStringSeq import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils @@ -91,15 +100,86 @@ class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { test("SPARK-22219: refactor to control to generate comment") { Seq(true, false).foreach { flag => - withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { + withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString, + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + // with AQE on, the WholeStageCodegen rule is applied when running QueryStageExec. 
val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) assert(res.length == 2) - assert(res.forall { case (_, code) => + assert(res.forall { case (_, code, _) => (code.contains("* Codegend pipeline") == flag) && (code.contains("// input[") == flag) }) } } } + + test("SPARK-28939: propagate SQLConf also in conversions to RDD") { + val confs = Seq("spark.sql.a" -> "x", "spark.sql.b" -> "y") + val physicalPlan = SQLConfAssertPlan(confs) + val dummyQueryExecution = FakeQueryExecution(spark, physicalPlan) + withSQLConf(confs: _*) { + // Force RDD evaluation to trigger asserts + dummyQueryExecution.toRdd.collect() + } + val dummyQueryExecution1 = FakeQueryExecution(spark, physicalPlan) + // Without setting the configs assertions fail + val e = intercept[SparkException](dummyQueryExecution1.toRdd.collect()) + assert(e.getCause.isInstanceOf[NoSuchElementException]) + } + + test("SPARK-30556 propagate local properties to subquery execution thread") { + withSQLConf(StaticSQLConf.SUBQUERY_MAX_THREAD_THRESHOLD.key -> "1") { + withTempView("l", "m", "n") { + Seq(true).toDF().createOrReplaceTempView("l") + val confKey = "spark.sql.y" + + def createDataframe(confKey: String, confValue: String): Dataset[Boolean] = { + Seq(true) + .toDF() + .mapPartitions { _ => + TaskContext.get.getLocalProperty(confKey) == confValue match { + case true => Iterator(true) + case false => Iterator.empty + } + } + } + + // set local configuration and assert + val confValue1 = UUID.randomUUID().toString() + createDataframe(confKey, confValue1).createOrReplaceTempView("m") + spark.sparkContext.setLocalProperty(confKey, confValue1) + assert(sql("SELECT * FROM l WHERE EXISTS (SELECT * FROM m)").collect().length == 1) + + // change the conf value and assert again + val confValue2 = UUID.randomUUID().toString() + createDataframe(confKey, confValue2).createOrReplaceTempView("n") + spark.sparkContext.setLocalProperty(confKey, confValue2) + assert(sql("SELECT * 
FROM l WHERE EXISTS (SELECT * FROM n)").collect().length == 1) + } + } + } +} + +case class SQLConfAssertPlan(confToCheck: Seq[(String, String)]) extends LeafExecNode { + override protected def doExecute(): RDD[InternalRow] = { + sqlContext + .sparkContext + .parallelize(0 until 2, 2) + .mapPartitions { it => + val confs = SQLConf.get + confToCheck.foreach { case (key, expectedValue) => + assert(confs.getConfString(key) == expectedValue) + } + it.map(i => InternalRow.fromSeq(Seq(i))) + } + } + + override def output: Seq[Attribute] = Seq.empty +} + +case class FakeQueryExecution(spark: SparkSession, physicalPlan: SparkPlan) + extends QueryExecution(spark, LocalRelation()) { + override lazy val sparkPlan: SparkPlan = physicalPlan + override lazy val executedPlan: SparkPlan = physicalPlan } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 1dfbca64f5778..61be3672f3ebe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -17,7 +17,10 @@ package org.apache.spark.sql.internal +import scala.language.reflectiveCalls + import org.apache.hadoop.fs.Path +import org.apache.log4j.Level import org.apache.spark.sql._ import org.apache.spark.sql.internal.StaticSQLConf._ @@ -25,7 +28,6 @@ import org.apache.spark.sql.test.{SharedSparkSession, TestSQLContext} import org.apache.spark.util.Utils class SQLConfSuite extends QueryTest with SharedSparkSession { - import testImplicits._ private val testKey = "test.key.0" private val testVal = "test.val.0" @@ -259,12 +261,6 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { assert(spark.sessionState.conf.parquetOutputTimestampType == SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS) - // PARQUET_INT64_AS_TIMESTAMP_MILLIS should be respected. 
- spark.sessionState.conf.setConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS, true) - assert(spark.sessionState.conf.parquetOutputTimestampType == - SQLConf.ParquetOutputTimestampType.TIMESTAMP_MILLIS) - - // PARQUET_OUTPUT_TIMESTAMP_TYPE has higher priority over PARQUET_INT64_AS_TIMESTAMP_MILLIS spark.sessionState.conf.setConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE, "timestamp_micros") assert(spark.sessionState.conf.parquetOutputTimestampType == SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS) @@ -320,4 +316,36 @@ class SQLConfSuite extends QueryTest with SharedSparkSession { assert(e2.getMessage.contains("spark.sql.shuffle.partitions")) } + test("set removed config to non-default value") { + val config = "spark.sql.fromJsonForceNullableSchema" + val defaultValue = true + + spark.conf.set(config, defaultValue) + + val e = intercept[AnalysisException] { + spark.conf.set(config, !defaultValue) + } + assert(e.getMessage.contains(config)) + } + + test("log deprecation warnings") { + val logAppender = new LogAppender("deprecated SQL configs") + def check(config: String): Unit = { + assert(logAppender.loggingEvents.exists( + e => e.getLevel == Level.WARN && + e.getRenderedMessage.contains(config))) + } + + val config1 = SQLConf.HIVE_VERIFY_PARTITION_PATH.key + withLogAppender(logAppender) { + spark.conf.set(config1, true) + } + check(config1) + + val config2 = SQLConf.ARROW_EXECUTION_ENABLED.key + withLogAppender(logAppender) { + spark.conf.unset(config2) + } + check(config2) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 7fe00aef56e16..9cba95f7d7df2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.sql.catalyst.TableIdentifier import 
org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeTestUtils} -import org.apache.spark.sql.execution.DataSourceScanExec +import org.apache.spark.sql.execution.{DataSourceScanExec, ExtendedMode} import org.apache.spark.sql.execution.command.{ExplainCommand, ShowCreateTableCommand} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JDBCPartition, JDBCRDD, JDBCRelation, JdbcUtils} @@ -51,7 +51,7 @@ class JDBCSuite extends QueryTest val testBytes = Array[Byte](99.toByte, 134.toByte, 135.toByte, 200.toByte, 205.toByte) val testH2Dialect = new JdbcDialect { - override def canHandle(url: String) : Boolean = url.startsWith("jdbc:h2") + override def canHandle(url: String): Boolean = url.startsWith("jdbc:h2") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = Some(StringType) @@ -450,15 +450,6 @@ class JDBCSuite extends QueryTest urlWithUserAndPass, "TEST.PEOPLE", new Properties()).collect().length === 3) } - test("Basic API with illegal fetchsize") { - val properties = new Properties() - properties.setProperty(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "-1") - val e = intercept[IllegalArgumentException] { - spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", properties).collect() - }.getMessage - assert(e.contains("Invalid value `-1` for parameter `fetchsize`")) - } - test("Missing partition columns") { withView("tempPeople") { val e = intercept[IllegalArgumentException] { @@ -743,7 +734,7 @@ class JDBCSuite extends QueryTest } test("compile filters") { - val compileFilter = PrivateMethod[Option[String]]('compileFilter) + val compileFilter = PrivateMethod[Option[String]](Symbol("compileFilter")) def doCompileFilter(f: Filter): String = JDBCRDD invokePrivate compileFilter(f, JdbcDialects.get("jdbc:")) getOrElse("") assert(doCompileFilter(EqualTo("col0", 3)) === 
""""col0" = 3""") @@ -893,17 +884,37 @@ class JDBCSuite extends QueryTest "BIT") assert(msSqlServerDialect.getJDBCType(BinaryType).map(_.databaseTypeDefinition).get == "VARBINARY(MAX)") - assert(msSqlServerDialect.getJDBCType(ShortType).map(_.databaseTypeDefinition).get == - "SMALLINT") + Seq(true, false).foreach { flag => + withSQLConf(SQLConf.LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED.key -> s"$flag") { + if (SQLConf.get.legacyMsSqlServerNumericMappingEnabled) { + assert(msSqlServerDialect.getJDBCType(ShortType).map(_.databaseTypeDefinition).isEmpty) + } else { + assert(msSqlServerDialect.getJDBCType(ShortType).map(_.databaseTypeDefinition).get == + "SMALLINT") + } + } + } } test("SPARK-28152 MsSqlServerDialect catalyst type mapping") { val msSqlServerDialect = JdbcDialects.get("jdbc:sqlserver") val metadata = new MetadataBuilder().putLong("scale", 1) - assert(msSqlServerDialect.getCatalystType(java.sql.Types.SMALLINT, "SMALLINT", 1, - metadata).get == ShortType) - assert(msSqlServerDialect.getCatalystType(java.sql.Types.REAL, "REAL", 1, - metadata).get == FloatType) + + Seq(true, false).foreach { flag => + withSQLConf(SQLConf.LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED.key -> s"$flag") { + if (SQLConf.get.legacyMsSqlServerNumericMappingEnabled) { + assert(msSqlServerDialect.getCatalystType(java.sql.Types.SMALLINT, "SMALLINT", 1, + metadata).isEmpty) + assert(msSqlServerDialect.getCatalystType(java.sql.Types.REAL, "REAL", 1, + metadata).isEmpty) + } else { + assert(msSqlServerDialect.getCatalystType(java.sql.Types.SMALLINT, "SMALLINT", 1, + metadata).get == ShortType) + assert(msSqlServerDialect.getCatalystType(java.sql.Types.REAL, "REAL", 1, + metadata).get == FloatType) + } + } + } } test("table exists query by jdbc dialect") { @@ -983,7 +994,7 @@ class JDBCSuite extends QueryTest test("test credentials in the properties are not in plan output") { val df = sql("SELECT * FROM parts") - val explain = ExplainCommand(df.queryExecution.logical, extended = true) + val 
explain = ExplainCommand(df.queryExecution.logical, ExtendedMode) spark.sessionState.executePlan(explain).executedPlan.executeCollect().foreach { r => assert(!List("testPass", "testUser").exists(r.toString.contains)) } @@ -996,7 +1007,7 @@ class JDBCSuite extends QueryTest test("test credentials in the connection url are not in the plan output") { val df = spark.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", new Properties()) - val explain = ExplainCommand(df.queryExecution.logical, extended = true) + val explain = ExplainCommand(df.queryExecution.logical, ExtendedMode) spark.sessionState.executePlan(explain).executedPlan.executeCollect().foreach { r => assert(!List("testPass", "testUser").exists(r.toString.contains)) } @@ -1018,7 +1029,7 @@ class JDBCSuite extends QueryTest | password '$password') """.stripMargin) - val explain = ExplainCommand(df.queryExecution.logical, extended = true) + val explain = ExplainCommand(df.queryExecution.logical, ExtendedMode) spark.sessionState.executePlan(explain).executedPlan.executeCollect().foreach { r => assert(!r.toString.contains(password)) } @@ -1658,4 +1669,34 @@ class JDBCSuite extends QueryTest } } } + + test("Add exception when isolationLevel is Illegal") { + val e = intercept[IllegalArgumentException] { + spark.read.format("jdbc") + .option("Url", urlWithUserAndPass) + .option("dbTable", "test.people") + .option("isolationLevel", "test") + .load() + }.getMessage + assert(e.contains( + "Invalid value `test` for parameter `isolationLevel`. 
This can be " + + "`NONE`, `READ_UNCOMMITTED`, `READ_COMMITTED`, `REPEATABLE_READ` or `SERIALIZABLE`.")) + } + + test("SPARK-28552: Case-insensitive database URLs in JdbcDialect") { + assert(JdbcDialects.get("jdbc:mysql://localhost/db") === MySQLDialect) + assert(JdbcDialects.get("jdbc:MySQL://localhost/db") === MySQLDialect) + assert(JdbcDialects.get("jdbc:postgresql://localhost/db") === PostgresDialect) + assert(JdbcDialects.get("jdbc:postGresql://localhost/db") === PostgresDialect) + assert(JdbcDialects.get("jdbc:db2://localhost/db") === DB2Dialect) + assert(JdbcDialects.get("jdbc:DB2://localhost/db") === DB2Dialect) + assert(JdbcDialects.get("jdbc:sqlserver://localhost/db") === MsSqlServerDialect) + assert(JdbcDialects.get("jdbc:sqlServer://localhost/db") === MsSqlServerDialect) + assert(JdbcDialects.get("jdbc:derby://localhost/db") === DerbyDialect) + assert(JdbcDialects.get("jdbc:derBy://localhost/db") === DerbyDialect) + assert(JdbcDialects.get("jdbc:oracle://localhost/db") === OracleDialect) + assert(JdbcDialects.get("jdbc:Oracle://localhost/db") === OracleDialect) + assert(JdbcDialects.get("jdbc:teradata://localhost/db") === TeradataDialect) + assert(JdbcDialects.get("jdbc:Teradata://localhost/db") === TeradataDialect) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala index b28c6531d42b2..8021ef1a17a18 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCWriteSuite.scala @@ -21,10 +21,12 @@ import java.sql.DriverManager import java.util.Properties import scala.collection.JavaConverters.propertiesAsScalaMapConverter +import scala.collection.mutable.ArrayBuffer import org.scalatest.BeforeAndAfter import org.apache.spark.SparkException +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.{AnalysisException, 
DataFrame, Row, SaveMode} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} @@ -543,4 +545,57 @@ class JDBCWriteSuite extends SharedSparkSession with BeforeAndAfter { }.getMessage assert(errMsg.contains("Statement was canceled or the session timed out")) } + + test("metrics") { + val df = spark.createDataFrame(sparkContext.parallelize(arr2x2), schema2) + val df2 = spark.createDataFrame(sparkContext.parallelize(arr1x2), schema2) + + runAndVerifyRecordsWritten(2) { + df.write.mode(SaveMode.Append).jdbc(url, "TEST.BASICCREATETEST", new Properties()) + } + + runAndVerifyRecordsWritten(1) { + df2.write.mode(SaveMode.Overwrite).jdbc(url, "TEST.BASICCREATETEST", new Properties()) + } + + runAndVerifyRecordsWritten(1) { + df2.write.mode(SaveMode.Overwrite).option("truncate", true) + .jdbc(url, "TEST.BASICCREATETEST", new Properties()) + } + + runAndVerifyRecordsWritten(0) { + intercept[AnalysisException] { + df2.write.mode(SaveMode.ErrorIfExists).jdbc(url, "TEST.BASICCREATETEST", new Properties()) + } + } + + runAndVerifyRecordsWritten(0) { + df.write.mode(SaveMode.Ignore).jdbc(url, "TEST.BASICCREATETEST", new Properties()) + } + } + + private def runAndVerifyRecordsWritten(expected: Long)(job: => Unit): Unit = { + assert(expected === runAndReturnMetrics(job, _.taskMetrics.outputMetrics.recordsWritten)) + } + + private def runAndReturnMetrics(job: => Unit, collector: (SparkListenerTaskEnd) => Long): Long = { + val taskMetrics = new ArrayBuffer[Long]() + + // Avoid receiving earlier taskEnd events + sparkContext.listenerBus.waitUntilEmpty() + + val listener = new SparkListener() { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + taskMetrics += collector(taskEnd) + } + } + sparkContext.addSparkListener(listener) + + job + + sparkContext.listenerBus.waitUntilEmpty() + + sparkContext.removeSparkListener(listener) + taskMetrics.sum + } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 7043b6d396977..c7266c886128c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning import org.apache.spark.sql.execution.{DataSourceScanExec, SortExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import org.apache.spark.sql.execution.datasources.BucketingUtils import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.SortMergeJoinExec @@ -382,8 +383,16 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { joined.sort("bucketed_table1.k", "bucketed_table2.k"), df1.join(df2, joinCondition(df1, df2), joinType).sort("df1.k", "df2.k")) - assert(joined.queryExecution.executedPlan.isInstanceOf[SortMergeJoinExec]) - val joinOperator = joined.queryExecution.executedPlan.asInstanceOf[SortMergeJoinExec] + val joinOperator = if (joined.sqlContext.conf.adaptiveExecutionEnabled) { + val executedPlan = + joined.queryExecution.executedPlan.asInstanceOf[AdaptiveSparkPlanExec].executedPlan + assert(executedPlan.isInstanceOf[SortMergeJoinExec]) + executedPlan.asInstanceOf[SortMergeJoinExec] + } else { + val executedPlan = joined.queryExecution.executedPlan + assert(executedPlan.isInstanceOf[SortMergeJoinExec]) + executedPlan.asInstanceOf[SortMergeJoinExec] + } // check existence of shuffle assert( @@ -595,6 +604,20 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { } } + test("bucket join should work with SubqueryAlias plan") { + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") { + 
withTable("t") { + withView("v") { + spark.range(20).selectExpr("id as i").write.bucketBy(8, "i").saveAsTable("t") + sql("CREATE VIEW v AS SELECT * FROM t").collect() + + val plan = sql("SELECT * FROM t a JOIN v b ON a.i = b.i").queryExecution.executedPlan + assert(plan.collect { case exchange: ShuffleExchangeExec => exchange }.isEmpty) + } + } + } + } + test("avoid shuffle when grouping keys are a super-set of bucket keys") { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") @@ -795,4 +818,22 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { } } + test("SPARK-29655 Read bucketed tables obeys spark.sql.shuffle.partitions") { + withSQLConf( + SQLConf.SHUFFLE_PARTITIONS.key -> "5", + SQLConf.SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS.key -> "7") { + val bucketSpec = Some(BucketSpec(6, Seq("i", "j"), Nil)) + Seq(false, true).foreach { enableAdaptive => + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> s"$enableAdaptive") { + val bucketedTableTestSpecLeft = BucketedTableTestSpec(bucketSpec, expectedShuffle = false) + val bucketedTableTestSpecRight = BucketedTableTestSpec(None, expectedShuffle = true) + testBucketing( + bucketedTableTestSpecLeft = bucketedTableTestSpecLeft, + bucketedTableTestSpecRight = bucketedTableTestSpecRight, + joinCondition = joinCondition(Seq("i", "j")) + ) + } + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala index 08f0865c1e128..983209051c8ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala @@ -183,7 +183,7 @@ class CreateTableAsSelectSuite extends DataSourceTest with SharedSparkSession { }.getMessage assert(error.contains("Operation not allowed") && - error.contains("CREATE EXTERNAL 
TABLE ... USING")) + error.contains("CREATE EXTERNAL TABLE ...")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala index 1ece98aa7eb3a..7c10f9950f8eb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DataSourceTest.scala @@ -26,7 +26,8 @@ import org.apache.spark.unsafe.types.UTF8String private[sql] abstract class DataSourceTest extends QueryTest { - protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], enableRegex: Boolean = false) { + protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], + enableRegex: Boolean = false): Unit = { test(sqlString) { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> enableRegex.toString) { checkAnswer(spark.sql(sqlString), expectedAnswer) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ExternalCommandRunnerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ExternalCommandRunnerSuite.scala new file mode 100644 index 0000000000000..55fb3eb8ade35 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ExternalCommandRunnerSuite.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.connector.ExternalCommandRunner +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +class ExternalCommandRunnerSuite extends QueryTest with SharedSparkSession { + test("execute command") { + try { + System.setProperty("command", "hello") + assert(System.getProperty("command") === "hello") + + val options = Map("one" -> "1", "two" -> "2") + val df = spark.executeCommand(classOf[FakeCommandRunner].getName, "world", options) + // executeCommand should execute the command eagerly + assert(System.getProperty("command") === "world") + checkAnswer(df, Seq(Row("one"), Row("two"))) + } finally { + System.clearProperty("command") + } + } +} + +class FakeCommandRunner extends ExternalCommandRunner { + + override def executeCommand(command: String, options: CaseInsensitiveStringMap): Array[String] = { + System.setProperty("command", command) + options.keySet().iterator().asScala.toSeq.sorted.toArray + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 0d236a43ece6b..bcff30a51c3f5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.sources import java.io.File import java.sql.Date +import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataOutputStream, Path, RawLocalFileSystem} + import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier @@ -268,6 +270,55 @@ class InsertSuite extends DataSourceTest with 
SharedSparkSession { "INSERT OVERWRITE to a table while querying it should not be allowed.") } + test("SPARK-30112: it is allowed to write to a table while querying it for " + + "dynamic partition overwrite.") { + Seq(PartitionOverwriteMode.DYNAMIC.toString, + PartitionOverwriteMode.STATIC.toString).foreach { mode => + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> mode) { + withTable("insertTable") { + sql( + """ + |CREATE TABLE insertTable(i int, part1 int, part2 int) USING PARQUET + |PARTITIONED BY (part1, part2) + """.stripMargin) + + sql("INSERT INTO TABLE insertTable PARTITION(part1=1, part2=1) SELECT 1") + checkAnswer(spark.table("insertTable"), Row(1, 1, 1)) + sql("INSERT OVERWRITE TABLE insertTable PARTITION(part1=1, part2=2) SELECT 2") + checkAnswer(spark.table("insertTable"), Row(1, 1, 1) :: Row(2, 1, 2) :: Nil) + + if (mode == PartitionOverwriteMode.DYNAMIC.toString) { + sql( + """ + |INSERT OVERWRITE TABLE insertTable PARTITION(part1=1, part2) + |SELECT i + 1, part2 FROM insertTable + """.stripMargin) + checkAnswer(spark.table("insertTable"), Row(2, 1, 1) :: Row(3, 1, 2) :: Nil) + + sql( + """ + |INSERT OVERWRITE TABLE insertTable PARTITION(part1=1, part2) + |SELECT i + 1, part2 + 1 FROM insertTable + """.stripMargin) + checkAnswer(spark.table("insertTable"), + Row(2, 1, 1) :: Row(3, 1, 2) :: Row(4, 1, 3) :: Nil) + } else { + val message = intercept[AnalysisException] { + sql( + """ + |INSERT OVERWRITE TABLE insertTable PARTITION(part1=1, part2) + |SELECT i + 1, part2 FROM insertTable + """.stripMargin) + }.getMessage + assert( + message.contains("Cannot overwrite a path that is also being read from."), + "INSERT OVERWRITE to a table while querying it should not be allowed.") + } + } + } + } + } + test("Caching") { // write something to the jsonTable sql( @@ -470,6 +521,20 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } + test("new partitions should be added to catalog after writing to catalog table") { + val table = 
"partitioned_catalog_table" + val numParts = 210 + withTable(table) { + val df = (1 to numParts).map(i => (i, i)).toDF("part", "col1") + val tempTable = "partitioned_catalog_temp_table" + df.createOrReplaceTempView(tempTable) + sql(s"CREATE TABLE $table (part Int, col1 Int) USING parquet PARTITIONED BY (part)") + sql(s"INSERT INTO TABLE $table SELECT * from $tempTable") + val partitions = spark.sessionState.catalog.listPartitionNames(TableIdentifier(table)) + assert(partitions.size == numParts) + } + } + test("SPARK-20236: dynamic partition overwrite without catalog table") { withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { withTempPath { path => @@ -634,6 +699,60 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } + test("Throw exceptions on inserting out-of-range int value with ANSI casting policy") { + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> SQLConf.StoreAssignmentPolicy.ANSI.toString) { + withTable("t") { + sql("create table t(b int) using parquet") + val outOfRangeValue1 = (Int.MaxValue + 1L).toString + var msg = intercept[SparkException] { + sql(s"insert into t values($outOfRangeValue1)") + }.getCause.getMessage + assert(msg.contains(s"Casting $outOfRangeValue1 to int causes overflow")) + + val outOfRangeValue2 = (Int.MinValue - 1L).toString + msg = intercept[SparkException] { + sql(s"insert into t values($outOfRangeValue2)") + }.getCause.getMessage + assert(msg.contains(s"Casting $outOfRangeValue2 to int causes overflow")) + } + } + } + + test("Throw exceptions on inserting out-of-range long value with ANSI casting policy") { + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> SQLConf.StoreAssignmentPolicy.ANSI.toString) { + withTable("t") { + sql("create table t(b long) using parquet") + val outOfRangeValue1 = Math.nextUp(Long.MaxValue) + var msg = intercept[SparkException] { + sql(s"insert into t values(${outOfRangeValue1}D)") + }.getCause.getMessage + 
assert(msg.contains(s"Casting $outOfRangeValue1 to long causes overflow")) + + val outOfRangeValue2 = Math.nextDown(Long.MinValue) + msg = intercept[SparkException] { + sql(s"insert into t values(${outOfRangeValue2}D)") + }.getCause.getMessage + assert(msg.contains(s"Casting $outOfRangeValue2 to long causes overflow")) + } + } + } + + test("Throw exceptions on inserting out-of-range decimal value with ANSI casting policy") { + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> SQLConf.StoreAssignmentPolicy.ANSI.toString) { + withTable("t") { + sql("create table t(b decimal(3,2)) using parquet") + val outOfRangeValue = "123.45" + val msg = intercept[SparkException] { + sql(s"insert into t values(${outOfRangeValue})") + }.getCause.getMessage + assert(msg.contains("cannot be represented as Decimal(3, 2)")) + } + } + } + test("SPARK-24860: dynamic partition overwrite specified per source without catalog table") { withTempPath { path => Seq((1, 1), (2, 2)).toDF("i", "part") @@ -675,7 +794,41 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { spark.sessionState.catalog.createTable(newTable, false) sql("INSERT INTO TABLE test_table SELECT 1, 'a'") - sql("INSERT INTO TABLE test_table SELECT 2, null") + val msg = intercept[AnalysisException] { + sql("INSERT INTO TABLE test_table SELECT 2, null") + }.getMessage + assert(msg.contains("Cannot write nullable values to non-null column 's'")) } } + + test("Stop task set if FileAlreadyExistsException was thrown") { + withSQLConf("fs.file.impl" -> classOf[FileExistingTestFileSystem].getName, + "fs.file.impl.disable.cache" -> "true") { + withTable("t") { + sql( + """ + |CREATE TABLE t(i INT, part1 INT) USING PARQUET + |PARTITIONED BY (part1) + """.stripMargin) + + val df = Seq((1, 1)).toDF("i", "part1") + val err = intercept[SparkException] { + df.write.mode("overwrite").format("parquet").insertInto("t") + } + assert(err.getCause.getMessage.contains("can not write to output file: " + + 
"org.apache.hadoop.fs.FileAlreadyExistsException")) + } + } + } +} + +class FileExistingTestFileSystem extends RawLocalFileSystem { + override def create( + f: Path, + overwrite: Boolean, + bufferSize: Int, + replication: Short, + blockSize: Long): FSDataOutputStream = { + throw new FileAlreadyExistsException(s"${f.toString} already exists") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala index 87dce376a09dd..9b26a5659df49 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PathOptionSuite.scala @@ -123,7 +123,8 @@ class PathOptionSuite extends DataSourceTest with SharedSparkSession { |USING ${classOf[TestOptionsSource].getCanonicalName} |OPTIONS (PATH '/tmp/path')""".stripMargin) sql("ALTER TABLE src SET LOCATION '/tmp/path2'") - assert(getPathOption("src").map(makeQualifiedPath) == Some(makeQualifiedPath("/tmp/path2"))) + assert(getPathOption("src") == + Some(CatalogUtils.URIToString(makeQualifiedPath("/tmp/path2")))) } withTable("src", "src2") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala index d99c605b2e478..f242f75f39f20 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/PrunedScanSuite.scala @@ -115,6 +115,10 @@ class PrunedScanSuite extends DataSourceTest with SharedSparkSession { testPruning("SELECT b, b FROM oneToTenPruned", "b") testPruning("SELECT a FROM oneToTenPruned", "a") testPruning("SELECT b FROM oneToTenPruned", "b") + testPruning("SELECT a, rand() FROM oneToTenPruned WHERE a > 5", "a") + testPruning("SELECT a FROM oneToTenPruned WHERE rand() > 0.5", "a") + testPruning("SELECT a, rand() FROM oneToTenPruned WHERE rand() > 
0.5", "a") + testPruning("SELECT a, rand() FROM oneToTenPruned WHERE b > 5", "a", "b") def testPruning(sqlString: String, expectedColumns: String*): Unit = { test(s"Columns output ${expectedColumns.mkString(",")}: $sqlString") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala index d4e117953942e..9a95bf770772e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/TableScanSuite.scala @@ -358,7 +358,7 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { // Make sure we do throw correct exception when users use a relation provider that // only implements the RelationProvider or the SchemaRelationProvider. Seq("TEMPORARY VIEW", "TABLE").foreach { tableType => - val schemaNotAllowed = intercept[Exception] { + val schemaNotMatch = intercept[Exception] { sql( s""" |CREATE $tableType relationProvierWithSchema (i int) @@ -369,7 +369,8 @@ class TableScanSuite extends DataSourceTest with SharedSparkSession { |) """.stripMargin) } - assert(schemaNotAllowed.getMessage.contains("does not allow user-specified schemas")) + assert(schemaNotMatch.getMessage.contains( + "The user-specified schema doesn't match the actual schema")) val schemaNeeded = intercept[Exception] { sql( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSuite.scala deleted file mode 100644 index b6e7bc5d1a4dc..0000000000000 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2SQLSuite.scala +++ /dev/null @@ -1,962 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.sources.v2 - -import scala.collection.JavaConverters._ - -import org.apache.spark.SparkException -import org.apache.spark.sql._ -import org.apache.spark.sql.catalog.v2.{CatalogPlugin, Identifier, TableCatalog} -import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableException, NoSuchDatabaseException, NoSuchTableException, TableAlreadyExistsException} -import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog, StagingInMemoryTableCatalog} -import org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog -import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG -import org.apache.spark.sql.sources.v2.internal.V1Table -import org.apache.spark.sql.types.{ArrayType, BooleanType, DoubleType, IntegerType, LongType, MapType, StringType, StructField, StructType, TimestampType} -import org.apache.spark.sql.util.CaseInsensitiveStringMap - -class DataSourceV2SQLSuite - extends InsertIntoTests(supportsDynamicOverwrite = true, includeSQLOnlyTests = true) - with AlterTableTests { - - import org.apache.spark.sql.catalog.v2.CatalogV2Implicits._ - - private val v2Source = classOf[FakeV2Provider].getName - override protected val v2Format = v2Source - override protected val catalogAndNamespace = "testcat.ns1.ns2." 
- - private def catalog(name: String): CatalogPlugin = { - spark.sessionState.catalogManager.catalog(name) - } - - protected def doInsert(tableName: String, insert: DataFrame, mode: SaveMode): Unit = { - val tmpView = "tmp_view" - withTempView(tmpView) { - insert.createOrReplaceTempView(tmpView) - val overwrite = if (mode == SaveMode.Overwrite) "OVERWRITE" else "INTO" - sql(s"INSERT $overwrite TABLE $tableName SELECT * FROM $tmpView") - } - } - - override def verifyTable(tableName: String, expected: DataFrame): Unit = { - checkAnswer(spark.table(tableName), expected) - } - - override def getTableMetadata(tableName: String): Table = { - val nameParts = spark.sessionState.sqlParser.parseMultipartIdentifier(tableName) - val v2Catalog = catalog(nameParts.head).asTableCatalog - val namespace = nameParts.drop(1).init.toArray - v2Catalog.loadTable(Identifier.of(namespace, nameParts.last)) - } - - before { - spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) - spark.conf.set( - "spark.sql.catalog.testcat_atomic", classOf[StagingInMemoryTableCatalog].getName) - spark.conf.set("spark.sql.catalog.testcat2", classOf[InMemoryTableCatalog].getName) - spark.conf.set(V2_SESSION_CATALOG.key, classOf[InMemoryTableSessionCatalog].getName) - - val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") - df.createOrReplaceTempView("source") - val df2 = spark.createDataFrame(Seq((4L, "d"), (5L, "e"), (6L, "f"))).toDF("id", "data") - df2.createOrReplaceTempView("source2") - } - - after { - spark.sessionState.catalog.reset() - spark.sessionState.catalogManager.reset() - spark.sessionState.conf.clear() - } - - test("CreateTable: use v2 plan because catalog is set") { - spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "testcat.table_name") - 
assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) - } - - test("DescribeTable using v2 catalog") { - spark.sql("CREATE TABLE testcat.table_name (id bigint, data string)" + - " USING foo" + - " PARTITIONED BY (id)") - val descriptionDf = spark.sql("DESCRIBE TABLE testcat.table_name") - assert(descriptionDf.schema.map(field => (field.name, field.dataType)) === - Seq( - ("col_name", StringType), - ("data_type", StringType), - ("comment", StringType))) - val description = descriptionDf.collect() - assert(description === Seq( - Row("id", "bigint", ""), - Row("data", "string", ""))) - } - - test("DescribeTable with v2 catalog when table does not exist.") { - intercept[AnalysisException] { - spark.sql("DESCRIBE TABLE testcat.table_name") - } - } - - test("DescribeTable extended using v2 catalog") { - spark.sql("CREATE TABLE testcat.table_name (id bigint, data string)" + - " USING foo" + - " PARTITIONED BY (id)" + - " TBLPROPERTIES ('bar'='baz')") - val descriptionDf = spark.sql("DESCRIBE TABLE EXTENDED testcat.table_name") - assert(descriptionDf.schema.map(field => (field.name, field.dataType)) - === Seq( - ("col_name", StringType), - ("data_type", StringType), - ("comment", StringType))) - assert(descriptionDf.collect() - .map(_.toSeq) - .map(_.toArray.map(_.toString.trim)) === Array( - Array("id", "bigint", ""), - Array("data", "string", ""), - Array("", "", ""), - Array("Partitioning", "", ""), - Array("--------------", "", ""), - Array("Part 0", "id", ""), - Array("", "", ""), - Array("Table Property", "Value", ""), - Array("----------------", "-------", ""), - Array("bar", "baz", ""), - Array("provider", "foo", ""))) - - } - - test("CreateTable: use v2 plan and session catalog 
when provider is v2") { - spark.sql(s"CREATE TABLE table_name (id bigint, data string) USING $v2Source") - - val testCatalog = catalog("session").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "default.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> v2Source).asJava) - assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) - } - - test("CreateTable: fail if table exists") { - spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") - - val testCatalog = catalog("testcat").asTableCatalog - - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) - - // run a second create query that should fail - val exc = intercept[TableAlreadyExistsException] { - spark.sql("CREATE TABLE testcat.table_name (id bigint, data string, id2 bigint) USING bar") - } - - assert(exc.getMessage.contains("table_name")) - - // table should not have changed - val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table2.name == "testcat.table_name") - assert(table2.partitioning.isEmpty) - assert(table2.properties == Map("provider" -> "foo").asJava) - assert(table2.schema == new StructType().add("id", LongType).add("data", StringType)) - - // check that the table is still empty - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) - } - - test("CreateTable: if not exists") { - spark.sql( - "CREATE 
TABLE IF NOT EXISTS testcat.table_name (id bigint, data string) USING foo") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) - - spark.sql("CREATE TABLE IF NOT EXISTS testcat.table_name (id bigint, data string) USING bar") - - // table should not have changed - val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table2.name == "testcat.table_name") - assert(table2.partitioning.isEmpty) - assert(table2.properties == Map("provider" -> "foo").asJava) - assert(table2.schema == new StructType().add("id", LongType).add("data", StringType)) - - // check that the table is still empty - val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), Seq.empty) - } - - test("CreateTable: use default catalog for v2 sources when default catalog is set") { - spark.conf.set("spark.sql.default.catalog", "testcat") - spark.sql(s"CREATE TABLE table_name (id bigint, data string) USING foo") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType().add("id", LongType).add("data", StringType)) - - // check that the table is empty - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Seq.empty) - } - - test("CreateTableAsSelect: use v2 plan because catalog is set") { - val basicCatalog = catalog("testcat").asTableCatalog 
- val atomicCatalog = catalog("testcat_atomic").asTableCatalog - val basicIdentifier = "testcat.table_name" - val atomicIdentifier = "testcat_atomic.table_name" - - Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { - case (catalog, identifier) => - spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") - - val table = catalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == identifier) - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) - } - } - - test("ReplaceTableAsSelect: basic v2 implementation.") { - val basicCatalog = catalog("testcat").asTableCatalog - val atomicCatalog = catalog("testcat_atomic").asTableCatalog - val basicIdentifier = "testcat.table_name" - val atomicIdentifier = "testcat_atomic.table_name" - - Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { - case (catalog, identifier) => - spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT id, data FROM source") - val originalTable = catalog.loadTable(Identifier.of(Array(), "table_name")) - - spark.sql(s"REPLACE TABLE $identifier USING foo AS SELECT id FROM source") - val replacedTable = catalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(replacedTable != originalTable, "Table should have been replaced.") - assert(replacedTable.name == identifier) - assert(replacedTable.partitioning.isEmpty) - assert(replacedTable.properties == Map("provider" -> "foo").asJava) - assert(replacedTable.schema == new StructType().add("id", LongType)) - - val rdd = spark.sparkContext.parallelize(replacedTable.asInstanceOf[InMemoryTable].rows) - checkAnswer( - 
spark.internalCreateDataFrame(rdd, replacedTable.schema), - spark.table("source").select("id")) - } - } - - test("ReplaceTableAsSelect: Non-atomic catalog drops the table if the write fails.") { - spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM source") - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) - - intercept[Exception] { - spark.sql("REPLACE TABLE testcat.table_name" + - s" USING foo OPTIONS (`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}`=true)" + - s" AS SELECT id FROM source") - } - - assert(!testCatalog.tableExists(Identifier.of(Array(), "table_name")), - "Table should have been dropped as a result of the replace.") - } - - test("ReplaceTableAsSelect: Non-atomic catalog drops the table permanently if the" + - " subsequent table creation fails.") { - spark.sql("CREATE TABLE testcat.table_name USING foo AS SELECT id, data FROM source") - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) - - intercept[Exception] { - spark.sql("REPLACE TABLE testcat.table_name" + - s" USING foo" + - s" TBLPROPERTIES (`${InMemoryTableCatalog.SIMULATE_FAILED_CREATE_PROPERTY}`=true)" + - s" AS SELECT id FROM source") - } - - assert(!testCatalog.tableExists(Identifier.of(Array(), "table_name")), - "Table should have been dropped and failed to be created.") - } - - test("ReplaceTableAsSelect: Atomic catalog does not drop the table when replace fails.") { - spark.sql("CREATE TABLE testcat_atomic.table_name USING foo AS SELECT id, data FROM source") - val testCatalog = catalog("testcat_atomic").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - intercept[Exception] { - spark.sql("REPLACE TABLE testcat_atomic.table_name" + - s" USING foo OPTIONS 
(`${InMemoryTable.SIMULATE_FAILED_WRITE_OPTION}=true)" + - s" AS SELECT id FROM source") - } - - var maybeReplacedTable = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(maybeReplacedTable === table, "Table should not have changed.") - - intercept[Exception] { - spark.sql("REPLACE TABLE testcat_atomic.table_name" + - s" USING foo" + - s" TBLPROPERTIES (`${InMemoryTableCatalog.SIMULATE_FAILED_CREATE_PROPERTY}`=true)" + - s" AS SELECT id FROM source") - } - - maybeReplacedTable = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(maybeReplacedTable === table, "Table should not have changed.") - } - - test("ReplaceTable: Erases the table contents and changes the metadata.") { - spark.sql(s"CREATE TABLE testcat.table_name USING $v2Source AS SELECT id, data FROM source") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table.asInstanceOf[InMemoryTable].rows.nonEmpty) - - spark.sql("REPLACE TABLE testcat.table_name (id bigint) USING foo") - val replaced = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(replaced.asInstanceOf[InMemoryTable].rows.isEmpty, - "Replaced table should have no rows after committing.") - assert(replaced.schema().fields.length === 1, - "Replaced table should have new schema.") - assert(replaced.schema().fields(0).name === "id", - "Replaced table should have new schema.") - } - - test("ReplaceTableAsSelect: CREATE OR REPLACE new table has same behavior as CTAS.") { - Seq("testcat", "testcat_atomic").foreach { catalogName => - spark.sql( - s""" - |CREATE TABLE $catalogName.created USING $v2Source - |AS SELECT id, data FROM source - """.stripMargin) - spark.sql( - s""" - |CREATE OR REPLACE TABLE $catalogName.replaced USING $v2Source - |AS SELECT id, data FROM source - """.stripMargin) - - val testCatalog = catalog(catalogName).asTableCatalog - val createdTable = 
testCatalog.loadTable(Identifier.of(Array(), "created")) - val replacedTable = testCatalog.loadTable(Identifier.of(Array(), "replaced")) - - assert(createdTable.asInstanceOf[InMemoryTable].rows === - replacedTable.asInstanceOf[InMemoryTable].rows) - assert(createdTable.schema === replacedTable.schema) - } - } - - test("ReplaceTableAsSelect: REPLACE TABLE throws exception if table does not exist.") { - Seq("testcat", "testcat_atomic").foreach { catalog => - spark.sql(s"CREATE TABLE $catalog.created USING $v2Source AS SELECT id, data FROM source") - intercept[CannotReplaceMissingTableException] { - spark.sql(s"REPLACE TABLE $catalog.replaced USING $v2Source AS SELECT id, data FROM source") - } - } - } - - test("ReplaceTableAsSelect: REPLACE TABLE throws exception if table is dropped before commit.") { - import InMemoryTableCatalog._ - spark.sql(s"CREATE TABLE testcat_atomic.created USING $v2Source AS SELECT id, data FROM source") - intercept[CannotReplaceMissingTableException] { - spark.sql(s"REPLACE TABLE testcat_atomic.replaced" + - s" USING $v2Source" + - s" TBLPROPERTIES (`$SIMULATE_DROP_BEFORE_REPLACE_PROPERTY`=true)" + - s" AS SELECT id, data FROM source") - } - } - - test("CreateTableAsSelect: use v2 plan and session catalog when provider is v2") { - spark.sql(s"CREATE TABLE table_name USING $v2Source AS SELECT id, data FROM source") - - val testCatalog = catalog("session").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "default.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> v2Source).asJava) - assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) - } - - test("CreateTableAsSelect: fail if table exists") { - spark.sql("CREATE TABLE 
testcat.table_name USING foo AS SELECT id, data FROM source") - - val testCatalog = catalog("testcat").asTableCatalog - - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) - - // run a second CTAS query that should fail - val exc = intercept[TableAlreadyExistsException] { - spark.sql( - "CREATE TABLE testcat.table_name USING bar AS SELECT id, data, id as id2 FROM source2") - } - - assert(exc.getMessage.contains("table_name")) - - // table should not have changed - val table2 = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - assert(table2.name == "testcat.table_name") - assert(table2.partitioning.isEmpty) - assert(table2.properties == Map("provider" -> "foo").asJava) - assert(table2.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), spark.table("source")) - } - - test("CreateTableAsSelect: if not exists") { - spark.sql( - "CREATE TABLE IF NOT EXISTS testcat.table_name USING foo AS SELECT id, data FROM source") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) 
- checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) - - spark.sql( - "CREATE TABLE IF NOT EXISTS testcat.table_name USING foo AS SELECT id, data FROM source2") - - // check that the table contains data from just the first CTAS - val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), spark.table("source")) - } - - test("CreateTableAsSelect: use default catalog for v2 sources when default catalog is set") { - spark.conf.set("spark.sql.default.catalog", "testcat") - - val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") - df.createOrReplaceTempView("source") - - // setting the default catalog breaks the reference to source because the default catalog is - // used and AsTableIdentifier no longer matches - spark.sql(s"CREATE TABLE table_name USING foo AS SELECT id, data FROM source") - - val testCatalog = catalog("testcat").asTableCatalog - val table = testCatalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == "testcat.table_name") - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType() - .add("id", LongType) - .add("data", StringType)) - - val rdd = sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), spark.table("source")) - } - - test("CreateTableAsSelect: v2 session catalog can load v1 source table") { - spark.conf.set(V2_SESSION_CATALOG.key, classOf[V2SessionCatalog].getName) - - val df = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "data") - df.createOrReplaceTempView("source") - - sql(s"CREATE TABLE table_name USING parquet AS SELECT id, data FROM source") - - checkAnswer(sql(s"TABLE default.table_name"), spark.table("source")) - // The fact that the following line doesn't throw an exception 
means, the session catalog - // can load the table. - val t = catalog("session").asTableCatalog - .loadTable(Identifier.of(Array.empty, "table_name")) - assert(t.isInstanceOf[V1Table], "V1 table wasn't returned as an unresolved table") - } - - test("CreateTableAsSelect: nullable schema") { - val basicCatalog = catalog("testcat").asTableCatalog - val atomicCatalog = catalog("testcat_atomic").asTableCatalog - val basicIdentifier = "testcat.table_name" - val atomicIdentifier = "testcat_atomic.table_name" - - Seq((basicCatalog, basicIdentifier), (atomicCatalog, atomicIdentifier)).foreach { - case (catalog, identifier) => - spark.sql(s"CREATE TABLE $identifier USING foo AS SELECT 1 i") - - val table = catalog.loadTable(Identifier.of(Array(), "table_name")) - - assert(table.name == identifier) - assert(table.partitioning.isEmpty) - assert(table.properties == Map("provider" -> "foo").asJava) - assert(table.schema == new StructType().add("i", "int")) - - val rdd = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd, table.schema), Row(1)) - - sql(s"INSERT INTO $identifier SELECT CAST(null AS INT)") - val rdd2 = spark.sparkContext.parallelize(table.asInstanceOf[InMemoryTable].rows) - checkAnswer(spark.internalCreateDataFrame(rdd2, table.schema), Seq(Row(1), Row(null))) - } - } - - test("DropTable: basic") { - val tableName = "testcat.ns1.ns2.tbl" - val ident = Identifier.of(Array("ns1", "ns2"), "tbl") - sql(s"CREATE TABLE $tableName USING foo AS SELECT id, data FROM source") - assert(catalog("testcat").asTableCatalog.tableExists(ident) === true) - sql(s"DROP TABLE $tableName") - assert(catalog("testcat").asTableCatalog.tableExists(ident) === false) - } - - test("DropTable: if exists") { - intercept[NoSuchTableException] { - sql(s"DROP TABLE testcat.db.notbl") - } - sql(s"DROP TABLE IF EXISTS testcat.db.notbl") - } - - test("Relation: basic") { - val t1 = "testcat.ns1.ns2.tbl" - withTable(t1) { - sql(s"CREATE 
TABLE $t1 USING foo AS SELECT id, data FROM source") - checkAnswer(sql(s"TABLE $t1"), spark.table("source")) - checkAnswer(sql(s"SELECT * FROM $t1"), spark.table("source")) - } - } - - test("Relation: SparkSession.table()") { - val t1 = "testcat.ns1.ns2.tbl" - withTable(t1) { - sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") - checkAnswer(spark.table(s"$t1"), spark.table("source")) - } - } - - test("Relation: CTE") { - val t1 = "testcat.ns1.ns2.tbl" - withTable(t1) { - sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") - checkAnswer( - sql(s""" - |WITH cte AS (SELECT * FROM $t1) - |SELECT * FROM cte - """.stripMargin), - spark.table("source")) - } - } - - test("Relation: view text") { - val t1 = "testcat.ns1.ns2.tbl" - withTable(t1) { - withView("view1") { v1: String => - sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") - sql(s"CREATE VIEW $v1 AS SELECT * from $t1") - checkAnswer(sql(s"TABLE $v1"), spark.table("source")) - } - } - } - - test("Relation: join tables in 2 catalogs") { - val t1 = "testcat.ns1.ns2.tbl" - val t2 = "testcat2.v2tbl" - withTable(t1, t2) { - sql(s"CREATE TABLE $t1 USING foo AS SELECT id, data FROM source") - sql(s"CREATE TABLE $t2 USING foo AS SELECT id, data FROM source2") - val df1 = spark.table("source") - val df2 = spark.table("source2") - val df_joined = df1.join(df2).where(df1("id") + 1 === df2("id")) - checkAnswer( - sql(s""" - |SELECT * - |FROM $t1 t1, $t2 t2 - |WHERE t1.id + 1 = t2.id - """.stripMargin), - df_joined) - } - } - - test("InsertInto: append - across catalog") { - val t1 = "testcat.ns1.ns2.tbl" - val t2 = "testcat2.db.tbl" - withTable(t1, t2) { - sql(s"CREATE TABLE $t1 USING foo AS SELECT * FROM source") - sql(s"CREATE TABLE $t2 (id bigint, data string) USING foo") - sql(s"INSERT INTO $t2 SELECT * FROM $t1") - checkAnswer(spark.table(t2), spark.table("source")) - } - } - - test("ShowTables: using v2 catalog") { - spark.sql("CREATE TABLE testcat.db.table_name (id bigint, 
data string) USING foo") - spark.sql("CREATE TABLE testcat.n1.n2.db.table_name (id bigint, data string) USING foo") - - runShowTablesSql("SHOW TABLES FROM testcat.db", Seq(Row("db", "table_name"))) - - runShowTablesSql( - "SHOW TABLES FROM testcat.n1.n2.db", - Seq(Row("n1.n2.db", "table_name"))) - } - - test("ShowTables: using v2 catalog with a pattern") { - spark.sql("CREATE TABLE testcat.db.table (id bigint, data string) USING foo") - spark.sql("CREATE TABLE testcat.db.table_name_1 (id bigint, data string) USING foo") - spark.sql("CREATE TABLE testcat.db.table_name_2 (id bigint, data string) USING foo") - spark.sql("CREATE TABLE testcat.db2.table_name_2 (id bigint, data string) USING foo") - - runShowTablesSql( - "SHOW TABLES FROM testcat.db", - Seq( - Row("db", "table"), - Row("db", "table_name_1"), - Row("db", "table_name_2"))) - - runShowTablesSql( - "SHOW TABLES FROM testcat.db LIKE '*name*'", - Seq(Row("db", "table_name_1"), Row("db", "table_name_2"))) - - runShowTablesSql( - "SHOW TABLES FROM testcat.db LIKE '*2'", - Seq(Row("db", "table_name_2"))) - } - - test("ShowTables: using v2 catalog, namespace doesn't exist") { - runShowTablesSql("SHOW TABLES FROM testcat.unknown", Seq()) - } - - test("ShowTables: using v1 catalog") { - runShowTablesSql( - "SHOW TABLES FROM default", - Seq(Row("", "source", true), Row("", "source2", true)), - expectV2Catalog = false) - } - - test("ShowTables: using v1 catalog, db doesn't exist ") { - // 'db' below resolves to a database name for v1 catalog because there is no catalog named - // 'db' and there is no default catalog set. 
- val exception = intercept[NoSuchDatabaseException] { - runShowTablesSql("SHOW TABLES FROM db", Seq(), expectV2Catalog = false) - } - - assert(exception.getMessage.contains("Database 'db' not found")) - } - - test("ShowTables: using v1 catalog, db name with multipartIdentifier ('a.b') is not allowed.") { - val exception = intercept[AnalysisException] { - runShowTablesSql("SHOW TABLES FROM a.b", Seq(), expectV2Catalog = false) - } - - assert(exception.getMessage.contains("The database name is not valid: a.b")) - } - - test("ShowTables: using v2 catalog with empty namespace") { - spark.sql("CREATE TABLE testcat.table (id bigint, data string) USING foo") - runShowTablesSql("SHOW TABLES FROM testcat", Seq(Row("", "table"))) - } - - test("ShowTables: namespace is not specified and default v2 catalog is set") { - spark.conf.set("spark.sql.default.catalog", "testcat") - spark.sql("CREATE TABLE testcat.table (id bigint, data string) USING foo") - - // v2 catalog is used where default namespace is empty for TestInMemoryTableCatalog. 
- runShowTablesSql("SHOW TABLES", Seq(Row("", "table"))) - } - - test("ShowTables: namespace not specified and default v2 catalog not set - fallback to v1") { - runShowTablesSql( - "SHOW TABLES", - Seq(Row("", "source", true), Row("", "source2", true)), - expectV2Catalog = false) - - runShowTablesSql( - "SHOW TABLES LIKE '*2'", - Seq(Row("", "source2", true)), - expectV2Catalog = false) - } - - private def runShowTablesSql( - sqlText: String, - expected: Seq[Row], - expectV2Catalog: Boolean = true): Unit = { - val schema = if (expectV2Catalog) { - new StructType() - .add("namespace", StringType, nullable = false) - .add("tableName", StringType, nullable = false) - } else { - new StructType() - .add("database", StringType, nullable = false) - .add("tableName", StringType, nullable = false) - .add("isTemporary", BooleanType, nullable = false) - } - - val df = spark.sql(sqlText) - assert(df.schema === schema) - assert(expected === df.collect()) - } - - test("tableCreation: partition column case insensitive resolution") { - val testCatalog = catalog("testcat").asTableCatalog - val sessionCatalog = catalog("session").asTableCatalog - - def checkPartitioning(cat: TableCatalog, partition: String): Unit = { - val table = cat.loadTable(Identifier.of(Array.empty, "tbl")) - val partitions = table.partitioning().map(_.references()) - assert(partitions.length === 1) - val fieldNames = partitions.flatMap(_.map(_.fieldNames())) - assert(fieldNames === Array(Array(partition))) - } - - sql(s"CREATE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") - checkPartitioning(sessionCatalog, "a") - sql(s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") - checkPartitioning(testCatalog, "a") - sql(s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") - checkPartitioning(sessionCatalog, "b") - sql(s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") - 
checkPartitioning(testCatalog, "b") - } - - test("tableCreation: partition column case sensitive resolution") { - def checkFailure(statement: String): Unit = { - withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { - val e = intercept[AnalysisException] { - sql(statement) - } - assert(e.getMessage.contains("Couldn't find column")) - } - } - - checkFailure(s"CREATE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") - checkFailure(s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (A)") - checkFailure( - s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") - checkFailure( - s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source PARTITIONED BY (B)") - } - - test("tableCreation: duplicate column names in the table definition") { - val errorMsg = "Found duplicate column(s) in the table definition of `t`" - Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => - withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { - testCreateAnalysisError( - s"CREATE TABLE t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE testcat.t ($c0 INT, $c1 INT) USING $v2Source", - errorMsg - ) - } - } - } - - test("tableCreation: duplicate nested column names in the table definition") { - val errorMsg = "Found duplicate column(s) in the table definition of `t`" - Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => - withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { - testCreateAnalysisError( - s"CREATE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE TABLE testcat.t (d 
struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE testcat.t (d struct<$c0: INT, $c1: INT>) USING $v2Source", - errorMsg - ) - } - } - } - - test("tableCreation: bucket column names not in table definition") { - val errorMsg = "Couldn't find column c in" - testCreateAnalysisError( - s"CREATE TABLE tbl (a int, b string) USING $v2Source CLUSTERED BY (c) INTO 4 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE TABLE testcat.tbl (a int, b string) USING $v2Source CLUSTERED BY (c) INTO 4 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE tbl (a int, b string) USING $v2Source " + - "CLUSTERED BY (c) INTO 4 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE testcat.tbl (a int, b string) USING $v2Source " + - "CLUSTERED BY (c) INTO 4 BUCKETS", - errorMsg - ) - } - - test("tableCreation: column repeated in partition columns") { - val errorMsg = "Found duplicate column(s) in the partitioning" - Seq((true, ("a", "a")), (false, ("aA", "Aa"))).foreach { case (caseSensitive, (c0, c1)) => - withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { - testCreateAnalysisError( - s"CREATE TABLE t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", - errorMsg - ) - testCreateAnalysisError( - s"CREATE TABLE testcat.t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE testcat.t ($c0 INT) USING $v2Source PARTITIONED BY ($c0, $c1)", - errorMsg - ) - } - } - } - - test("tableCreation: column repeated in bucket columns") { - val errorMsg = "Found duplicate column(s) in the bucket definition" - Seq((true, ("a", "a")), (false, ("aA", 
"Aa"))).foreach { case (caseSensitive, (c0, c1)) => - withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { - testCreateAnalysisError( - s"CREATE TABLE t ($c0 INT) USING $v2Source " + - s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE TABLE testcat.t ($c0 INT) USING $v2Source " + - s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE t ($c0 INT) USING $v2Source " + - s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", - errorMsg - ) - testCreateAnalysisError( - s"CREATE OR REPLACE TABLE testcat.t ($c0 INT) USING $v2Source " + - s"CLUSTERED BY ($c0, $c1) INTO 2 BUCKETS", - errorMsg - ) - } - } - } - - test("DeleteFrom: basic") { - val t = "testcat.ns1.ns2.tbl" - withTable(t) { - sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") - sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") - sql(s"DELETE FROM $t WHERE id = 2") - checkAnswer(spark.table(t), Seq( - Row(3, "c", 3))) - } - } - - test("DeleteFrom: alias") { - val t = "testcat.ns1.ns2.tbl" - withTable(t) { - sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") - sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") - sql(s"DELETE FROM $t tbl WHERE tbl.id = 2") - checkAnswer(spark.table(t), Seq( - Row(3, "c", 3))) - } - } - - test("DeleteFrom: fail if has subquery") { - val t = "testcat.ns1.ns2.tbl" - withTable(t) { - sql(s"CREATE TABLE $t (id bigint, data string, p int) USING foo PARTITIONED BY (id, p)") - sql(s"INSERT INTO $t VALUES (2L, 'a', 2), (2L, 'b', 3), (3L, 'c', 3)") - val exc = intercept[AnalysisException] { - sql(s"DELETE FROM $t WHERE id IN (SELECT id FROM $t)") - } - - assert(spark.table(t).count === 3) - assert(exc.getMessage.contains("Delete by condition with subquery is not supported")) - } - } - - private def testCreateAnalysisError(sqlStatement: String, expectedError: String): 
Unit = { - val errMsg = intercept[AnalysisException] { - sql(sqlStatement) - }.getMessage - assert(errMsg.contains(expectedError)) - } -} - - -/** Used as a V2 DataSource for V2SessionCatalog DDL */ -class FakeV2Provider extends TableProvider { - override def getTable(options: CaseInsensitiveStringMap): Table = { - throw new UnsupportedOperationException("Unnecessary for DDL tests") - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeprecatedStreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeprecatedStreamingAggregationSuite.scala new file mode 100644 index 0000000000000..99f7e32d4df72 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeprecatedStreamingAggregationSuite.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.streaming + +import org.scalatest.Assertions + +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManager +import org.apache.spark.sql.expressions.scalalang.typed +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.OutputMode._ + +@deprecated("This test suite will be removed.", "3.0.0") +class DeprecatedStreamingAggregationSuite extends StateStoreMetricsTest with Assertions { + + import testImplicits._ + + def executeFuncWithStateVersionSQLConf( + stateVersion: Int, + confPairs: Seq[(String, String)], + func: => Any): Unit = { + withSQLConf(confPairs ++ + Seq(SQLConf.STREAMING_AGGREGATION_STATE_FORMAT_VERSION.key -> stateVersion.toString): _*) { + func + } + } + + def testWithAllStateVersions(name: String, confPairs: (String, String)*) + (func: => Any): Unit = { + for (version <- StreamingAggregationStateManager.supportedVersions) { + test(s"$name - state format version $version") { + executeFuncWithStateVersionSQLConf(version, confPairs, func) + } + } + } + + + testWithAllStateVersions("typed aggregators") { + val inputData = MemoryStream[(String, Int)] + val aggregated = inputData.toDS().groupByKey(_._1).agg(typed.sumLong(_._2)) + + testStream(aggregated, Update)( + AddData(inputData, ("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)), + CheckLastBatch(("a", 30), ("b", 3), ("c", 1)) + ) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 7d343bb58ea3f..877965100f018 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -22,10 +22,13 @@ import java.nio.file.Files import java.util.Locale import scala.collection.JavaConverters._ +import 
scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.JobContext import org.apache.spark.SparkConf +import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd} import org.apache.spark.sql.{AnalysisException, DataFrame} import org.apache.spark.sql.execution.DataSourceScanExec @@ -389,7 +392,7 @@ abstract class FileStreamSinkSuite extends StreamTest { var bytesWritten: Long = 0L try { spark.sparkContext.addSparkListener(new SparkListener() { - override def onTaskEnd(taskEnd: SparkListenerTaskEnd) { + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { val outputMetrics = taskEnd.taskMetrics.outputMetrics recordsWritten += outputMetrics.recordsWritten bytesWritten += outputMetrics.bytesWritten @@ -473,6 +476,125 @@ abstract class FileStreamSinkSuite extends StreamTest { assert(outputFiles.toList.isEmpty, "Incomplete files should be cleaned up.") } } + + testQuietly("cleanup complete but invalid output for aborted job") { + withSQLConf(("spark.sql.streaming.commitProtocolClass", + classOf[PendingCommitFilesTrackingManifestFileCommitProtocol].getCanonicalName)) { + withTempDir { tempDir => + val checkpointDir = new File(tempDir, "chk") + val outputDir = new File(tempDir, "output @#output") + val inputData = MemoryStream[Int] + inputData.addData(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + val q = inputData.toDS() + .repartition(10) + .map { value => + // we intend task failure after some tasks succeeds + if (value == 5) { + // put some delay to let other task commits before this task fails + Thread.sleep(100) + value / 0 + } else { + value + } + } + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format("parquet") + .start(outputDir.getCanonicalPath) + + intercept[StreamingQueryException] { + try { + q.processAllAvailable() + } finally { + q.stop() + } + } + + import PendingCommitFilesTrackingManifestFileCommitProtocol._ 
+ val outputFileNames = Files.walk(outputDir.toPath).iterator().asScala + .filter(_.toString.endsWith(".parquet")) + .map(_.getFileName.toString) + .toSet + val trackingFileNames = tracking.map(new Path(_).getName).toSet + + // there would be possible to have race condition: + // - some tasks complete while abortJob is being called + // we can't delete complete files for these tasks (it's OK since this is a best effort) + assert(outputFileNames.intersect(trackingFileNames).isEmpty, + "abortJob should clean up files reported as successful.") + } + } + } + + test("Handle FileStreamSink metadata correctly for empty partition") { + Seq("parquet", "orc", "text", "json").foreach { format => + val inputData = MemoryStream[String] + val df = inputData.toDF() + + withTempDir { outputDir => + withTempDir { checkpointDir => + var query: StreamingQuery = null + try { + // repartition to more than the input to leave empty partitions + query = + df.repartition(10) + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .format(format) + .start(outputDir.getCanonicalPath) + + inputData.addData("1", "2", "3") + inputData.addData("4", "5") + + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + if (query != null) { + query.stop() + } + } + + val fs = new Path(outputDir.getCanonicalPath).getFileSystem( + spark.sessionState.newHadoopConf()) + val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, + outputDir.getCanonicalPath) + + val allFiles = sinkLog.allFiles() + // only files from non-empty partition should be logged + assert(allFiles.length < 10) + assert(allFiles.forall(file => fs.exists(new Path(file.path)))) + + // the query should be able to read all rows correctly with metadata log + val outputDf = spark.read.format(format).load(outputDir.getCanonicalPath) + .selectExpr("CAST(value AS INT)").as[Int] + checkDatasetUnorderly(outputDf, 1, 2, 3, 4, 5) + } + } + } + } +} + +object 
PendingCommitFilesTrackingManifestFileCommitProtocol { + val tracking: ArrayBuffer[String] = new ArrayBuffer[String]() + + def cleanPendingCommitFiles(): Unit = tracking.clear() + def addPendingCommitFiles(paths: Seq[String]): Unit = tracking ++= paths +} + +class PendingCommitFilesTrackingManifestFileCommitProtocol(jobId: String, path: String) + extends ManifestFileCommitProtocol(jobId, path) { + import PendingCommitFilesTrackingManifestFileCommitProtocol._ + + override def setupJob(jobContext: JobContext): Unit = { + super.setupJob(jobContext) + cleanPendingCommitFiles() + } + + override def onTaskCommit(taskCommit: FileCommitProtocol.TaskCommitMessage): Unit = { + super.onTaskCommit(taskCommit) + addPendingCommitFiles(taskCommit.obj.asInstanceOf[Seq[SinkFileStatus]].map(_.path)) + } } class FileStreamSinkV1Suite extends FileStreamSinkSuite { @@ -535,7 +657,7 @@ class FileStreamSinkV2Suite extends FileStreamSinkSuite { // Verify that MetadataLogFileIndex is being used and the correct partitioning schema has // been inferred val table = df.queryExecution.analyzed.collect { - case DataSourceV2Relation(table: FileTable, _, _) => table + case DataSourceV2Relation(table: FileTable, _, _, _, _) => table } assert(table.size === 1) assert(table.head.fileIndex.isInstanceOf[MetadataLogFileIndex]) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index f59f819c9c108..fa320333143ec 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -20,23 +20,27 @@ package org.apache.spark.sql.streaming import java.io.File import java.net.URI +import scala.collection.mutable import scala.util.Random -import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem} +import org.apache.hadoop.fs._ +import 
org.apache.hadoop.fs.permission.FsPermission +import org.apache.hadoop.util.Progressable import org.scalatest.PrivateMethodTester -import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, SeenFilesMap} +import org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, SeenFilesMap, SourceFileArchiver} import org.apache.spark.sql.execution.streaming.sources.MemorySink import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.ExistsThrowsExceptionFileSystem._ import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.{StructType, _} import org.apache.spark.util.Utils abstract class FileStreamSourceTest @@ -49,7 +53,7 @@ abstract class FileStreamSourceTest * `FileStreamSource` actually being used in the execution. 
*/ abstract class AddFileData extends AddData { - private val _qualifiedBasePath = PrivateMethod[Path]('qualifiedBasePath) + private val _qualifiedBasePath = PrivateMethod[Path](Symbol("qualifiedBasePath")) private def isSamePath(fileSource: FileStreamSource, srcPath: File): Boolean = { val path = (fileSource invokePrivate _qualifiedBasePath()).toString.stripPrefix("file:") @@ -146,6 +150,20 @@ abstract class FileStreamSourceTest } } + case class AddFilesToFileStreamSinkLog( + fs: FileSystem, + srcDir: Path, + sinkLog: FileStreamSinkLog, + batchId: Int)( + pathFilter: Path => Boolean) extends ExternalAction { + override def runAction(): Unit = { + val statuses = fs.listStatus(srcDir, new PathFilter { + override def accept(path: Path): Boolean = pathFilter(path) + }) + sinkLog.add(batchId, statuses.map(SinkFileStatus(_))) + } + } + /** Use `format` and `path` to create FileStreamSource via DataFrameReader */ def createFileStream( format: String, @@ -177,8 +195,7 @@ abstract class FileStreamSourceTest } } - - protected def withTempDirs(body: (File, File) => Unit) { + protected def withTempDirs(body: (File, File) => Unit): Unit = { val src = Utils.createTempDir(namePrefix = "streaming.src") val tmp = Utils.createTempDir(namePrefix = "streaming.tmp") try { @@ -189,6 +206,19 @@ abstract class FileStreamSourceTest } } + protected def withThreeTempDirs(body: (File, File, File) => Unit): Unit = { + val src = Utils.createTempDir(namePrefix = "streaming.src") + val tmp = Utils.createTempDir(namePrefix = "streaming.tmp") + val archive = Utils.createTempDir(namePrefix = "streaming.archive") + try { + body(src, tmp, archive) + } finally { + Utils.deleteRecursively(src) + Utils.deleteRecursively(tmp) + Utils.deleteRecursively(archive) + } + } + val valueSchema = new StructType().add("value", StringType) } @@ -1144,6 +1174,62 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + test("SPARK-30669: maxFilesPerTrigger - ignored when using Trigger.Once") { + 
withTempDirs { (src, target) => + val checkpoint = new File(target, "chk").getCanonicalPath + val targetDir = new File(target, "data").getCanonicalPath + var lastFileModTime: Option[Long] = None + + /** Create a text file with a single data item */ + def createFile(data: Int): File = { + val file = stringToFile(new File(src, s"$data.txt"), data.toString) + if (lastFileModTime.nonEmpty) file.setLastModified(lastFileModTime.get + 1000) + lastFileModTime = Some(file.lastModified) + file + } + + createFile(1) + createFile(2) + createFile(3) + + // Set up a query to read text files one at a time + val df = spark + .readStream + .option("maxFilesPerTrigger", 1) + .text(src.getCanonicalPath) + + def startQuery(): StreamingQuery = { + df.writeStream + .format("parquet") + .trigger(Trigger.Once) + .option("checkpointLocation", checkpoint) + .start(targetDir) + } + val q = startQuery() + + try { + assert(q.awaitTermination(streamingTimeout.toMillis)) + assert(q.recentProgress.count(_.numInputRows != 0) == 1) // only one trigger was run + checkAnswer(sql(s"SELECT * from parquet.`$targetDir`"), (1 to 3).map(_.toString).toDF) + } finally { + q.stop() + } + + createFile(4) + createFile(5) + + // run a second batch + val q2 = startQuery() + try { + assert(q2.awaitTermination(streamingTimeout.toMillis)) + assert(q2.recentProgress.count(_.numInputRows != 0) == 1) // only one trigger was run + checkAnswer(sql(s"SELECT * from parquet.`$targetDir`"), (1 to 5).map(_.toString).toDF) + } finally { + q2.stop() + } + } + } + test("explain") { withTempDirs { case (src, tmp) => src.mkdirs() @@ -1218,8 +1304,8 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } test("compact interval metadata log") { - val _sources = PrivateMethod[Seq[Source]]('sources) - val _metadataLog = PrivateMethod[FileStreamSourceLog]('metadataLog) + val _sources = PrivateMethod[Seq[Source]](Symbol("sources")) + val _metadataLog = PrivateMethod[FileStreamSourceLog](Symbol("metadataLog")) def verify( 
execution: StreamExecution, @@ -1303,7 +1389,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { AddTextFileData("keep3", src, tmp), CheckAnswer("keep1", "keep2", "keep3"), AssertOnQuery("check getBatch") { execution: StreamExecution => - val _sources = PrivateMethod[Seq[Source]]('sources) + val _sources = PrivateMethod[Seq[Source]](Symbol("sources")) val fileSource = getSourcesFromStreamingQuery(execution).head def verify(startId: Option[Int], endId: Int, expected: String*): Unit = { @@ -1386,9 +1472,13 @@ class FileStreamSourceSuite extends FileStreamSourceTest { latestFirst: Boolean, firstBatch: String, secondBatch: String, - maxFileAge: Option[String] = None): Unit = { + maxFileAge: Option[String] = None, + cleanSource: CleanSourceMode.Value = CleanSourceMode.OFF, + archiveDir: Option[String] = None): Unit = { val srcOptions = Map("latestFirst" -> latestFirst.toString, "maxFilesPerTrigger" -> "1") ++ - maxFileAge.map("maxFileAge" -> _) + maxFileAge.map("maxFileAge" -> _) ++ + Seq("cleanSource" -> cleanSource.toString) ++ + archiveDir.map("sourceArchiveDir" -> _) val fileStream = createFileStream( "text", src.getCanonicalPath, @@ -1547,7 +1637,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { val actions = Seq( AddTextFileData(source1Content, sourceDir1, tmp), AddTextFileData(source2Content, sourceDir2, tmp) - ).filter(_.content != null) // don't write to a source dir if no content specified + ).filter(_.content != null) // don't write to a source dir if no content specified StreamProgressLockedActions(actions, desc = actions.mkString("[ ", " | ", " ]")) } @@ -1596,6 +1686,255 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } } + + test("remove completed files when remove option is enabled") { + withTempDirs { case (src, tmp) => + withSQLConf( + SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "2", + // Force deleting the old logs + SQLConf.FILE_SOURCE_LOG_CLEANUP_DELAY.key -> "1", + 
SQLConf.FILE_SOURCE_CLEANER_NUM_THREADS.key -> "0" + ) { + val option = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "1", + "cleanSource" -> "delete") + + val fileStream = createFileStream("text", src.getCanonicalPath, options = option) + val filtered = fileStream.filter($"value" contains "keep") + + testStream(filtered)( + AddTextFileData("keep1", src, tmp, tmpFilePrefix = "keep1"), + CheckAnswer("keep1"), + AssertOnQuery("input file removed") { _: StreamExecution => + // it doesn't rename any file yet + assertFileIsNotRemoved(src, "keep1") + true + }, + AddTextFileData("keep2", src, tmp, tmpFilePrefix = "ke ep2 %"), + CheckAnswer("keep1", "keep2"), + AssertOnQuery("input file removed") { _: StreamExecution => + // it renames input file for first batch, but not for second batch yet + assertFileIsRemoved(src, "keep1") + assertFileIsNotRemoved(src, "ke ep2 %") + + true + }, + AddTextFileData("keep3", src, tmp, tmpFilePrefix = "keep3"), + CheckAnswer("keep1", "keep2", "keep3"), + AssertOnQuery("input file renamed") { _: StreamExecution => + // it renames input file for second batch, but not third batch yet + assertFileIsRemoved(src, "ke ep2 %") + assertFileIsNotRemoved(src, "keep3") + + true + } + ) + } + } + } + + test("move completed files to archive directory when archive option is enabled") { + withThreeTempDirs { case (src, tmp, archiveDir) => + withSQLConf( + SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "2", + // Force deleting the old logs + SQLConf.FILE_SOURCE_LOG_CLEANUP_DELAY.key -> "1", + SQLConf.FILE_SOURCE_CLEANER_NUM_THREADS.key -> "0" + ) { + val option = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "1", + "cleanSource" -> "archive", "sourceArchiveDir" -> archiveDir.getAbsolutePath) + + val fileStream = createFileStream("text", s"${src.getCanonicalPath}/*/*", + options = option) + val filtered = fileStream.filter($"value" contains "keep") + + // src/k %1 + // file: src/k %1/keep1 + val dirForKeep1 = new File(src, "k %1") + // src/k 
%1/k 2 + // file: src/k %1/k 2/keep2 + val dirForKeep2 = new File(dirForKeep1, "k 2") + // src/k3 + // file: src/k3/keep3 + val dirForKeep3 = new File(src, "k3") + + val expectedMovedDir1 = new File(archiveDir.getAbsolutePath + dirForKeep1.toURI.getPath) + val expectedMovedDir2 = new File(archiveDir.getAbsolutePath + dirForKeep2.toURI.getPath) + val expectedMovedDir3 = new File(archiveDir.getAbsolutePath + dirForKeep3.toURI.getPath) + + testStream(filtered)( + AddTextFileData("keep1", dirForKeep1, tmp, tmpFilePrefix = "keep1"), + CheckAnswer("keep1"), + AssertOnQuery("input file archived") { _: StreamExecution => + // it doesn't rename any file yet + assertFileIsNotMoved(dirForKeep1, expectedMovedDir1, "keep1") + true + }, + AddTextFileData("keep2", dirForKeep2, tmp, tmpFilePrefix = "keep2 %"), + CheckAnswer("keep1", "keep2"), + AssertOnQuery("input file archived") { _: StreamExecution => + // it renames input file for first batch, but not for second batch yet + assertFileIsMoved(dirForKeep1, expectedMovedDir1, "keep1") + assertFileIsNotMoved(dirForKeep2, expectedMovedDir2, "keep2 %") + true + }, + AddTextFileData("keep3", dirForKeep3, tmp, tmpFilePrefix = "keep3"), + CheckAnswer("keep1", "keep2", "keep3"), + AssertOnQuery("input file archived") { _: StreamExecution => + // it renames input file for second batch, but not third batch yet + assertFileIsMoved(dirForKeep2, expectedMovedDir2, "keep2 %") + assertFileIsNotMoved(dirForKeep3, expectedMovedDir3, "keep3") + + true + }, + AddTextFileData("keep4", dirForKeep3, tmp, tmpFilePrefix = "keep4"), + CheckAnswer("keep1", "keep2", "keep3", "keep4"), + AssertOnQuery("input file archived") { _: StreamExecution => + // it renames input file for third batch, but not fourth batch yet + assertFileIsMoved(dirForKeep3, expectedMovedDir3, "keep3") + assertFileIsNotMoved(dirForKeep3, expectedMovedDir3, "keep4") + + true + } + ) + } + } + } + + Seq("delete", "archive").foreach { cleanOption => + test(s"Throw 
UnsupportedOperationException on configuring $cleanOption when source path" + + " refers the output dir of FileStreamSink") { + withThreeTempDirs { case (src, tmp, archiveDir) => + withSQLConf( + SQLConf.FILE_SOURCE_LOG_COMPACT_INTERVAL.key -> "2", + // Force deleting the old logs + SQLConf.FILE_SOURCE_LOG_CLEANUP_DELAY.key -> "1", + SQLConf.FILE_SOURCE_CLEANER_NUM_THREADS.key -> "0" + ) { + val option = Map("latestFirst" -> "false", "maxFilesPerTrigger" -> "1", + "cleanSource" -> cleanOption, "sourceArchiveDir" -> archiveDir.getAbsolutePath) + + val fileStream = createFileStream("text", src.getCanonicalPath, options = option) + val filtered = fileStream.filter($"value" contains "keep") + + // create FileStreamSinkLog under source directory + val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark, + new File(src, FileStreamSink.metadataDir).getCanonicalPath) + val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) + val srcPath = new Path(src.getCanonicalPath) + val fileSystem = srcPath.getFileSystem(hadoopConf) + + // Here we will just check whether the source file is removed or not, as we cover + // functionality test of "archive" in other UT. 
+ testStream(filtered)( + AddTextFileData("keep1", src, tmp, tmpFilePrefix = "keep1"), + AddFilesToFileStreamSinkLog(fileSystem, srcPath, sinkLog, 0) { path => + path.getName.startsWith("keep1") + }, + ExpectFailure[UnsupportedOperationException]( + t => assert(t.getMessage.startsWith("Clean up source files is not supported")), + isFatalError = false) + ) + } + } + } + } + + class FakeFileSystem(scheme: String) extends FileSystem { + override def exists(f: Path): Boolean = true + + override def mkdirs(f: Path, permission: FsPermission): Boolean = true + + override def rename(src: Path, dst: Path): Boolean = true + + override def getUri: URI = URI.create(s"${scheme}:///") + + override def open(f: Path, bufferSize: Int): FSDataInputStream = throw new NotImplementedError + + override def create( + f: Path, + permission: FsPermission, + overwrite: Boolean, + bufferSize: Int, + replication: Short, + blockSize: Long, + progress: Progressable): FSDataOutputStream = throw new NotImplementedError + + override def append(f: Path, bufferSize: Int, progress: Progressable): FSDataOutputStream = + throw new NotImplementedError + + override def delete(f: Path, recursive: Boolean): Boolean = throw new NotImplementedError + + override def listStatus(f: Path): Array[FileStatus] = throw new NotImplementedError + + override def setWorkingDirectory(new_dir: Path): Unit = throw new NotImplementedError + + override def getWorkingDirectory: Path = new Path("/somewhere") + + override def getFileStatus(f: Path): FileStatus = throw new NotImplementedError + } + + test("SourceFileArchiver - fail when base archive path matches source pattern") { + val fakeFileSystem = new FakeFileSystem("fake") + + def assertThrowIllegalArgumentException(sourcePatttern: Path, baseArchivePath: Path): Unit = { + intercept[IllegalArgumentException] { + new SourceFileArchiver(fakeFileSystem, sourcePatttern, fakeFileSystem, baseArchivePath) + } + } + + // 1) prefix of base archive path matches source pattern 
(baseArchiveDirPath has more depths) + val sourcePatternPath = new Path("/hello*/spar?") + val baseArchiveDirPath = new Path("/hello/spark/structured/streaming") + assertThrowIllegalArgumentException(sourcePatternPath, baseArchiveDirPath) + + // 2) prefix of source pattern matches base archive path (source pattern has more depths) + val sourcePatternPath2 = new Path("/hello*/spar?/structured/streaming") + val baseArchiveDirPath2 = new Path("/hello/spark/structured") + assertThrowIllegalArgumentException(sourcePatternPath2, baseArchiveDirPath2) + + // 3) source pattern matches base archive path (both have same depth) + val sourcePatternPath3 = new Path("/hello*/spar?/structured/*") + val baseArchiveDirPath3 = new Path("/hello/spark/structured/streaming") + assertThrowIllegalArgumentException(sourcePatternPath3, baseArchiveDirPath3) + } + + test("SourceFileArchiver - different filesystems between source and archive") { + val fakeFileSystem = new FakeFileSystem("fake") + val fakeFileSystem2 = new FakeFileSystem("fake2") + + val sourcePatternPath = new Path("/hello*/h{e,f}ll?") + val baseArchiveDirPath = new Path("/hello") + + intercept[IllegalArgumentException] { + new SourceFileArchiver(fakeFileSystem, sourcePatternPath, fakeFileSystem2, + baseArchiveDirPath) + } + } + + private def assertFileIsRemoved(sourceDir: File, fileName: String): Unit = { + assert(!sourceDir.list().exists(_.startsWith(fileName))) + } + + private def assertFileIsNotRemoved(sourceDir: File, fileName: String): Unit = { + assert(sourceDir.list().exists(_.startsWith(fileName))) + } + + private def assertFileIsNotMoved(sourceDir: File, expectedDir: File, filePrefix: String): Unit = { + assert(sourceDir.exists()) + assert(sourceDir.list().exists(_.startsWith(filePrefix))) + if (!expectedDir.exists()) { + // OK + } else { + assert(!expectedDir.list().exists(_.startsWith(filePrefix))) + } + } + + private def assertFileIsMoved(sourceDir: File, expectedDir: File, filePrefix: String): Unit = { + 
assert(sourceDir.exists()) + assert(!sourceDir.list().exists(_.startsWith(filePrefix))) + assert(expectedDir.exists()) + assert(expectedDir.list().exists(_.startsWith(filePrefix))) + } } class FileStreamSourceStressTestSuite extends FileStreamSourceTest { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index df7e9217f9140..d36c64f61a726 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -125,6 +125,8 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { var state: GroupStateImpl[Int] = GroupStateImpl.createForStreaming( None, 1000, 1000, ProcessingTimeTimeout, hasTimedOut = false, watermarkPresent = false) assert(state.getTimeoutTimestamp === NO_TIMESTAMP) + state.setTimeoutDuration("-1 month 31 days 1 second") + assert(state.getTimeoutTimestamp === 2000) state.setTimeoutDuration(500) assert(state.getTimeoutTimestamp === 1500) // can be set without initializing state testTimeoutTimestampNotAllowed[UnsupportedOperationException](state) @@ -225,8 +227,9 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { testIllegalTimeout { state.setTimeoutDuration("-1 month") } + testIllegalTimeout { - state.setTimeoutDuration("1 month -1 day") + state.setTimeoutDuration("1 month -31 day") } state = GroupStateImpl.createForStreaming( @@ -241,7 +244,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { state.setTimeoutTimestamp(10000, "-1 month") } testIllegalTimeout { - state.setTimeoutTimestamp(10000, "1 month -1 day") + state.setTimeoutTimestamp(10000, "1 month -32 day") } testIllegalTimeout { state.setTimeoutTimestamp(new Date(-10000)) @@ -253,7 +256,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest { 
state.setTimeoutTimestamp(new Date(-10000), "-1 month") } testIllegalTimeout { - state.setTimeoutTimestamp(new Date(-10000), "1 month -1 day") + state.setTimeoutTimestamp(new Date(-10000), "1 month -32 day") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 958d15ba1701d..b6618826487c6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -19,8 +19,9 @@ package org.apache.spark.sql.streaming import java.io.{File, InterruptedIOException, IOException, UncheckedIOException} import java.nio.channels.ClosedByInterruptException -import java.util.concurrent.{CountDownLatch, ExecutionException, TimeoutException, TimeUnit} +import java.util.concurrent.{CountDownLatch, ExecutionException, TimeUnit} +import scala.concurrent.TimeoutException import scala.reflect.ClassTag import scala.util.control.ControlThrowable @@ -35,6 +36,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.Range import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.{LocalLimitExec, SimpleMode, SparkPlan} import org.apache.spark.sql.execution.command.ExplainCommand import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.{ContinuousMemoryStream, MemorySink} @@ -42,7 +44,7 @@ import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreCon import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.streaming.util.StreamManualClock +import org.apache.spark.sql.streaming.util.{BlockOnStopSourceProvider, StreamManualClock} import 
org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.Utils @@ -201,7 +203,7 @@ class StreamSuite extends StreamTest { } test("DataFrame reuse") { - def assertDF(df: DataFrame) { + def assertDF(df: DataFrame): Unit = { withTempDir { outputDir => withTempDir { checkpointDir => val query = df.writeStream.format("parquet") @@ -471,7 +473,7 @@ class StreamSuite extends StreamTest { val df = inputData.toDS().map(_ + "foo").groupBy("value").agg(count("*")) // Test `df.explain` - val explain = ExplainCommand(df.queryExecution.logical, extended = false) + val explain = ExplainCommand(df.queryExecution.logical, SimpleMode) val explainString = spark.sessionState .executePlan(explain) @@ -523,7 +525,7 @@ class StreamSuite extends StreamTest { val df = inputData.toDS().map(_ * 2).filter(_ > 5) // Test `df.explain` - val explain = ExplainCommand(df.queryExecution.logical, extended = false) + val explain = ExplainCommand(df.queryExecution.logical, SimpleMode) val explainString = spark.sessionState .executePlan(explain) @@ -755,9 +757,9 @@ class StreamSuite extends StreamTest { inputData.addData(9) streamingQuery.processAllAvailable() - QueryTest.checkAnswer(spark.table("counts").toDF(), - Row("1", 1) :: Row("2", 1) :: Row("3", 2) :: Row("4", 2) :: - Row("5", 2) :: Row("6", 2) :: Row("7", 1) :: Row("8", 1) :: Row("9", 1) :: Nil) + checkAnswer(spark.table("counts").toDF(), + Row(1, 1L) :: Row(2, 1L) :: Row(3, 2L) :: Row(4, 2L) :: + Row(5, 2L) :: Row(6, 2L) :: Row(7, 1L) :: Row(8, 1L) :: Row(9, 1L) :: Nil) } finally { if (streamingQuery ne null) { streamingQuery.stop() @@ -974,24 +976,50 @@ class StreamSuite extends StreamTest { CheckAnswer(1 to 3: _*)) } - test("streaming limit in complete mode") { + test("SPARK-30658: streaming limit before agg in complete mode") { val inputData = MemoryStream[Int] val limited = inputData.toDF().limit(5).groupBy("value").count() testStream(limited, OutputMode.Complete())( AddData(inputData, 1 to 3: _*), 
CheckAnswer(Row(1, 1), Row(2, 1), Row(3, 1)), AddData(inputData, 1 to 9: _*), - CheckAnswer(Row(1, 2), Row(2, 2), Row(3, 2), Row(4, 1), Row(5, 1))) + CheckAnswer(Row(1, 2), Row(2, 2), Row(3, 1))) } - test("streaming limits in complete mode") { + test("SPARK-30658: streaming limits before and after agg in complete mode " + + "(after limit < before limit)") { val inputData = MemoryStream[Int] val limited = inputData.toDF().limit(4).groupBy("value").count().orderBy("value").limit(3) testStream(limited, OutputMode.Complete())( + StartStream(additionalConfs = Map(SQLConf.SHUFFLE_PARTITIONS.key -> "1")), AddData(inputData, 1 to 9: _*), + // only 1 to 4 should be allowed to aggregate, and counts for only 1 to 3 should be output CheckAnswer(Row(1, 1), Row(2, 1), Row(3, 1)), AddData(inputData, 2 to 6: _*), - CheckAnswer(Row(1, 1), Row(2, 2), Row(3, 2))) + // None of the new values should be allowed to aggregate, same 3 counts should be output + CheckAnswer(Row(1, 1), Row(2, 1), Row(3, 1))) + } + + test("SPARK-30658: streaming limits before and after agg in complete mode " + + "(before limit < after limit)") { + val inputData = MemoryStream[Int] + val limited = inputData.toDF().limit(2).groupBy("value").count().orderBy("value").limit(3) + testStream(limited, OutputMode.Complete())( + StartStream(additionalConfs = Map(SQLConf.SHUFFLE_PARTITIONS.key -> "1")), + AddData(inputData, 1 to 9: _*), + CheckAnswer(Row(1, 1), Row(2, 1)), + AddData(inputData, 2 to 6: _*), + CheckAnswer(Row(1, 1), Row(2, 1))) + } + + test("SPARK-30657: streaming limit after streaming dedup in append mode") { + val inputData = MemoryStream[Int] + val limited = inputData.toDF().dropDuplicates().limit(1) + testStream(limited)( + AddData(inputData, 1, 2), + CheckAnswer(Row(1)), + AddData(inputData, 3, 4), + CheckAnswer(Row(1))) } test("streaming limit in update mode") { @@ -1032,6 +1060,82 @@ class StreamSuite extends StreamTest { false)) } + test("SPARK-30657: streaming limit should not apply on limits on 
state subplans") { + val streanData = MemoryStream[Int] + val streamingDF = streanData.toDF().toDF("value") + val staticDF = spark.createDataset(Seq(1)).toDF("value").orderBy("value") + testStream(streamingDF.join(staticDF.limit(1), "value"))( + AddData(streanData, 1, 2, 3), + CheckAnswer(Row(1)), + AddData(streanData, 1, 3, 5), + CheckAnswer(Row(1), Row(1))) + } + + test("SPARK-30657: streaming limit optimization from StreamingLocalLimitExec to LocalLimitExec") { + val inputData = MemoryStream[Int] + val inputDF = inputData.toDF() + + /** Verify whether the local limit in the plan is a streaming limit or is a simple */ + def verifyLocalLimit( + df: DataFrame, + expectStreamingLimit: Boolean, + outputMode: OutputMode = OutputMode.Append): Unit = { + + var execPlan: SparkPlan = null + testStream(df, outputMode)( + AddData(inputData, 1), + AssertOnQuery { q => + q.processAllAvailable() + execPlan = q.lastExecution.executedPlan + true + } + ) + require(execPlan != null) + + val localLimits = execPlan.collect { + case l: LocalLimitExec => l + case l: StreamingLocalLimitExec => l + } + + require( + localLimits.size == 1, + s"Cant verify local limit optimization with this plan:\n$execPlan") + + if (expectStreamingLimit) { + assert( + localLimits.head.isInstanceOf[StreamingLocalLimitExec], + s"Local limit was not StreamingLocalLimitExec:\n$execPlan") + } else { + assert( + localLimits.head.isInstanceOf[LocalLimitExec], + s"Local limit was not LocalLimitExec:\n$execPlan") + } + } + + // Should not be optimized, so StreamingLocalLimitExec should be present + verifyLocalLimit(inputDF.dropDuplicates().limit(1), expectStreamingLimit = true) + + // Should be optimized from StreamingLocalLimitExec to LocalLimitExec + verifyLocalLimit(inputDF.limit(1), expectStreamingLimit = false) + verifyLocalLimit( + inputDF.limit(1).groupBy().count(), + expectStreamingLimit = false, + outputMode = OutputMode.Complete()) + + // Should be optimized as repartition is sufficient to ensure that 
the iterators of + // StreamingDeduplicationExec should be consumed completely by the repartition exchange. + verifyLocalLimit(inputDF.dropDuplicates().repartition(1).limit(1), expectStreamingLimit = false) + + // Should be LocalLimitExec in the first place, not from optimization of StreamingLocalLimitExec + val staticDF = spark.range(1).toDF("value").limit(1) + verifyLocalLimit(inputDF.toDF("value").join(staticDF, "value"), expectStreamingLimit = false) + + verifyLocalLimit( + inputDF.groupBy().count().limit(1), + expectStreamingLimit = false, + outputMode = OutputMode.Complete()) + } + test("is_continuous_processing property should be false for microbatch processing") { val input = MemoryStream[Int] val df = input.toDS() @@ -1125,6 +1229,36 @@ class StreamSuite extends StreamTest { } ) } + + // ProcessingTime trigger generates MicroBatchExecution, and ContinuousTrigger starts a + // ContinuousExecution + Seq(Trigger.ProcessingTime("1 second"), Trigger.Continuous("1 second")).foreach { trigger => + test(s"SPARK-30143: stop waits until timeout if blocked - trigger: $trigger") { + BlockOnStopSourceProvider.enableBlocking() + val sq = spark.readStream.format(classOf[BlockOnStopSourceProvider].getName) + .load() + .writeStream + .format("console") + .trigger(trigger) + .start() + failAfter(60.seconds) { + val startTime = System.nanoTime() + withSQLConf(SQLConf.STREAMING_STOP_TIMEOUT.key -> "2000") { + intercept[TimeoutException] { + sq.stop() + } + } + val duration = (System.nanoTime() - startTime) / 1e6 + assert(duration >= 2000, + s"Should have waited more than 2000 millis, but waited $duration millis") + + BlockOnStopSourceProvider.disableBlocking() + withSQLConf(SQLConf.STREAMING_STOP_TIMEOUT.key -> "0") { + sq.stop() + } + } + } + } } abstract class FakeSource extends StreamSourceProvider { @@ -1175,7 +1309,7 @@ class FakeDefaultSource extends FakeSource { ds.toDF("a") } - override def stop() {} + override def stop(): Unit = {} } } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 7914a713f0baa..6d5ad873eedea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -37,12 +37,12 @@ import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, Ro import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.AllTuples import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySink import org.apache.spark.sql.execution.streaming.state.StateStore -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, SparkDataStream} import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.{Clock, SystemClock, Utils} @@ -112,7 +112,11 @@ trait StreamTest extends QueryTest with SharedSparkSession with TimeLimits with object MultiAddData { def apply[A] (source1: MemoryStream[A], data1: A*)(source2: MemoryStream[A], data2: A*): StreamAction = { - val actions = Seq(AddDataMemory(source1, data1), AddDataMemory(source2, data2)) + apply((source1, data1), (source2, data2)) + } + + def apply[A](inputs: (MemoryStream[A], Seq[A])*): StreamAction = { + val actions = inputs.map { case (source, data) => AddDataMemory(source, data) } StreamProgressLockedActions(actions, desc = actions.mkString("[ ", " | ", " ]")) } } diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 134e61ed12a21..741355381222d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.streaming import java.io.File import java.util.{Locale, TimeZone} +import scala.collection.mutable + import org.apache.commons.io.FileUtils import org.scalatest.Assertions @@ -28,13 +30,12 @@ import org.apache.spark.rdd.BlockRDD import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.Aggregate -import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.catalyst.util.DateTimeConstants._ import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.exchange.Exchange import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.MemorySink import org.apache.spark.sql.execution.streaming.state.StreamingAggregationStateManager -import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.OutputMode._ @@ -184,7 +185,68 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { ) } - testWithAllStateVersions("state metrics") { + testWithAllStateVersions("state metrics - append mode") { + val inputData = MemoryStream[Int] + val aggWithWatermark = inputData.toDF() + .withColumn("eventTime", $"value".cast("timestamp")) + .withWatermark("eventTime", "10 seconds") + .groupBy(window($"eventTime", "5 seconds") as 'window) + .agg(count("*") as 'count) + 
.select($"window".getField("start").cast("long").as[Long], $"count".as[Long]) + + implicit class RichStreamExecution(query: StreamExecution) { + // this could be either empty row batch or actual batch + def stateNodes: Seq[SparkPlan] = { + query.lastExecution.executedPlan.collect { + case p if p.isInstanceOf[StateStoreSaveExec] => p + } + } + + def stateOperatorProgresses: Seq[StateOperatorProgress] = { + val operatorProgress = mutable.ArrayBuffer[StateOperatorProgress]() + var progress = query.recentProgress.last + + operatorProgress ++= progress.stateOperators.map { op => op.copy(op.numRowsUpdated) } + if (progress.numInputRows == 0) { + // empty batch, merge metrics from previous batch as well + progress = query.recentProgress.takeRight(2).head + operatorProgress.zipWithIndex.foreach { case (sop, index) => + // "numRowsUpdated" should be merged, as it could be updated in both batches. + // (for now it is only updated from previous batch, but things can be changed.) + // other metrics represent current status of state so picking up the latest values. 
+ val newOperatorProgress = sop.copy( + sop.numRowsUpdated + progress.stateOperators(index).numRowsUpdated) + operatorProgress(index) = newOperatorProgress + } + } + + operatorProgress + } + } + + testStream(aggWithWatermark)( + AddData(inputData, 15), + CheckAnswer(), // watermark = 5 + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 1 }, + AddData(inputData, 10, 12, 14), + CheckAnswer(), // watermark = 5 + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 0 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 }, + AddData(inputData, 25), + CheckAnswer((10, 3)), // watermark = 15 + AssertOnQuery { _.stateNodes.size === 1 }, + AssertOnQuery { _.stateNodes.head.metrics("numOutputRows").value === 1 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsUpdated === 1 }, + AssertOnQuery { _.stateOperatorProgresses.head.numRowsTotal === 2 } + ) + } + + testWithAllStateVersions("state metrics - update/complete mode") { val inputData = MemoryStream[Int] val aggregated = @@ -280,16 +342,6 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { ) } - testWithAllStateVersions("typed aggregators") { - val inputData = MemoryStream[(String, Int)] - val aggregated = inputData.toDS().groupByKey(_._1).agg(typed.sumLong(_._2)) - - testStream(aggregated, Update)( - AddData(inputData, ("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)), - CheckLastBatch(("a", 30), ("b", 3), ("c", 1)) - ) - } - testWithAllStateVersions("prune results by current_time, complete mode") { import testImplicits._ val clock = new StreamManualClock @@ -345,28 +397,29 @@ class StreamingAggregationSuite extends 
StateStoreMetricsTest with Assertions { testWithAllStateVersions("prune results by current_date, complete mode") { import testImplicits._ val clock = new StreamManualClock + val tz = TimeZone.getDefault.getID val inputData = MemoryStream[Long] val aggregated = inputData.toDF() - .select(($"value" * DateTimeUtils.SECONDS_PER_DAY).cast("timestamp").as("value")) + .select(to_utc_timestamp(from_unixtime('value * SECONDS_PER_DAY), tz)) + .toDF("value") .groupBy($"value") .agg(count("*")) - .where($"value".cast("date") >= date_sub(current_timestamp().cast("date"), 10)) - .select( - ($"value".cast("long") / DateTimeUtils.SECONDS_PER_DAY).cast("long"), $"count(1)") + .where($"value".cast("date") >= date_sub(current_date(), 10)) + .select(($"value".cast("long") / SECONDS_PER_DAY).cast("long"), $"count(1)") testStream(aggregated, Complete)( StartStream(Trigger.ProcessingTime("10 day"), triggerClock = clock), // advance clock to 10 days, should retain all keys AddData(inputData, 0L, 5L, 5L, 10L), - AdvanceManualClock(DateTimeUtils.MILLIS_PER_DAY * 10), + AdvanceManualClock(MILLIS_PER_DAY * 10), CheckLastBatch((0L, 1), (5L, 2), (10L, 1)), // advance clock to 20 days, should retain keys >= 10 AddData(inputData, 15L, 15L, 20L), - AdvanceManualClock(DateTimeUtils.MILLIS_PER_DAY * 10), + AdvanceManualClock(MILLIS_PER_DAY * 10), CheckLastBatch((10L, 1), (15L, 2), (20L, 1)), // advance clock to 30 days, should retain keys >= 20 AddData(inputData, 85L), - AdvanceManualClock(DateTimeUtils.MILLIS_PER_DAY * 10), + AdvanceManualClock(MILLIS_PER_DAY * 10), CheckLastBatch((20L, 1), (85L, 1)), // bounce stream and ensure correct batch timestamp is used @@ -376,7 +429,7 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { q.sink.asInstanceOf[MemorySink].clear() q.commitLog.purge(3) // advance by 60 days i.e., 90 days total - clock.advance(DateTimeUtils.MILLIS_PER_DAY * 60) + clock.advance(MILLIS_PER_DAY * 60) true }, StartStream(Trigger.ProcessingTime("10 day"), 
triggerClock = clock), @@ -385,7 +438,7 @@ class StreamingAggregationSuite extends StateStoreMetricsTest with Assertions { // advance clock to 100 days, should retain keys >= 90 AddData(inputData, 85L, 90L, 100L, 105L), - AdvanceManualClock(DateTimeUtils.MILLIS_PER_DAY * 10), + AdvanceManualClock(MILLIS_PER_DAY * 10), CheckLastBatch((90L, 1), (100L, 1), (105L, 1)) ) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala index 42fe9f34ee3ec..3f218c9cb7fd9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala @@ -17,12 +17,15 @@ package org.apache.spark.sql.streaming -import java.util.UUID +import java.io.File +import java.util.{Locale, UUID} import scala.util.Random +import org.apache.commons.io.FileUtils import org.scalatest.BeforeAndAfter +import org.apache.spark.SparkContext import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.analysis.StreamingJoinHelper @@ -31,7 +34,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, Filter} import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.execution.{FileSourceScanExec, LogicalRDD} import org.apache.spark.sql.execution.datasources.LogicalRelation -import org.apache.spark.sql.execution.streaming.{MemoryStream, StatefulOperatorStateInfo, StreamingSymmetricHashJoinHelper} +import org.apache.spark.sql.execution.streaming.{MemoryStream, StatefulOperatorStateInfo, StreamingSymmetricHashJoinExec, StreamingSymmetricHashJoinHelper} import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreProviderId} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -374,7 +377,7 @@ class 
StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with val rdd1 = spark.sparkContext.makeRDD(1 to 10, numPartitions) val rdd2 = spark.sparkContext.makeRDD((1 to 10).map(_.toString), numPartitions) val rdd = rdd1.stateStoreAwareZipPartitions(rdd2, stateInfo, storeNames, coordinatorRef) { - (left, right) => left.zip(right) + (_, left, right) => left.zip(right) } require(rdd.partitions.length === numPartitions) for (partIndex <- 0 until numPartitions) { @@ -418,6 +421,63 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with AddData(input2, 1.to(1000): _*), CheckAnswer(1.to(1000): _*)) } + + test("SPARK-26187 restore the stream-stream inner join query from Spark 2.4") { + val inputStream = MemoryStream[(Int, Long)] + val df = inputStream.toDS() + .select(col("_1").as("value"), col("_2").cast("timestamp").as("timestamp")) + + val leftStream = df.select(col("value").as("leftId"), col("timestamp").as("leftTime")) + + val rightStream = df + // Introduce misses for ease of debugging + .where(col("value") % 2 === 0) + .select(col("value").as("rightId"), col("timestamp").as("rightTime")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("rightId = leftId AND rightTime >= leftTime AND " + + "rightTime <= leftTime + interval 5 seconds"), + joinType = "inner") + .select(col("leftId"), col("leftTime").cast("int"), + col("rightId"), col("rightTime").cast("int")) + + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-2.4.0-streaming-join/").toURI + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. 
+ FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + inputStream.addData((1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)) + + testStream(query)( + StartStream(checkpointLocation = checkpointDir.getAbsolutePath), + /* + Note: The checkpoint was generated using the following input in Spark version 2.4.0 + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + assertNumStateRows(7, 7), + */ + AddData(inputStream, (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L)), + // batch 2: same result as above test + CheckNewAnswer((6, 6L, 6, 6L), (8, 8L, 8, 8L), (10, 10L, 10, 10L)), + assertNumStateRows(11, 6), + Execute { query => + // Verify state format = 1 + val f = query.lastExecution.executedPlan.collect { + case f: StreamingSymmetricHashJoinExec => f + } + assert(f.size == 1) + assert(f.head.stateFormatVersion == 1) + } + ) + } } @@ -712,5 +772,223 @@ class StreamingOuterJoinSuite extends StreamTest with StateStoreMetricsTest with assertNumStateRows(total = 2, updated = 2) ) } -} + test("SPARK-26187 self left outer join should not return outer nulls for already matched rows") { + val inputStream = MemoryStream[(Int, Long)] + + val df = inputStream.toDS() + .select(col("_1").as("value"), col("_2").cast("timestamp").as("timestamp")) + + val leftStream = df.select(col("value").as("leftId"), col("timestamp").as("leftTime")) + + val rightStream = df + // Introduce misses for ease of debugging + .where(col("value") % 2 === 0) + .select(col("value").as("rightId"), col("timestamp").as("rightTime")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("leftId = rightId AND rightTime >= leftTime AND " + + "rightTime <= leftTime + interval 5 seconds"), + joinType = "leftOuter") + 
.select(col("leftId"), col("leftTime").cast("int"), + col("rightId"), col("rightTime").cast("int")) + + testStream(query)( + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + assertNumStateRows(7, 7), + + AddData(inputStream, (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L)), + // batch 2 - global watermark = 5 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L), (6, 6L), (7, 7L), (8, 8L), + // (9, 9L), (10, 10L) + // right: (6, 6L), (8, 8L), (10, 10L) + // states evicted + // left: nothing (it waits for 5 seconds more than watermark due to join condition) + // right: (2, 2L), (4, 4L) + // NOTE: look for evicted rows in right which are not evicted from left - they were + // properly joined in batch 1 + CheckNewAnswer((6, 6L, 6, 6L), (8, 8L, 8, 8L), (10, 10L, 10, 10L)), + assertNumStateRows(13, 8), + + AddData(inputStream, (11, 11L), (12, 12L), (13, 13L), (14, 14L), (15, 15L)), + // batch 3 + // - global watermark = 9 <= min(9, 10) + // states + // left: (4, 4L), (5, 5L), (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L), (11, 11L), + // (12, 12L), (13, 13L), (14, 14L), (15, 15L) + // right: (10, 10L), (12, 12L), (14, 14L) + // states evicted + // left: (1, 1L), (2, 2L), (3, 3L) + // right: (6, 6L), (8, 8L) + CheckNewAnswer( + Row(12, 12L, 12, 12L), Row(14, 14L, 14, 14L), + Row(1, 1L, null, null), Row(3, 3L, null, null)), + assertNumStateRows(15, 7) + ) + } + + test("SPARK-26187 self right outer join should not return outer nulls for already matched rows") { + val inputStream = MemoryStream[(Int, Long)] + + val df = inputStream.toDS() + .select(col("_1").as("value"), col("_2").cast("timestamp").as("timestamp")) + + // we're just flipping "left" and "right" from left outer join and apply right outer join + + val leftStream = df + // Introduce misses for ease of 
debugging + .where(col("value") % 2 === 0) + .select(col("value").as("leftId"), col("timestamp").as("leftTime")) + + val rightStream = df.select(col("value").as("rightId"), col("timestamp").as("rightTime")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("leftId = rightId AND leftTime >= rightTime AND " + + "leftTime <= rightTime + interval 5 seconds"), + joinType = "rightOuter") + .select(col("leftId"), col("leftTime").cast("int"), + col("rightId"), col("rightTime").cast("int")) + + // we can just flip left and right in the explanation of left outer query test + // to assume the status of right outer query, hence skip explaining here + testStream(query)( + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + assertNumStateRows(7, 7), + + AddData(inputStream, (6, 6L), (7, 7L), (8, 8L), (9, 9L), (10, 10L)), + CheckNewAnswer((6, 6L, 6, 6L), (8, 8L, 8, 8L), (10, 10L, 10, 10L)), + assertNumStateRows(13, 8), + + AddData(inputStream, (11, 11L), (12, 12L), (13, 13L), (14, 14L), (15, 15L)), + CheckNewAnswer( + Row(12, 12L, 12, 12L), Row(14, 14L, 14, 14L), + Row(null, null, 1, 1L), Row(null, null, 3, 3L)), + assertNumStateRows(15, 7) + ) + } + + test("SPARK-26187 restore the stream-stream outer join query from Spark 2.4") { + val inputStream = MemoryStream[(Int, Long)] + val df = inputStream.toDS() + .select(col("_1").as("value"), col("_2").cast("timestamp").as("timestamp")) + + val leftStream = df.select(col("value").as("leftId"), col("timestamp").as("leftTime")) + + val rightStream = df + // Introduce misses for ease of debugging + .where(col("value") % 2 === 0) + .select(col("value").as("rightId"), col("timestamp").as("rightTime")) + + val query = leftStream + .withWatermark("leftTime", "5 seconds") + .join( + rightStream.withWatermark("rightTime", "5 seconds"), + expr("rightId = leftId AND rightTime >= leftTime AND 
" + + "rightTime <= leftTime + interval 5 seconds"), + joinType = "leftOuter") + .select(col("leftId"), col("leftTime").cast("int"), + col("rightId"), col("rightTime").cast("int")) + + val resourceUri = this.getClass.getResource( + "/structured-streaming/checkpoint-version-2.4.0-streaming-join/").toURI + val checkpointDir = Utils.createTempDir().getCanonicalFile + // Copy the checkpoint to a temp dir to prevent changes to the original. + // Not doing this will lead to the test passing on the first run, but fail subsequent runs. + FileUtils.copyDirectory(new File(resourceUri), checkpointDir) + inputStream.addData((1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)) + + /* + Note: The checkpoint was generated using the following input in Spark version 2.4.0 + AddData(inputStream, (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L)), + // batch 1 - global watermark = 0 + // states + // left: (1, 1L), (2, 2L), (3, 3L), (4, 4L), (5, 5L) + // right: (2, 2L), (4, 4L) + CheckNewAnswer((2, 2L, 2, 2L), (4, 4L, 4, 4L)), + assertNumStateRows(7, 7), + */ + + // we just fail the query if the checkpoint was create from less than Spark 3.0 + val e = intercept[StreamingQueryException] { + val writer = query.writeStream.format("console") + .option("checkpointLocation", checkpointDir.getAbsolutePath).start() + inputStream.addData((7, 7L), (8, 8L)) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + } + throw writer.exception.get + } + assert(e.getMessage.toLowerCase(Locale.ROOT) + .contains("the query is using stream-stream outer join with state format version 1")) + } + + test("SPARK-29438: ensure UNION doesn't lead stream-stream join to use shifted partition IDs") { + def constructUnionDf(desiredPartitionsForInput1: Int) + : (MemoryStream[Int], MemoryStream[Int], MemoryStream[Int], DataFrame) = { + val input1 = MemoryStream[Int](desiredPartitionsForInput1) + val df1 = input1.toDF + .select( + 'value as "key", + 'value as "leftValue", + 'value as "rightValue") + val 
(input2, df2) = setupStream("left", 2) + val (input3, df3) = setupStream("right", 3) + + val joined = df2 + .join(df3, + df2("key") === df3("key") && df2("leftTime") === df3("rightTime"), + "inner") + .select(df2("key"), 'leftValue, 'rightValue) + + (input1, input2, input3, df1.union(joined)) + } + + withTempDir { tempDir => + val (input1, input2, input3, unionDf) = constructUnionDf(2) + + testStream(unionDf)( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + MultiAddData( + (input1, Seq(11, 12, 13)), + (input2, Seq(11, 12, 13, 14, 15)), + (input3, Seq(13, 14, 15, 16, 17))), + CheckNewAnswer(Row(11, 11, 11), Row(12, 12, 12), Row(13, 13, 13), Row(13, 26, 39), + Row(14, 28, 42), Row(15, 30, 45)), + StopStream + ) + + // We're restoring the query with different number of partitions in left side of UNION, + // which leads right side of union to have mismatched partition IDs if it relies on + // TaskContext.partitionId(). SPARK-29438 fixes this issue to not rely on it. + + val (newInput1, newInput2, newInput3, newUnionDf) = constructUnionDf(3) + + newInput1.addData(11, 12, 13) + newInput2.addData(11, 12, 13, 14, 15) + newInput3.addData(13, 14, 15, 16, 17) + + testStream(newUnionDf)( + StartStream(checkpointLocation = tempDir.getAbsolutePath), + MultiAddData( + (newInput1, Seq(21, 22, 23)), + (newInput2, Seq(21, 22, 23, 24, 25)), + (newInput3, Seq(23, 24, 25, 26, 27))), + CheckNewAnswer(Row(21, 21, 21), Row(22, 22, 22), Row(23, 23, 23), Row(23, 46, 69), + Row(24, 48, 72), Row(25, 50, 75)) + ) + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index d96404863a255..9d0f829ac9684 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -23,17 +23,17 @@ import 
scala.collection.mutable import org.scalactic.TolerantNumerics import org.scalatest.BeforeAndAfter -import org.scalatest.PrivateMethodTester._ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.concurrent.Waiters.Waiter import org.apache.spark.SparkException import org.apache.spark.scheduler._ -import org.apache.spark.sql.{Encoder, SparkSession} +import org.apache.spark.sql.{Encoder, Row, SparkSession} +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} import org.apache.spark.sql.streaming.StreamingQueryListener._ +import org.apache.spark.sql.streaming.ui.StreamingQueryStatusListener import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.util.JsonProtocol @@ -47,9 +47,11 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { after { spark.streams.active.foreach(_.stop()) assert(spark.streams.active.isEmpty) - assert(spark.streams.listListeners().isEmpty) + // Skip check default `StreamingQueryStatusListener` which is for streaming UI. 
+ assert(spark.streams.listListeners() + .filterNot(_.isInstanceOf[StreamingQueryStatusListener]).isEmpty) // Make sure we don't leak any events to the next test - spark.sparkContext.listenerBus.waitUntilEmpty(10000) + spark.sparkContext.listenerBus.waitUntilEmpty() } testQuietly("single listener, check trigger events are generated correctly") { @@ -252,8 +254,8 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { assert(newEvent.name === event.name) } - testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, "name")) - testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, null)) + testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, "name", 1L)) + testSerialization(new QueryStartedEvent(UUID.randomUUID, UUID.randomUUID, null, 1L)) } test("QueryProgressEvent serialization") { @@ -320,7 +322,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { q.recentProgress.size > 1 && q.recentProgress.size <= 11 } testStream(input.toDS)(actions: _*) - spark.sparkContext.listenerBus.waitUntilEmpty(10000) + spark.sparkContext.listenerBus.waitUntilEmpty() // 11 is the max value of the possible numbers of events. 
assert(numProgressEvent > 1 && numProgressEvent <= 11) } finally { @@ -343,7 +345,7 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { AddData(mem, 1, 2, 3), CheckAnswer(1, 2, 3) ) - session.sparkContext.listenerBus.waitUntilEmpty(5000) + session.sparkContext.listenerBus.waitUntilEmpty() } def assertEventsCollected(collector: EventCollector): Unit = { @@ -404,6 +406,63 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { testReplayListenerBusWithBorkenEventJsons("query-event-logs-version-2.0.2.txt") } + test("listener propagates observable metrics") { + import org.apache.spark.sql.functions._ + val clock = new StreamManualClock + val inputData = new MemoryStream[Int](0, sqlContext) + val df = inputData.toDF() + .observe( + name = "my_event", + min($"value").as("min_val"), + max($"value").as("max_val"), + sum($"value").as("sum_val"), + count(when($"value" % 2 === 0, 1)).as("num_even")) + .observe( + name = "other_event", + avg($"value").cast("int").as("avg_val")) + val listener = new EventCollector + def checkMetrics(f: java.util.Map[String, Row] => Unit): StreamAction = { + AssertOnQuery { _ => + eventually(Timeout(streamingTimeout)) { + assert(listener.allProgressEvents.nonEmpty) + f(listener.allProgressEvents.last.observedMetrics) + true + } + } + } + + try { + spark.streams.addListener(listener) + testStream(df, OutputMode.Append)( + StartStream(Trigger.ProcessingTime(100), triggerClock = clock), + // Batch 1 + AddData(inputData, 1, 2), + AdvanceManualClock(100), + checkMetrics { metrics => + assert(metrics.get("my_event") === Row(1, 2, 3L, 1L)) + assert(metrics.get("other_event") === Row(1)) + }, + + // Batch 2 + AddData(inputData, 10, 30, -10, 5), + AdvanceManualClock(100), + checkMetrics { metrics => + assert(metrics.get("my_event") === Row(-10, 30, 35L, 3L)) + assert(metrics.get("other_event") === Row(8)) + }, + + // Batch 3 - no data + AdvanceManualClock(100), + checkMetrics { metrics => + 
assert(metrics.isEmpty) + }, + StopStream + ) + } finally { + spark.streams.removeListener(listener) + } + } + private def testReplayListenerBusWithBorkenEventJsons(fileName: String): Unit = { val input = getClass.getResourceAsStream(s"/structured-streaming/$fileName") val events = mutable.ArrayBuffer[SparkListenerEvent]() @@ -454,6 +513,10 @@ class StreamingQueryListenerSuite extends StreamTest with BeforeAndAfter { _progressEvents.filter(_.numInputRows > 0) } + def allProgressEvents: Seq[StreamingQueryProgress] = _progressEvents.synchronized { + _progressEvents.clone() + } + def reset(): Unit = { startEvent = null terminationEvent = null diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenersConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenersConfSuite.scala index 7801d968e901d..d538d93b845b4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenersConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenersConfSuite.scala @@ -39,7 +39,7 @@ class StreamingQueryListenersConfSuite extends StreamTest with BeforeAndAfter { StopStream ) - spark.sparkContext.listenerBus.waitUntilEmpty(5000) + spark.sparkContext.listenerBus.waitUntilEmpty() assert(TestListener.queryStartedEvent != null) assert(TestListener.queryTerminatedEvent != null) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryManagerSuite.scala index b26d2556b2e36..96f7efeef98e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryManagerSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.streaming +import java.io.File import java.util.concurrent.CountDownLatch import scala.concurrent.Future @@ -28,9 +29,10 @@ 
import org.scalatest.time.Span import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkException -import org.apache.spark.sql.Dataset +import org.apache.spark.sql.{Dataset, Encoders} import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.util.BlockingSource import org.apache.spark.util.Utils @@ -242,6 +244,154 @@ class StreamingQueryManagerSuite extends StreamTest { } } + testQuietly("can't start a streaming query with the same name in the same session") { + val ds1 = makeDataset._2 + val ds2 = makeDataset._2 + val queryName = "abc" + + val query1 = ds1.writeStream.format("noop").queryName(queryName).start() + try { + val e = intercept[IllegalArgumentException] { + ds2.writeStream.format("noop").queryName(queryName).start() + } + assert(e.getMessage.contains("query with that name is already active")) + } finally { + query1.stop() + } + } + + testQuietly("can start a streaming query with the same name in a different session") { + val session2 = spark.cloneSession() + + val ds1 = MemoryStream(Encoders.INT, spark.sqlContext).toDS() + val ds2 = MemoryStream(Encoders.INT, session2.sqlContext).toDS() + val queryName = "abc" + + val query1 = ds1.writeStream.format("noop").queryName(queryName).start() + val query2 = ds2.writeStream.format("noop").queryName(queryName).start() + + query1.stop() + query2.stop() + } + + testQuietly("can't start multiple instances of the same streaming query in the same session") { + withSQLConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key -> "false") { + withTempDir { dir => + val (ms1, ds1) = makeDataset + val (ms2, ds2) = makeDataset + val chkLocation = new File(dir, "_checkpoint").getCanonicalPath + val dataLocation = new File(dir, "data").getCanonicalPath + + val query1 = ds1.writeStream.format("parquet") + .option("checkpointLocation", 
chkLocation).start(dataLocation) + ms1.addData(1, 2, 3) + try { + val e = intercept[IllegalStateException] { + ds2.writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + } + assert(e.getMessage.contains("same id")) + } finally { + spark.streams.active.foreach(_.stop()) + } + } + } + } + + testQuietly("new instance of the same streaming query stops old query in the same session") { + failAfter(90 seconds) { + withSQLConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key -> "true") { + withTempDir { dir => + val (ms1, ds1) = makeDataset + val (ms2, ds2) = makeDataset + val chkLocation = new File(dir, "_checkpoint").getCanonicalPath + val dataLocation = new File(dir, "data").getCanonicalPath + + val query1 = ds1.writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + ms1.addData(1, 2, 3) + val query2 = ds2.writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + try { + ms2.addData(1, 2, 3) + query2.processAllAvailable() + assert(spark.sharedState.activeStreamingQueries.get(query2.id) === + query2.asInstanceOf[StreamingQueryWrapper].streamingQuery, + "The correct streaming query is not being tracked in global state") + + assert(!query1.isActive, + "First query should have stopped before starting the second query") + } finally { + spark.streams.active.foreach(_.stop()) + } + } + } + } + } + + testQuietly( + "can't start multiple instances of the same streaming query in the different sessions") { + withSQLConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key -> "false") { + withTempDir { dir => + val session2 = spark.cloneSession() + + val ms1 = MemoryStream(Encoders.INT, spark.sqlContext) + val ds2 = MemoryStream(Encoders.INT, session2.sqlContext).toDS() + val chkLocation = new File(dir, "_checkpoint").getCanonicalPath + val dataLocation = new File(dir, "data").getCanonicalPath + + val query1 = ms1.toDS().writeStream.format("parquet") + 
.option("checkpointLocation", chkLocation).start(dataLocation) + ms1.addData(1, 2, 3) + try { + val e = intercept[IllegalStateException] { + ds2.writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + } + assert(e.getMessage.contains("same id")) + } finally { + spark.streams.active.foreach(_.stop()) + session2.streams.active.foreach(_.stop()) + } + } + } + } + + testQuietly( + "new instance of the same streaming query stops old query in a different session") { + failAfter(90 seconds) { + withSQLConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key -> "true") { + withTempDir { dir => + val session2 = spark.cloneSession() + + val ms1 = MemoryStream(Encoders.INT, spark.sqlContext) + val ds2 = MemoryStream(Encoders.INT, session2.sqlContext).toDS() + val chkLocation = new File(dir, "_checkpoint").getCanonicalPath + val dataLocation = new File(dir, "data").getCanonicalPath + + val query1 = ms1.toDS().writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + ms1.addData(1, 2, 3) + val query2 = ds2.writeStream.format("parquet") + .option("checkpointLocation", chkLocation).start(dataLocation) + try { + ms1.addData(1, 2, 3) + query2.processAllAvailable() + assert(spark.sharedState.activeStreamingQueries.get(query2.id) === + query2.asInstanceOf[StreamingQueryWrapper].streamingQuery, + "The correct streaming execution is not being tracked in global state") + + assert(!query1.isActive, + "First query should have stopped before starting the second query") + } finally { + spark.streams.active.foreach(_.stop()) + session2.streams.active.foreach(_.stop()) + } + } + } + } + } + /** Run a body of code by defining a query on each dataset */ private def withQueriesOn(datasets: Dataset[_]*)(body: Seq[StreamingQuery] => Unit): Unit = { failAfter(streamingTimeout) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala index e784d318b4ffa..6f00b528cb8bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryStatusAndProgressSuite.scala @@ -24,12 +24,18 @@ import scala.collection.JavaConverters._ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.scalatest.concurrent.Eventually +import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.execution.streaming.MemoryStream import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.StreamingQueryStatusAndProgressSuite._ +import org.apache.spark.sql.streaming.StreamingQuerySuite.clock +import org.apache.spark.sql.streaming.util.StreamManualClock +import org.apache.spark.sql.types.StructType class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { test("StreamingQueryProgress - prettyJson") { @@ -74,6 +80,17 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { | "sink" : { | "description" : "sink", | "numOutputRows" : -1 + | }, + | "observedMetrics" : { + | "event1" : { + | "c1" : 1, + | "c2" : 3.0 + | }, + | "event2" : { + | "rc" : 1, + | "min_q" : "hello", + | "max_q" : "world" + | } | } |} """.stripMargin.trim) @@ -107,6 +124,22 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { | "sink" : { | "description" : "sink", | "numOutputRows" : -1 + | }, + | "observedMetrics" : { + | "event_a" : { + | "c1" : null, + | "c2" : -20.7 + | }, + | "event_b1" : { + | "rc" : 33, + | "min_q" : "foo", + | "max_q" : "bar" + | }, + | "event_b2" : { + | "rc" : 200, + | "min_q" : "fzo", + | 
"max_q" : "baz" + | } | } |} """.stripMargin.trim) @@ -215,6 +248,45 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { } } + test("SPARK-29973: Make `processedRowsPerSecond` calculated more accurately and meaningfully") { + import testImplicits._ + + clock = new StreamManualClock + val inputData = MemoryStream[Int] + val query = inputData.toDS() + + testStream(query)( + StartStream(Trigger.ProcessingTime(1000), triggerClock = clock), + AdvanceManualClock(1000), + waitUntilBatchProcessed, + AssertOnQuery(query => { + assert(query.lastProgress.numInputRows == 0) + assert(query.lastProgress.processedRowsPerSecond == 0.0d) + true + }), + AddData(inputData, 1, 2), + AdvanceManualClock(1000), + waitUntilBatchProcessed, + AssertOnQuery(query => { + assert(query.lastProgress.numInputRows == 2) + assert(query.lastProgress.processedRowsPerSecond == 2000d) + true + }), + StopStream + ) + } + + def waitUntilBatchProcessed: AssertOnQuery = Execute { q => + eventually(Timeout(streamingTimeout)) { + if (q.exception.isEmpty) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + } + def assertJson(source: String, expected: String): Unit = { assert( source.replaceAll("\r\n|\r|\n", System.lineSeparator) === @@ -223,12 +295,24 @@ class StreamingQueryStatusAndProgressSuite extends StreamTest with Eventually { } object StreamingQueryStatusAndProgressSuite { + private val schema1 = new StructType() + .add("c1", "long") + .add("c2", "double") + private val schema2 = new StructType() + .add("rc", "long") + .add("min_q", "string") + .add("max_q", "string") + private def row(schema: StructType, elements: Any*): Row = { + new GenericRowWithSchema(elements.toArray, schema) + } + val testProgress1 = new StreamingQueryProgress( id = UUID.randomUUID, runId = UUID.randomUUID, name = "myName", timestamp = "2016-12-05T20:54:20.827Z", batchId = 2L, + batchDuration = 0L, durationMs = new 
java.util.HashMap(Map("total" -> 0L).mapValues(long2Long).asJava), eventTime = new java.util.HashMap(Map( "max" -> "2016-12-05T20:54:20.827Z", @@ -251,7 +335,10 @@ object StreamingQueryStatusAndProgressSuite { processedRowsPerSecond = Double.PositiveInfinity // should not be present in the json ) ), - sink = SinkProgress("sink", None) + sink = SinkProgress("sink", None), + observedMetrics = new java.util.HashMap(Map( + "event1" -> row(schema1, 1L, 3.0d), + "event2" -> row(schema2, 1L, "hello", "world")).asJava) ) val testProgress2 = new StreamingQueryProgress( @@ -260,6 +347,7 @@ object StreamingQueryStatusAndProgressSuite { name = null, // should not be present in the json timestamp = "2016-12-05T20:54:20.827Z", batchId = 2L, + batchDuration = 0L, durationMs = new java.util.HashMap(Map("total" -> 0L).mapValues(long2Long).asJava), // empty maps should be handled correctly eventTime = new java.util.HashMap(Map.empty[String, String].asJava), @@ -275,7 +363,11 @@ object StreamingQueryStatusAndProgressSuite { processedRowsPerSecond = Double.NegativeInfinity // should not be present in the json ) ), - sink = SinkProgress("sink", None) + sink = SinkProgress("sink", None), + observedMetrics = new java.util.HashMap(Map( + "event_a" -> row(schema1, null, -20.7d), + "event_b1" -> row(schema2, 33L, "foo", "bar"), + "event_b2" -> row(schema2, 200L, "fzo", "baz")).asJava) ) val testStatus = new StreamingQueryStatus("active", true, false) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 3ad893f871c94..77f5c856ff0f4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -29,19 +29,19 @@ import org.apache.hadoop.fs.Path import org.scalactic.TolerantNumerics import org.scalatest.BeforeAndAfter import 
org.scalatest.concurrent.PatienceConfiguration.Timeout -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkException, TestUtils} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.expressions.{Literal, Rand, Randn, Shuffle, Uuid} +import org.apache.spark.sql.connector.read.InputPartition +import org.apache.spark.sql.connector.read.streaming.{Offset => OffsetV2} import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.{MemorySink, TestForeachWriter} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.InputPartition -import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2} import org.apache.spark.sql.streaming.util.{BlockingSource, MockSourceProvider, StreamManualClock} import org.apache.spark.sql.types.StructType @@ -123,9 +123,11 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi assert(q3.runId !== q4.runId) // Only one query with same id can be active - val q5 = startQuery(restart = false) - val e = intercept[IllegalStateException] { - startQuery(restart = true) + withSQLConf(SQLConf.STREAMING_STOP_ACTIVE_RUN_ON_RESTART.key -> "false") { + val q5 = startQuery(restart = false) + val e = intercept[IllegalStateException] { + startQuery(restart = true) + } } } } @@ -464,7 +466,8 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi val streamingTriggerDF = spark.createDataset(1 to 10).toDF val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF).toDF("value") - val progress = getFirstProgress(streamingInputDF.join(streamingInputDF, "value")) + val progress = getStreamingQuery(streamingInputDF.join(streamingInputDF, 
"value")) + .recentProgress.head assert(progress.numInputRows === 20) // data is read multiple times in self-joins assert(progress.sources.size === 1) assert(progress.sources(0).numInputRows === 20) @@ -477,7 +480,8 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi // Trigger input has 10 rows, static input has 2 rows, // therefore after the first trigger, the calculated input rows should be 10 - val progress = getFirstProgress(streamingInputDF.join(staticInputDF, "value")) + val progress = getStreamingQuery(streamingInputDF.join(staticInputDF, "value")) + .recentProgress.head assert(progress.numInputRows === 10) assert(progress.sources.size === 1) assert(progress.sources(0).numInputRows === 10) @@ -490,7 +494,7 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF) // After the first trigger, the calculated input rows should be 10 - val progress = getFirstProgress(streamingInputDF) + val progress = getStreamingQuery(streamingInputDF).recentProgress.head assert(progress.numInputRows === 10) assert(progress.sources.size === 1) assert(progress.sources(0).numInputRows === 10) @@ -1118,12 +1122,12 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi StreamingExecutionRelation(source, spark) } - /** Returns the query progress at the end of the first trigger of streaming DF */ - private def getFirstProgress(streamingDF: DataFrame): StreamingQueryProgress = { + /** Returns the query at the end of the first trigger of streaming DF */ + private def getStreamingQuery(streamingDF: DataFrame): StreamingQuery = { try { val q = streamingDF.writeStream.format("memory").queryName("test").start() q.processAllAvailable() - q.recentProgress.head + q } finally { spark.streams.active.map(_.stop()) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala index bad22590807a7..55b884573f647 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousQueuedDataReaderSuite.scala @@ -20,15 +20,15 @@ package org.apache.spark.sql.streaming.continuous import java.util.concurrent.{ArrayBlockingQueue, BlockingQueue} import org.mockito.Mockito._ -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReader, ContinuousStream, PartitionOffset} +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousPartitionReader, ContinuousStream, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite import org.apache.spark.sql.streaming.StreamTest import org.apache.spark.sql.types.{DataType, IntegerType, StructType} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index 5bd75c850fe76..8599ceb833ca4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.streaming.continuous +import java.sql.Timestamp + import org.apache.spark.{SparkContext, SparkException} import 
org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart} import org.apache.spark.sql._ @@ -100,6 +102,21 @@ class ContinuousSuite extends ContinuousSuiteBase { CheckAnswer(0, 1, 2, 3, 4, 5)) } + test("SPARK-29642: basic with various types") { + val input = ContinuousMemoryStream[String] + + testStream(input.toDF())( + AddData(input, "0", "1", "2"), + CheckAnswer("0", "1", "2")) + + val input2 = ContinuousMemoryStream[(String, Timestamp)] + + val timestamp = Timestamp.valueOf("2015-06-11 10:10:10.100") + testStream(input2.toDF())( + AddData(input2, ("0", timestamp), ("1", timestamp)), + CheckAnswer(("0", timestamp), ("1", timestamp))) + } + test("map") { val input = ContinuousMemoryStream[Int] val df = input.toDF().map(_.getInt(0) * 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/EpochCoordinatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/EpochCoordinatorSuite.scala index e3498db4194e8..0e1c9b9c4ba46 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/EpochCoordinatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/EpochCoordinatorSuite.scala @@ -21,16 +21,16 @@ import org.mockito.{ArgumentCaptor, InOrder} import org.mockito.ArgumentMatchers.{any, eq => eqTo} import org.mockito.Mockito._ import org.scalatest.BeforeAndAfterEach -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark._ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.sql.LocalSparkSession +import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, PartitionOffset} +import org.apache.spark.sql.connector.write.WriterCommitMessage +import org.apache.spark.sql.connector.write.streaming.StreamingWrite import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.internal.SQLConf.CONTINUOUS_STREAMING_EPOCH_BACKLOG_QUEUE_SIZE -import 
org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousStream, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage -import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite import org.apache.spark.sql.test.TestSparkSession class EpochCoordinatorSuite diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala index 4db605ee1b238..05cf324f8d490 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/sources/StreamingDataSourceV2Suite.scala @@ -23,16 +23,17 @@ import java.util.Collections import scala.collection.JavaConverters._ import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.connector.catalog.{SessionConfigSupport, SupportsRead, SupportsWrite, Table, TableCapability, TableProvider} +import org.apache.spark.sql.connector.catalog.TableCapability._ +import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan, ScanBuilder} +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReaderFactory, ContinuousStream, MicroBatchStream, Offset, PartitionOffset} +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, PhysicalWriteInfo, WriteBuilder, WriterCommitMessage} +import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{ContinuousTrigger, RateStreamOffset, Sink, StreamingQueryWrapper} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} -import org.apache.spark.sql.sources.v2._ -import 
org.apache.spark.sql.sources.v2.TableCapability._ -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.reader.streaming._ -import org.apache.spark.sql.sources.v2.writer.{WriteBuilder, WriterCommitMessage} -import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamingWrite} import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery, StreamTest, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -68,7 +69,8 @@ class FakeScanBuilder extends ScanBuilder with Scan { class FakeWriteBuilder extends WriteBuilder with StreamingWrite { override def buildForStreaming(): StreamingWrite = this - override def createStreamingWriterFactory(): StreamingDataWriterFactory = { + override def createStreamingWriterFactory( + info: PhysicalWriteInfo): StreamingDataWriterFactory = { throw new IllegalStateException("fake sink - cannot actually write") } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { @@ -85,14 +87,14 @@ trait FakeStreamingWriteTable extends Table with SupportsWrite { override def capabilities(): util.Set[TableCapability] = { Set(STREAMING_WRITE).asJava } - override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = { + override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = { new FakeWriteBuilder } } class FakeReadMicroBatchOnly extends DataSourceRegister - with TableProvider + with SimpleTableProvider with SessionConfigSupport { override def shortName(): String = "fake-read-microbatch-only" @@ -115,7 +117,7 @@ class FakeReadMicroBatchOnly class FakeReadContinuousOnly extends DataSourceRegister - with TableProvider + with SimpleTableProvider with SessionConfigSupport { override def shortName(): String = "fake-read-continuous-only" @@ -136,7 +138,7 @@ class FakeReadContinuousOnly } } -class FakeReadBothModes extends DataSourceRegister with TableProvider { +class 
FakeReadBothModes extends DataSourceRegister with SimpleTableProvider { override def shortName(): String = "fake-read-microbatch-continuous" override def getTable(options: CaseInsensitiveStringMap): Table = { @@ -153,7 +155,7 @@ class FakeReadBothModes extends DataSourceRegister with TableProvider { } } -class FakeReadNeitherMode extends DataSourceRegister with TableProvider { +class FakeReadNeitherMode extends DataSourceRegister with SimpleTableProvider { override def shortName(): String = "fake-read-neither-mode" override def getTable(options: CaseInsensitiveStringMap): Table = { @@ -167,7 +169,7 @@ class FakeReadNeitherMode extends DataSourceRegister with TableProvider { class FakeWriteOnly extends DataSourceRegister - with TableProvider + with SimpleTableProvider with SessionConfigSupport { override def shortName(): String = "fake-write-microbatch-continuous" @@ -182,7 +184,7 @@ class FakeWriteOnly } } -class FakeNoWrite extends DataSourceRegister with TableProvider { +class FakeNoWrite extends DataSourceRegister with SimpleTableProvider { override def shortName(): String = "fake-write-neither-mode" override def getTable(options: CaseInsensitiveStringMap): Table = { new Table { @@ -200,7 +202,7 @@ class FakeSink extends Sink { } class FakeWriteSupportProviderV1Fallback extends DataSourceRegister - with TableProvider with StreamSinkProvider { + with SimpleTableProvider with StreamSinkProvider { override def createSink( sqlContext: SQLContext, @@ -377,10 +379,10 @@ class StreamingDataSourceV2Suite extends StreamTest { for ((read, write, trigger) <- cases) { testQuietly(s"stream with read format $read, write format $write, trigger $trigger") { val sourceTable = DataSource.lookupDataSource(read, spark.sqlContext.conf).getConstructor() - .newInstance().asInstanceOf[TableProvider].getTable(CaseInsensitiveStringMap.empty()) + .newInstance().asInstanceOf[SimpleTableProvider].getTable(CaseInsensitiveStringMap.empty()) val sinkTable = DataSource.lookupDataSource(write, 
spark.sqlContext.conf).getConstructor() - .newInstance().asInstanceOf[TableProvider].getTable(CaseInsensitiveStringMap.empty()) + .newInstance().asInstanceOf[SimpleTableProvider].getTable(CaseInsensitiveStringMap.empty()) import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._ trigger match { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index c630f1497a17e..f9fc540c2ab80 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -92,7 +92,7 @@ class DefaultSource extends StreamSourceProvider with StreamSinkProvider { spark.internalCreateDataFrame(spark.sparkContext.emptyRDD, schema, isStreaming = true) } - override def stop() {} + override def stop(): Unit = {} } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala new file mode 100644 index 0000000000000..de43e470e8e13 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryPageSuite.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.ui + +import java.util.{Locale, UUID} +import javax.servlet.http.HttpServletRequest + +import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} +import org.scalatest.BeforeAndAfter +import scala.xml.Node + +import org.apache.spark.sql.streaming.StreamingQueryProgress +import org.apache.spark.sql.test.SharedSparkSession + +class StreamingQueryPageSuite extends SharedSparkSession with BeforeAndAfter { + + test("correctly display streaming query page") { + val id = UUID.randomUUID() + val request = mock(classOf[HttpServletRequest]) + val tab = mock(classOf[StreamingQueryTab], RETURNS_SMART_NULLS) + val statusListener = mock(classOf[StreamingQueryStatusListener], RETURNS_SMART_NULLS) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + when(tab.statusListener).thenReturn(statusListener) + + val streamQuery = createStreamQueryUIData(id) + when(statusListener.allQueryStatus).thenReturn(Seq(streamQuery)) + var html = renderStreamingQueryPage(request, tab) + .toString().toLowerCase(Locale.ROOT) + assert(html.contains("active streaming queries (1)")) + assert(html.contains("completed streaming queries (0)")) + + when(streamQuery.isActive).thenReturn(false) + when(streamQuery.exception).thenReturn(None) + html = renderStreamingQueryPage(request, tab) + .toString().toLowerCase(Locale.ROOT) + assert(html.contains("active streaming queries (0)")) + assert(html.contains("completed streaming queries (1)")) + assert(html.contains("finished")) + + 
when(streamQuery.isActive).thenReturn(false) + when(streamQuery.exception).thenReturn(Option("exception in query")) + html = renderStreamingQueryPage(request, tab) + .toString().toLowerCase(Locale.ROOT) + assert(html.contains("active streaming queries (0)")) + assert(html.contains("completed streaming queries (1)")) + assert(html.contains("failed")) + assert(html.contains("exception in query")) + } + + test("correctly display streaming query statistics page") { + val id = UUID.randomUUID() + val request = mock(classOf[HttpServletRequest]) + val tab = mock(classOf[StreamingQueryTab], RETURNS_SMART_NULLS) + val statusListener = mock(classOf[StreamingQueryStatusListener], RETURNS_SMART_NULLS) + when(request.getParameter("id")).thenReturn(id.toString) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + when(tab.statusListener).thenReturn(statusListener) + + val streamQuery = createStreamQueryUIData(id) + when(statusListener.allQueryStatus).thenReturn(Seq(streamQuery)) + val html = renderStreamingQueryStatisticsPage(request, tab) + .toString().toLowerCase(Locale.ROOT) + + assert(html.contains("name: query<")) + assert(html.contains("""{"x": 1001898000100, "y": 10.0}""")) + assert(html.contains("""{"x": 1001898000100, "y": 12.0}""")) + assert(html.contains("(3 completed batches)")) + } + + private def createStreamQueryUIData(id: UUID): StreamingQueryUIData = { + val progress = mock(classOf[StreamingQueryProgress], RETURNS_SMART_NULLS) + when(progress.timestamp).thenReturn("2001-10-01T01:00:00.100Z") + when(progress.inputRowsPerSecond).thenReturn(10.0) + when(progress.processedRowsPerSecond).thenReturn(12.0) + when(progress.batchId).thenReturn(2) + when(progress.prettyJson).thenReturn("""{"a":1}""") + + val streamQuery = mock(classOf[StreamingQueryUIData], RETURNS_SMART_NULLS) + when(streamQuery.isActive).thenReturn(true) + when(streamQuery.name).thenReturn("query") + when(streamQuery.id).thenReturn(id) + 
when(streamQuery.runId).thenReturn(id) + when(streamQuery.submissionTime).thenReturn(1L) + when(streamQuery.lastProgress).thenReturn(progress) + when(streamQuery.recentProgress).thenReturn(Array(progress)) + when(streamQuery.exception).thenReturn(None) + + streamQuery + } + + /** + * Render a stage page started with the given conf and return the HTML. + * This also runs a dummy execution page to populate the page with useful content. + */ + private def renderStreamingQueryPage( + request: HttpServletRequest, + tab: StreamingQueryTab): Seq[Node] = { + val page = new StreamingQueryPage(tab) + page.render(request) + } + + private def renderStreamingQueryStatisticsPage( + request: HttpServletRequest, + tab: StreamingQueryTab): Seq[Node] = { + val page = new StreamingQueryStatisticsPage(tab) + page.render(request) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala new file mode 100644 index 0000000000000..adbb501f9842e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/StreamingQueryStatusListenerSuite.scala @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.ui + +import java.util.UUID + +import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} + +import org.apache.spark.sql.streaming.{StreamingQueryListener, StreamingQueryProgress, StreamTest} +import org.apache.spark.sql.streaming + +class StreamingQueryStatusListenerSuite extends StreamTest { + + test("onQueryStarted, onQueryProgress, onQueryTerminated") { + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) + + // hanlde query started event + val id = UUID.randomUUID() + val runId = UUID.randomUUID() + val startEvent = new StreamingQueryListener.QueryStartedEvent(id, runId, "test", 1L) + listener.onQueryStarted(startEvent) + + // result checking + assert(listener.activeQueryStatus.size() == 1) + assert(listener.activeQueryStatus.get(runId).name == "test") + + // handle query progress event + val progress = mock(classOf[StreamingQueryProgress], RETURNS_SMART_NULLS) + when(progress.id).thenReturn(id) + when(progress.runId).thenReturn(runId) + when(progress.timestamp).thenReturn("2001-10-01T01:00:00.100Z") + when(progress.inputRowsPerSecond).thenReturn(10.0) + when(progress.processedRowsPerSecond).thenReturn(12.0) + when(progress.batchId).thenReturn(2) + when(progress.prettyJson).thenReturn("""{"a":1}""") + val processEvent = new streaming.StreamingQueryListener.QueryProgressEvent(progress) + listener.onQueryProgress(processEvent) + + // result checking + val activeQuery = listener.activeQueryStatus.get(runId) + assert(activeQuery.isActive) + assert(activeQuery.recentProgress.length == 1) + assert(activeQuery.lastProgress.id == id) + assert(activeQuery.lastProgress.runId == runId) + assert(activeQuery.lastProgress.timestamp == "2001-10-01T01:00:00.100Z") + assert(activeQuery.lastProgress.inputRowsPerSecond == 10.0) + assert(activeQuery.lastProgress.processedRowsPerSecond == 12.0) + 
assert(activeQuery.lastProgress.batchId == 2) + assert(activeQuery.lastProgress.prettyJson == """{"a":1}""") + + // handle terminate event + val terminateEvent = new StreamingQueryListener.QueryTerminatedEvent(id, runId, None) + listener.onQueryTerminated(terminateEvent) + + assert(!listener.inactiveQueryStatus.head.isActive) + assert(listener.inactiveQueryStatus.head.runId == runId) + assert(listener.inactiveQueryStatus.head.id == id) + } + + test("same query start multiple times") { + val listener = new StreamingQueryStatusListener(spark.sparkContext.conf) + + // handle first time start + val id = UUID.randomUUID() + val runId0 = UUID.randomUUID() + val startEvent0 = new StreamingQueryListener.QueryStartedEvent(id, runId0, "test", 1L) + listener.onQueryStarted(startEvent0) + + // handle terminate event + val terminateEvent0 = new StreamingQueryListener.QueryTerminatedEvent(id, runId0, None) + listener.onQueryTerminated(terminateEvent0) + + // handle second time start + val runId1 = UUID.randomUUID() + val startEvent1 = new StreamingQueryListener.QueryStartedEvent(id, runId1, "test", 1L) + listener.onQueryStarted(startEvent1) + + // result checking + assert(listener.activeQueryStatus.size() == 1) + assert(listener.inactiveQueryStatus.length == 1) + assert(listener.activeQueryStatus.containsKey(runId1)) + assert(listener.activeQueryStatus.get(runId1).id == id) + assert(listener.inactiveQueryStatus.head.runId == runId0) + assert(listener.inactiveQueryStatus.head.id == id) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UIUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UIUtilsSuite.scala new file mode 100644 index 0000000000000..46f2eadc05835 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/ui/UIUtilsSuite.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.ui + +import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} +import org.scalatest.Matchers + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.streaming.StreamingQueryProgress + +class UIUtilsSuite extends SparkFunSuite with Matchers { + test("streaming query started with no batch completed") { + val query = mock(classOf[StreamingQueryUIData], RETURNS_SMART_NULLS) + when(query.lastProgress).thenReturn(null) + + assert(0 == UIUtils.withNoProgress(query, 1, 0)) + } + + test("streaming query started with at least one batch completed") { + val query = mock(classOf[StreamingQueryUIData], RETURNS_SMART_NULLS) + val progress = mock(classOf[StreamingQueryProgress], RETURNS_SMART_NULLS) + when(query.lastProgress).thenReturn(progress) + + assert(1 == UIUtils.withNoProgress(query, 1, 0)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockOnStopSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockOnStopSource.scala new file mode 100644 index 0000000000000..c594a8523d15e --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockOnStopSource.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more 
+ * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.util + +import java.util +import java.util.concurrent.CountDownLatch + +import scala.collection.JavaConverters._ + +import org.apache.zookeeper.KeeperException.UnimplementedException + +import org.apache.spark.sql.{DataFrame, Row, SparkSession, SQLContext} +import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability} +import org.apache.spark.sql.connector.catalog.TableCapability.CONTINUOUS_READ +import org.apache.spark.sql.connector.read.{streaming, InputPartition, Scan, ScanBuilder} +import org.apache.spark.sql.connector.read.streaming.{ContinuousPartitionReaderFactory, ContinuousStream, PartitionOffset} +import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Source} +import org.apache.spark.sql.internal.connector.SimpleTableProvider +import org.apache.spark.sql.sources.StreamSourceProvider +import org.apache.spark.sql.types.{LongType, StructType} +import org.apache.spark.sql.util.CaseInsensitiveStringMap + +/** The V1 and V2 provider of a streaming source, which blocks indefinitely on the call of stop() */ +object BlockOnStopSourceProvider { + private var _latch: CountDownLatch = _ + val schema: StructType = new StructType().add("id", LongType) + + /** Set the 
latch that we will use to block the streaming query thread. */ + def enableBlocking(): Unit = { + if (_latch == null || _latch.getCount == 0) { + _latch = new CountDownLatch(1) + } + } + + def disableBlocking(): Unit = { + if (_latch != null) { + _latch.countDown() + _latch = null + } + } +} + +class BlockOnStopSourceProvider extends StreamSourceProvider with SimpleTableProvider { + override def getTable(options: CaseInsensitiveStringMap): Table = { + new BlockOnStopSourceTable(BlockOnStopSourceProvider._latch) + } + + override def sourceSchema( + sqlContext: SQLContext, + schema: Option[StructType], + providerName: String, + parameters: Map[String, String]): (String, StructType) = { + "blockingSource" -> BlockOnStopSourceProvider.schema + } + + override def createSource( + sqlContext: SQLContext, + metadataPath: String, + schema: Option[StructType], + providerName: String, + parameters: Map[String, String]): Source = { + new BlockOnStopSource(sqlContext.sparkSession, BlockOnStopSourceProvider._latch) + } +} + +/** A V1 Streaming Source which blocks on stop(). It does not produce any data. */ +class BlockOnStopSource(spark: SparkSession, latch: CountDownLatch) extends Source { + // Blocks until latch countdowns + override def stop(): Unit = latch.await() + + // Boiler-plate + override val schema: StructType = BlockOnStopSourceProvider.schema + override def getOffset: Option[Offset] = Some(LongOffset(0)) + override def getBatch(start: Option[Offset], end: Offset): DataFrame = { + spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema) + } +} + +/** A V2 Table, which can create a blocking streaming source for ContinuousExecution. 
*/ +class BlockOnStopSourceTable(latch: CountDownLatch) extends Table with SupportsRead { + override def schema(): StructType = BlockOnStopSourceProvider.schema + + override def name(): String = "blockingSource" + + override def capabilities(): util.Set[TableCapability] = Set(CONTINUOUS_READ).asJava + + override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = { + new ScanBuilder { + override def build(): Scan = new Scan { + override def readSchema(): StructType = schema() + + override def toContinuousStream(checkpointLocation: String): ContinuousStream = { + new BlockOnStopContinuousStream(latch) + } + } + } + } +} + +/** + * A V2 Streaming Source which blocks on stop(). It does not produce any data. We use this for + * testing stopping in ContinuousExecution. + */ +class BlockOnStopContinuousStream(latch: CountDownLatch) extends ContinuousStream { + + // Blocks until latch countdowns + override def stop(): Unit = latch.await() + + // Boiler-plate + override def planInputPartitions(start: streaming.Offset): Array[InputPartition] = Array.empty + override def mergeOffsets(offsets: Array[PartitionOffset]): streaming.Offset = LongOffset(0L) + override def deserializeOffset(json: String): streaming.Offset = LongOffset(0L) + override def initialOffset(): Offset = LongOffset(0) + override def commit(end: streaming.Offset): Unit = {} + override def createContinuousReaderFactory(): ContinuousPartitionReaderFactory = { + throw new UnimplementedException + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala index 67158fb99d13d..c1b29b5130e86 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala @@ -52,7 +52,7 @@ class BlockingSource extends StreamSourceProvider with StreamSinkProvider { import 
spark.implicits._ Seq[Int]().toDS().toDF() } - override def stop() {} + override def stop(): Unit = {} } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index b98626a34cc29..fb939007697c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -234,6 +234,21 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with assert(DataSourceUtils.decodePartitioningColumns(partColumns) === Seq("col1", "col2")) } + test ("SPARK-29537: throw exception when user defined a wrong base path") { + withTempPath { p => + val path = new Path(p.toURI).toString + Seq((1, 1), (2, 2)).toDF("c1", "c2") + .write.partitionBy("c1").mode(SaveMode.Overwrite).parquet(path) + val wrongBasePath = new File(p, "unknown") + // basePath must be a directory + wrongBasePath.mkdir() + val msg = intercept[IllegalArgumentException] { + spark.read.option("basePath", wrongBasePath.getCanonicalPath).parquet(path) + }.getMessage + assert(msg === s"Wrong basePath ${wrongBasePath.getCanonicalPath} for the root path: $path") + } + } + test("save mode") { spark.range(10).write .format("org.apache.spark.sql.test") @@ -277,7 +292,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with .format(classOf[NoopDataSource].getName) .mode(SaveMode.Append) .save() - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(plan.isInstanceOf[AppendData]) // overwrite mode creates `OverwriteByExpression` @@ -285,22 +300,24 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with .format(classOf[NoopDataSource].getName) .mode(SaveMode.Overwrite) .save() - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() 
assert(plan.isInstanceOf[OverwriteByExpression]) // By default the save mode is `ErrorIfExists` for data source v2. - spark.range(10).write - .format(classOf[NoopDataSource].getName) - .save() - sparkContext.listenerBus.waitUntilEmpty(1000) - assert(plan.isInstanceOf[AppendData]) + val e = intercept[AnalysisException] { + spark.range(10).write + .format(classOf[NoopDataSource].getName) + .save() + } + assert(e.getMessage.contains("ErrorIfExists")) - spark.range(10).write - .format(classOf[NoopDataSource].getName) - .mode("default") - .save() - sparkContext.listenerBus.waitUntilEmpty(1000) - assert(plan.isInstanceOf[AppendData]) + val e2 = intercept[AnalysisException] { + spark.range(10).write + .format(classOf[NoopDataSource].getName) + .mode("default") + .save() + } + assert(e2.getMessage.contains("ErrorIfExists")) } finally { spark.listenerManager.unregister(listener) } @@ -472,11 +489,10 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with // when users do not specify the schema checkAnswer(dfReader.load(), spark.range(1, 11).toDF()) - // when users specify the schema + // when users specify a wrong schema val inputSchema = new StructType().add("s", IntegerType, nullable = false) val e = intercept[AnalysisException] { dfReader.schema(inputSchema).load() } - assert(e.getMessage.contains( - "org.apache.spark.sql.sources.SimpleScanSource does not allow user-specified schemas")) + assert(e.getMessage.contains("The user-specified schema doesn't match the actual schema")) } test("read a data source that does not extend RelationProvider") { @@ -1058,7 +1074,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSparkSession with checkDatasetUnorderly( spark.read.parquet(dir.getCanonicalPath).as[(Long, Long)], 0L -> 0L, 1L -> 1L, 2L -> 2L) - sparkContext.listenerBus.waitUntilEmpty(10000) + sparkContext.listenerBus.waitUntilEmpty() assert(jobDescriptions.asScala.toList.exists( _.contains("Listing leaf files and directories for 3 
paths"))) } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala index 615923fe02d6c..c51faaf10f5dd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala @@ -21,6 +21,7 @@ import java.nio.charset.StandardCharsets import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession, SQLContext, SQLImplicits} +import org.apache.spark.unsafe.types.CalendarInterval /** * A collection of sample data used in SQL tests. @@ -168,6 +169,13 @@ private[sql] trait SQLTestData { self => rdd } + protected lazy val calenderIntervalData: RDD[IntervalData] = { + val rdd = spark.sparkContext.parallelize( + IntervalData(new CalendarInterval(1, 1, 1)) :: Nil) + rdd.toDF().createOrReplaceTempView("calenderIntervalData") + rdd + } + protected lazy val repeatedData: RDD[StringData] = { val rdd = spark.sparkContext.parallelize(List.fill(2)(StringData("test"))) rdd.toDF().createOrReplaceTempView("repeatedData") @@ -335,4 +343,5 @@ private[sql] object SQLTestData { case class ComplexData(m: Map[String, Int], s: TestData, a: Seq[Int], b: Boolean) case class CourseSales(course: String, year: Int, earnings: Double) case class TrainingSales(training: String, sales: CourseSales) + case class IntervalData(data: CalendarInterval) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 115536da8949e..38893f846e5a4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -363,6 +363,19 @@ private[sql] trait SQLTestUtilsBase } } + /** + * Drops namespace `namespace` after calling `f`. 
+ * + * Note that, if you switch current catalog/namespace in `f`, you should switch it back manually. + */ + protected def withNamespace(namespaces: String*)(f: => Unit): Unit = { + Utils.tryWithSafeFinally(f) { + namespaces.foreach { name => + spark.sql(s"DROP NAMESPACE IF EXISTS $name CASCADE") + } + } + } + /** * Enables Locale `language` before executing `f`, then switches back to the default locale of JVM * after `f` returns. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala index a8e1a44f3d5d2..6881812286b24 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala @@ -20,15 +20,19 @@ package org.apache.spark.sql.util import scala.collection.mutable.ArrayBuffer import org.apache.spark._ -import org.apache.spark.sql.{functions, AnalysisException, QueryTest} +import org.apache.spark.sql.{functions, AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, InsertIntoTable, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, InsertIntoStatement, LogicalPlan, Project} import org.apache.spark.sql.execution.{QueryExecution, WholeStageCodegenExec} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.datasources.{CreateTable, InsertIntoHadoopFsRelationCommand} import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession -class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { +class DataFrameCallbackSuite extends QueryTest + with SharedSparkSession + with AdaptiveSparkPlanHelper { import testImplicits._ import 
functions._ @@ -48,7 +52,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { df.select("i").collect() df.filter($"i" > 0).count() - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(metrics.length == 2) assert(metrics(0)._1 == "collect") @@ -79,7 +83,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { val e = intercept[SparkException](df.select(errorUdf($"i")).collect()) - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(metrics.length == 1) assert(metrics(0)._1 == "collect") assert(metrics(0)._2.analyzed.isInstanceOf[Project]) @@ -95,7 +99,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { - val metric = qe.executedPlan match { + val metric = stripAQEPlan(qe.executedPlan) match { case w: WholeStageCodegenExec => w.child.longMetric("numOutputRows") case other => other.longMetric("numOutputRows") } @@ -109,12 +113,12 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { df.collect() // Wait for the first `collect` to be caught by our listener. Otherwise the next `collect` will // reset the plan metrics. - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() df.collect() Seq(1 -> "a", 2 -> "a").toDF("i", "j").groupBy("i").count().collect() - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(metrics.length == 3) assert(metrics(0) === 1) assert(metrics(1) === 1) @@ -162,7 +166,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { // For this simple case, the peakExecutionMemory of a stage should be the data size of the // aggregate operator, as we only have one memory consuming operator per stage. 
- sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(metrics.length == 2) assert(metrics(0) == topAggDataSize) assert(metrics(1) == bottomAggDataSize) @@ -186,7 +190,7 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { withTempPath { path => spark.range(10).write.format("json").save(path.getCanonicalPath) - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(commands.length == 1) assert(commands.head._1 == "save") assert(commands.head._2.isInstanceOf[InsertIntoHadoopFsRelationCommand]) @@ -197,18 +201,18 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { withTable("tab") { sql("CREATE TABLE tab(i long) using parquet") // adds commands(1) via onSuccess spark.range(10).write.insertInto("tab") - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(commands.length == 3) assert(commands(2)._1 == "insertInto") - assert(commands(2)._2.isInstanceOf[InsertIntoTable]) - assert(commands(2)._2.asInstanceOf[InsertIntoTable].table + assert(commands(2)._2.isInstanceOf[InsertIntoStatement]) + assert(commands(2)._2.asInstanceOf[InsertIntoStatement].table .asInstanceOf[UnresolvedRelation].multipartIdentifier == Seq("tab")) } // exiting withTable adds commands(3) via onSuccess (drops tab) withTable("tab") { spark.range(10).select($"id", $"id" % 5 as "p").write.partitionBy("p").saveAsTable("tab") - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() assert(commands.length == 5) assert(commands(4)._1 == "saveAsTable") assert(commands(4)._2.isInstanceOf[CreateTable]) @@ -220,10 +224,58 @@ class DataFrameCallbackSuite extends QueryTest with SharedSparkSession { val e = intercept[AnalysisException] { spark.range(10).select($"id", $"id").write.insertInto("tab") } - sparkContext.listenerBus.waitUntilEmpty(1000) + sparkContext.listenerBus.waitUntilEmpty() 
assert(errors.length == 1) assert(errors.head._1 == "insertInto") assert(errors.head._2 == e) } } + + test("get observable metrics by callback") { + val metricMaps = ArrayBuffer.empty[Map[String, Row]] + val listener = new QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { + metricMaps += qe.observedMetrics + } + + override def onFailure(funcName: String, qe: QueryExecution, exception: Throwable): Unit = { + // No-op + } + } + spark.listenerManager.register(listener) + try { + val df = spark.range(100) + .observe( + name = "my_event", + min($"id").as("min_val"), + max($"id").as("max_val"), + sum($"id").as("sum_val"), + count(when($"id" % 2 === 0, 1)).as("num_even")) + .observe( + name = "other_event", + avg($"id").cast("int").as("avg_val")) + + def checkMetrics(metrics: Map[String, Row]): Unit = { + assert(metrics.size === 2) + assert(metrics("my_event") === Row(0L, 99L, 4950L, 50L)) + assert(metrics("other_event") === Row(49)) + } + + // First run + df.collect() + sparkContext.listenerBus.waitUntilEmpty() + assert(metricMaps.size === 1) + checkMetrics(metricMaps.head) + metricMaps.clear() + + // Second run should produce the same result as the first run. 
+ df.collect() + sparkContext.listenerBus.waitUntilEmpty() + assert(metricMaps.size === 1) + checkMetrics(metricMaps.head) + + } finally { + spark.listenerManager.unregister(listener) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala index 79819e7655414..2fd6cb220ea3f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/ExecutionListenerManagerSuite.scala @@ -34,13 +34,13 @@ class ExecutionListenerManagerSuite extends SparkFunSuite with LocalSparkSession spark = SparkSession.builder().master("local").appName("test").config(conf).getOrCreate() spark.sql("select 1").collect() - spark.sparkContext.listenerBus.waitUntilEmpty(1000) + spark.sparkContext.listenerBus.waitUntilEmpty() assert(INSTANCE_COUNT.get() === 1) assert(CALLBACK_COUNT.get() === 1) val cloned = spark.cloneSession() cloned.sql("select 1").collect() - spark.sparkContext.listenerBus.waitUntilEmpty(1000) + spark.sparkContext.listenerBus.waitUntilEmpty() assert(INSTANCE_COUNT.get() === 1) assert(CALLBACK_COUNT.get() === 2) } diff --git a/sql/core/v1.2.1/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java similarity index 100% rename from sql/core/v1.2.1/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java rename to sql/core/v1.2/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java diff --git a/sql/core/v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala similarity index 100% rename from 
sql/core/v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala rename to sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala diff --git a/sql/core/v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala similarity index 100% rename from sql/core/v1.2.1/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala rename to sql/core/v1.2/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala diff --git a/sql/core/v1.2.1/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala similarity index 51% rename from sql/core/v1.2.1/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala rename to sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index b1a907f9cba27..ee5162bced8ac 100644 --- a/sql/core/v1.2.1/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/v1.2/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -25,13 +25,14 @@ import scala.collection.JavaConverters._ import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument} +import org.apache.spark.SparkConf import org.apache.spark.sql.{AnalysisException, Column, DataFrame} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.datasources.v2.orc.OrcTable 
+import org.apache.spark.sql.execution.datasources.v2.DataSourceV2ScanRelation +import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -44,6 +45,11 @@ import org.apache.spark.sql.types._ */ class OrcFilterSuite extends OrcTest with SharedSparkSession { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") + protected def checkFilterPredicate( df: DataFrame, predicate: Predicate, @@ -54,15 +60,11 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { .where(Column(predicate)) query.queryExecution.optimizedPlan match { - case PhysicalOperation(_, filters, - DataSourceV2Relation(orcTable: OrcTable, _, options)) => + case PhysicalOperation(_, filters, DataSourceV2ScanRelation(_, o: OrcScan, _)) => assert(filters.nonEmpty, "No filter is analyzed from the given query") - val scanBuilder = orcTable.newScanBuilder(options) - scanBuilder.pushFilters(filters.flatMap(DataSourceStrategy.translateFilter).toArray) - val pushedFilters = scanBuilder.pushedFilters() - assert(pushedFilters.nonEmpty, "No filter is pushed down") - val maybeFilter = OrcFilters.createFilter(query.schema, pushedFilters) - assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pushedFilters") + assert(o.pushedFilters.nonEmpty, "No filter is pushed down") + val maybeFilter = OrcFilters.createFilter(query.schema, o.pushedFilters) + assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for ${o.pushedFilters}") checker(maybeFilter.get) case _ => @@ -91,154 +93,154 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { test("filter pushdown - integer") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - 
checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - long") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => - checkFilterPredicate('_1.isNull, 
PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - float") { withOrcDataFrame((1 to 4).map(i => 
Tuple1(Option(i.toFloat)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", 
PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - double") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", 
PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - string") { withOrcDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === "1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < "2", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= "4", PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal("1") === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal("1") <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal("2") > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal("3") < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("1") >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("4") <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === "1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < "2", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= "4", PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal("1") === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal("1") <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal("2") > $"_1", PredicateLeaf.Operator.LESS_THAN) + 
checkFilterPredicate(Literal("3") < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("1") >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("4") <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - boolean") { withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === true, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < true, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= false, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(false) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(false) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(false) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(true) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === true, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < true, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= false, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(false) === $"_1", PredicateLeaf.Operator.EQUALS) 
+ checkFilterPredicate(Literal(false) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(false) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(true) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - decimal") { withOrcDataFrame((1 to 4).map(i => Tuple1.apply(BigDecimal.valueOf(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - checkFilterPredicate('_1 === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate($"_1" === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate('_1 < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) === '_1, PredicateLeaf.Operator.EQUALS) + Literal(BigDecimal.valueOf(1)) === $"_1", PredicateLeaf.Operator.EQUALS) checkFilterPredicate( - 
Literal(BigDecimal.valueOf(1)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + Literal(BigDecimal.valueOf(1)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(2)) > '_1, PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(2)) > $"_1", PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(3)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(3)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(1)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(4)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(4)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -249,46 +251,47 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { new Timestamp(milliseconds) } withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === timestamps(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < timestamps(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(timestamps(0)) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(timestamps(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(timestamps(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(timestamps(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - 
checkFilterPredicate(Literal(timestamps(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(timestamps(0)) <=> $"_1", + PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(timestamps(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - combinations with logical operators") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate( - '_1.isNotNull, + $"_1".isNotNull, "leaf-0 = (IS_NULL _1), expr = (not leaf-0)" ) checkFilterPredicate( - '_1 =!= 1, + $"_1" =!= 1, "leaf-0 = (IS_NULL _1), leaf-1 = (EQUALS _1 1), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( - !('_1 < 4), + !($"_1" < 4), "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 4), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( - '_1 < 2 || '_1 > 3, + $"_1" < 2 || $"_1" > 3, "leaf-0 = (LESS_THAN _1 2), leaf-1 = (LESS_THAN_EQUALS _1 3), " + "expr = (or 
leaf-0 (not leaf-1))" ) checkFilterPredicate( - '_1 < 2 && '_1 > 3, + $"_1" < 2 && $"_1" > 3, "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 2), leaf-2 = (LESS_THAN_EQUALS _1 3), " + "expr = (and (not leaf-0) leaf-1 (not leaf-2))" ) @@ -300,22 +303,22 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { Date.valueOf(day) } withOrcDataFrame(dates.map(Tuple1(_))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === dates(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < dates(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= dates(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(dates(0)) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(dates(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(dates(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(dates(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) + + 
checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -325,15 +328,15 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } // ArrayType withOrcDataFrame((1 to 4).map(i => Tuple1(Array(i)))) { implicit df => - checkNoFilterPredicate('_1.isNull, noneSupported = true) + checkNoFilterPredicate($"_1".isNull, noneSupported = true) } // BinaryType withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => - checkNoFilterPredicate('_1 <=> 1.b, noneSupported = true) + checkNoFilterPredicate($"_1" <=> 1.b, noneSupported = true) } // MapType withOrcDataFrame((1 to 4).map(i => Tuple1(Map(i -> i)))) { implicit df => - checkNoFilterPredicate('_1.isNotNull, noneSupported = true) + checkNoFilterPredicate($"_1".isNotNull, noneSupported = true) } } diff --git a/sql/core/v2.3.5/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java similarity index 100% rename from sql/core/v2.3.5/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java rename to sql/core/v2.3/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java diff --git a/sql/core/v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala similarity index 100% rename from 
sql/core/v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala rename to sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala diff --git a/sql/core/v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala b/sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala similarity index 100% rename from sql/core/v2.3.5/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala rename to sql/core/v2.3/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcShimUtils.scala diff --git a/sql/core/v2.3.5/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala similarity index 51% rename from sql/core/v2.3.5/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala rename to sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index 65b0537a0a8c1..1baa69e82bb18 100644 --- a/sql/core/v2.3.5/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/v2.3/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -25,13 +25,15 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} +import org.apache.spark.SparkConf import org.apache.spark.sql.{AnalysisException, Column, DataFrame} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.datasources.v2.orc.OrcTable 
+import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation} +import org.apache.spark.sql.execution.datasources.v2.orc.{OrcScan, OrcTable} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types._ @@ -44,6 +46,11 @@ import org.apache.spark.sql.types._ */ class OrcFilterSuite extends OrcTest with SharedSparkSession { + override protected def sparkConf: SparkConf = + super + .sparkConf + .set(SQLConf.USE_V1_SOURCE_LIST, "") + protected def checkFilterPredicate( df: DataFrame, predicate: Predicate, @@ -54,15 +61,11 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { .where(Column(predicate)) query.queryExecution.optimizedPlan match { - case PhysicalOperation(_, filters, - DataSourceV2Relation(orcTable: OrcTable, _, options)) => + case PhysicalOperation(_, filters, DataSourceV2ScanRelation(_, o: OrcScan, _)) => assert(filters.nonEmpty, "No filter is analyzed from the given query") - val scanBuilder = orcTable.newScanBuilder(options) - scanBuilder.pushFilters(filters.flatMap(DataSourceStrategy.translateFilter).toArray) - val pushedFilters = scanBuilder.pushedFilters() - assert(pushedFilters.nonEmpty, "No filter is pushed down") - val maybeFilter = OrcFilters.createFilter(query.schema, pushedFilters) - assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $pushedFilters") + assert(o.pushedFilters.nonEmpty, "No filter is pushed down") + val maybeFilter = OrcFilters.createFilter(query.schema, o.pushedFilters) + assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for ${o.pushedFilters}") checker(maybeFilter.get) case _ => @@ -91,154 +94,154 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { test("filter pushdown - integer") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, 
PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - long") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => - 
checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - float") { 
withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= 
$"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - double") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= 
$"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - string") { withOrcDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === "1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < "2", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= "4", PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal("1") === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal("1") <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal("2") > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal("3") < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("1") >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("4") <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === "1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < "2", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= "4", PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal("1") === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal("1") <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal("2") > $"_1", PredicateLeaf.Operator.LESS_THAN) + 
checkFilterPredicate(Literal("3") < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("1") >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("4") <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - boolean") { withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === true, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < true, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= false, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(false) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(false) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(false) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(true) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === true, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < true, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= false, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(false) === $"_1", PredicateLeaf.Operator.EQUALS) 
+ checkFilterPredicate(Literal(false) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(false) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(true) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - decimal") { withOrcDataFrame((1 to 4).map(i => Tuple1.apply(BigDecimal.valueOf(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - checkFilterPredicate('_1 === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate($"_1" === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate('_1 < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) === '_1, PredicateLeaf.Operator.EQUALS) + Literal(BigDecimal.valueOf(1)) === $"_1", PredicateLeaf.Operator.EQUALS) checkFilterPredicate( - 
Literal(BigDecimal.valueOf(1)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + Literal(BigDecimal.valueOf(1)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(2)) > '_1, PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(2)) > $"_1", PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(3)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(3)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(1)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(4)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(4)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -249,46 +252,47 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { new Timestamp(milliseconds) } withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === timestamps(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < timestamps(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(timestamps(0)) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(timestamps(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(timestamps(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(timestamps(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - 
checkFilterPredicate(Literal(timestamps(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate( + Literal(timestamps(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(timestamps(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - combinations with logical operators") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => checkFilterPredicate( - '_1.isNotNull, + $"_1".isNotNull, "leaf-0 = (IS_NULL _1), expr = (not leaf-0)" ) checkFilterPredicate( - '_1 =!= 1, + $"_1" =!= 1, "leaf-0 = (IS_NULL _1), leaf-1 = (EQUALS _1 1), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( - !('_1 < 4), + !($"_1" < 4), "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 4), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( - '_1 < 2 || '_1 > 3, + $"_1" < 2 || $"_1" > 3, "leaf-0 = (LESS_THAN _1 2), leaf-1 = (LESS_THAN_EQUALS _1 3), " + "expr = (or 
leaf-0 (not leaf-1))" ) checkFilterPredicate( - '_1 < 2 && '_1 > 3, + $"_1" < 2 && $"_1" > 3, "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 2), leaf-2 = (LESS_THAN_EQUALS _1 3), " + "expr = (and (not leaf-0) leaf-1 (not leaf-2))" ) @@ -300,22 +304,22 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { Date.valueOf(day) } withOrcDataFrame(dates.map(Tuple1(_))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === dates(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < dates(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= dates(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(dates(0)) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(dates(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(dates(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(dates(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(dates(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === dates(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < dates(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= dates(3), PredicateLeaf.Operator.LESS_THAN) + + 
checkFilterPredicate(Literal(dates(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(dates(0)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(dates(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(dates(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -325,15 +329,15 @@ class OrcFilterSuite extends OrcTest with SharedSparkSession { } // ArrayType withOrcDataFrame((1 to 4).map(i => Tuple1(Array(i)))) { implicit df => - checkNoFilterPredicate('_1.isNull, noneSupported = true) + checkNoFilterPredicate($"_1".isNull, noneSupported = true) } // BinaryType withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => - checkNoFilterPredicate('_1 <=> 1.b, noneSupported = true) + checkNoFilterPredicate($"_1" <=> 1.b, noneSupported = true) } // MapType withOrcDataFrame((1 to 4).map(i => Tuple1(Map(i -> i)))) { implicit df => - checkNoFilterPredicate('_1.isNotNull, noneSupported = true) + checkNoFilterPredicate($"_1".isNotNull, noneSupported = true) } } diff --git a/sql/create-docs.sh b/sql/create-docs.sh index 4353708d22f7b..44aa877332fd5 100755 --- a/sql/create-docs.sh +++ b/sql/create-docs.sh @@ -17,7 +17,7 @@ # limitations under the License. # -# Script to create SQL API docs. This requires `mkdocs` and to build +# Script to create SQL API and config docs. This requires `mkdocs` and to build # Spark first. After running this script the html docs can be found in # $SPARK_HOME/sql/site @@ -39,14 +39,16 @@ fi pushd "$FWDIR" > /dev/null -# Now create the markdown file rm -fr docs mkdir docs -echo "Generating markdown files for SQL documentation." -"$SPARK_HOME/bin/spark-submit" gen-sql-markdown.py -# Now create the HTML files -echo "Generating HTML files for SQL documentation." 
+echo "Generating SQL API Markdown files." +"$SPARK_HOME/bin/spark-submit" gen-sql-api-docs.py + +echo "Generating SQL configuration table HTML file." +"$SPARK_HOME/bin/spark-submit" gen-sql-config-docs.py + +echo "Generating HTML files for SQL API documentation." mkdocs build --clean rm -fr docs diff --git a/sql/gen-sql-markdown.py b/sql/gen-sql-api-docs.py similarity index 96% rename from sql/gen-sql-markdown.py rename to sql/gen-sql-api-docs.py index e0529f8310613..4feee7ad52570 100644 --- a/sql/gen-sql-markdown.py +++ b/sql/gen-sql-api-docs.py @@ -15,10 +15,11 @@ # limitations under the License. # -import sys import os from collections import namedtuple +from pyspark.java_gateway import launch_gateway + ExpressionInfo = namedtuple( "ExpressionInfo", "className name usage arguments examples note since deprecated") @@ -219,8 +220,7 @@ def generate_sql_markdown(jvm, path): if __name__ == "__main__": - from pyspark.java_gateway import launch_gateway - jvm = launch_gateway().jvm - markdown_file_path = "%s/docs/index.md" % os.path.dirname(sys.argv[0]) + spark_root_dir = os.path.dirname(os.path.dirname(__file__)) + markdown_file_path = os.path.join(spark_root_dir, "sql/docs/index.md") generate_sql_markdown(jvm, markdown_file_path) diff --git a/sql/gen-sql-config-docs.py b/sql/gen-sql-config-docs.py new file mode 100644 index 0000000000000..04f5a850c9980 --- /dev/null +++ b/sql/gen-sql-config-docs.py @@ -0,0 +1,117 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re +from collections import namedtuple +from textwrap import dedent + +# To avoid adding a new direct dependency, we import markdown from within mkdocs. +from mkdocs.structure.pages import markdown +from pyspark.java_gateway import launch_gateway + +SQLConfEntry = namedtuple( + "SQLConfEntry", ["name", "default", "description"]) + + +def get_public_sql_configs(jvm): + sql_configs = [ + SQLConfEntry( + name=_sql_config._1(), + default=_sql_config._2(), + description=_sql_config._3(), + ) + for _sql_config in jvm.org.apache.spark.sql.api.python.PythonSQLUtils.listSQLConfigs() + ] + return sql_configs + + +def generate_sql_configs_table(sql_configs, path): + """ + Generates an HTML table at `path` that lists all public SQL + configuration options. + + The table will look something like this: + + ```html + + + + + + + + + + ... + +
    Property NameDefaultMeaning
    spark.sql.adaptive.enabledfalse

    When true, enable adaptive query execution.

    + ``` + """ + value_reference_pattern = re.compile(r"^$") + + with open(path, 'w') as f: + f.write(dedent( + """ + + + """ + )) + for config in sorted(sql_configs, key=lambda x: x.name): + if config.default == "": + default = "(none)" + elif config.default.startswith(" + + + + + """ + .format( + name=config.name, + default=default, + description=markdown.markdown(config.description), + ) + )) + f.write("
    Property NameDefaultMeaning
    {name}{default}{description}
    \n") + + +if __name__ == "__main__": + jvm = launch_gateway().jvm + sql_configs = get_public_sql_configs(jvm) + + spark_root_dir = os.path.dirname(os.path.dirname(__file__)) + sql_configs_table_path = os.path.join(spark_root_dir, "docs/sql-configs.html") + + generate_sql_configs_table(sql_configs, path=sql_configs_table_path) diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 5b1352adddd89..75c7f77942396 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -77,15 +77,6 @@ ${hive.group} hive-beeline - - - ${hive.group} - hive-contrib - - - ${hive.group}.hcatalog - hive-hcatalog-core - org.eclipse.jetty jetty-server @@ -129,7 +120,11 @@ test-jar test - + + org.mockito + mockito-core + test + net.sf.jpam jpam diff --git a/sql/hive-thriftserver/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin b/sql/hive-thriftserver/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin new file mode 100644 index 0000000000000..96d990372ee4c --- /dev/null +++ b/sql/hive-thriftserver/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin @@ -0,0 +1 @@ +org.apache.spark.sql.hive.thriftserver.ui.HiveThriftServer2HistoryServerPlugin diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala index 36d4ac095e10c..f15193b0dc3cc 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2.scala @@ -20,9 +20,6 @@ package org.apache.spark.sql.hive.thriftserver import java.util.Locale import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer - import 
org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.thrift.{ThriftBinaryCLIService, ThriftHttpCLIService} @@ -32,12 +29,11 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.internal.config.UI.UI_ENABLED -import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerJobStart} import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ -import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.hive.thriftserver.ui._ +import org.apache.spark.status.ElementTrackingStore import org.apache.spark.util.{ShutdownHookManager, Utils} /** @@ -47,6 +43,7 @@ import org.apache.spark.util.{ShutdownHookManager, Utils} object HiveThriftServer2 extends Logging { var uiTab: Option[ThriftServerTab] = None var listener: HiveThriftServer2Listener = _ + var eventManager: HiveThriftServer2EventManager = _ /** * :: DeveloperApi :: @@ -62,17 +59,24 @@ object HiveThriftServer2 extends Logging { server.init(executionHive.conf) server.start() - listener = new HiveThriftServer2Listener(server, sqlContext.conf) - sqlContext.sparkContext.addSparkListener(listener) - uiTab = if (sqlContext.sparkContext.getConf.get(UI_ENABLED)) { - Some(new ThriftServerTab(sqlContext.sparkContext)) + createListenerAndUI(server, sqlContext.sparkContext) + server + } + + private def createListenerAndUI(server: HiveThriftServer2, sc: SparkContext): Unit = { + val kvStore = sc.statusStore.store.asInstanceOf[ElementTrackingStore] + eventManager = new HiveThriftServer2EventManager(sc) + listener = new HiveThriftServer2Listener(kvStore, sc.conf, Some(server)) + sc.listenerBus.addToStatusQueue(listener) + uiTab = if (sc.getConf.get(UI_ENABLED)) { + Some(new 
ThriftServerTab(new HiveThriftServer2AppStatusStore(kvStore, Some(listener)), + ThriftServerTab.getSparkUI(sc))) } else { None } - server } - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { // If the arguments contains "-h" or "--help", print out the usage and exit. if (args.contains("-h") || args.contains("--help")) { HiveServer2.main(args) @@ -101,13 +105,7 @@ object HiveThriftServer2 extends Logging { server.init(executionHive.conf) server.start() logInfo("HiveThriftServer2 started") - listener = new HiveThriftServer2Listener(server, SparkSQLEnv.sqlContext.conf) - SparkSQLEnv.sparkContext.addSparkListener(listener) - uiTab = if (SparkSQLEnv.sparkContext.getConf.get(UI_ENABLED)) { - Some(new ThriftServerTab(SparkSQLEnv.sparkContext)) - } else { - None - } + createListenerAndUI(server, SparkSQLEnv.sparkContext) // If application was killed before HiveThriftServer2 start successfully then SparkSubmit // process can not exit, so check whether if SparkContext was stopped. 
if (SparkSQLEnv.sparkContext.stopped.get()) { @@ -121,179 +119,10 @@ object HiveThriftServer2 extends Logging { } } - private[thriftserver] class SessionInfo( - val sessionId: String, - val startTimestamp: Long, - val ip: String, - val userName: String) { - var finishTimestamp: Long = 0L - var totalExecution: Int = 0 - def totalTime: Long = { - if (finishTimestamp == 0L) { - System.currentTimeMillis - startTimestamp - } else { - finishTimestamp - startTimestamp - } - } - } - private[thriftserver] object ExecutionState extends Enumeration { val STARTED, COMPILED, CANCELED, FAILED, FINISHED, CLOSED = Value type ExecutionState = Value } - - private[thriftserver] class ExecutionInfo( - val statement: String, - val sessionId: String, - val startTimestamp: Long, - val userName: String) { - var finishTimestamp: Long = 0L - var closeTimestamp: Long = 0L - var executePlan: String = "" - var detail: String = "" - var state: ExecutionState.Value = ExecutionState.STARTED - val jobId: ArrayBuffer[String] = ArrayBuffer[String]() - var groupId: String = "" - def totalTime(endTime: Long): Long = { - if (endTime == 0L) { - System.currentTimeMillis - startTimestamp - } else { - endTime - startTimestamp - } - } - } - - - /** - * An inner sparkListener called in sc.stop to clean up the HiveThriftServer2 - */ - private[thriftserver] class HiveThriftServer2Listener( - val server: HiveServer2, - val conf: SQLConf) extends SparkListener { - - override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { - server.stop() - } - private val sessionList = new mutable.LinkedHashMap[String, SessionInfo] - private val executionList = new mutable.LinkedHashMap[String, ExecutionInfo] - private val retainedStatements = conf.getConf(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT) - private val retainedSessions = conf.getConf(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT) - - def getOnlineSessionNum: Int = synchronized { - sessionList.count(_._2.finishTimestamp == 0) - } - - def 
isExecutionActive(execInfo: ExecutionInfo): Boolean = { - !(execInfo.state == ExecutionState.FAILED || - execInfo.state == ExecutionState.CANCELED || - execInfo.state == ExecutionState.CLOSED) - } - - /** - * When an error or a cancellation occurs, we set the finishTimestamp of the statement. - * Therefore, when we count the number of running statements, we need to exclude errors and - * cancellations and count all statements that have not been closed so far. - */ - def getTotalRunning: Int = synchronized { - executionList.count { - case (_, v) => isExecutionActive(v) - } - } - - def getSessionList: Seq[SessionInfo] = synchronized { sessionList.values.toSeq } - - def getSession(sessionId: String): Option[SessionInfo] = synchronized { - sessionList.get(sessionId) - } - - def getExecutionList: Seq[ExecutionInfo] = synchronized { executionList.values.toSeq } - - override def onJobStart(jobStart: SparkListenerJobStart): Unit = synchronized { - for { - props <- Option(jobStart.properties) - groupId <- Option(props.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) - (_, info) <- executionList if info.groupId == groupId - } { - info.jobId += jobStart.jobId.toString - info.groupId = groupId - } - } - - def onSessionCreated(ip: String, sessionId: String, userName: String = "UNKNOWN"): Unit = { - synchronized { - val info = new SessionInfo(sessionId, System.currentTimeMillis, ip, userName) - sessionList.put(sessionId, info) - trimSessionIfNecessary() - } - } - - def onSessionClosed(sessionId: String): Unit = synchronized { - sessionList(sessionId).finishTimestamp = System.currentTimeMillis - trimSessionIfNecessary() - } - - def onStatementStart( - id: String, - sessionId: String, - statement: String, - groupId: String, - userName: String = "UNKNOWN"): Unit = synchronized { - val info = new ExecutionInfo(statement, sessionId, System.currentTimeMillis, userName) - info.state = ExecutionState.STARTED - executionList.put(id, info) - trimExecutionIfNecessary() - 
sessionList(sessionId).totalExecution += 1 - executionList(id).groupId = groupId - } - - def onStatementParsed(id: String, executionPlan: String): Unit = synchronized { - executionList(id).executePlan = executionPlan - executionList(id).state = ExecutionState.COMPILED - } - - def onStatementCanceled(id: String): Unit = synchronized { - executionList(id).finishTimestamp = System.currentTimeMillis - executionList(id).state = ExecutionState.CANCELED - trimExecutionIfNecessary() - } - - def onStatementError(id: String, errorMsg: String, errorTrace: String): Unit = synchronized { - executionList(id).finishTimestamp = System.currentTimeMillis - executionList(id).detail = errorMsg - executionList(id).state = ExecutionState.FAILED - trimExecutionIfNecessary() - } - - def onStatementFinish(id: String): Unit = synchronized { - executionList(id).finishTimestamp = System.currentTimeMillis - executionList(id).state = ExecutionState.FINISHED - trimExecutionIfNecessary() - } - - def onOperationClosed(id: String): Unit = synchronized { - executionList(id).closeTimestamp = System.currentTimeMillis - executionList(id).state = ExecutionState.CLOSED - } - - private def trimExecutionIfNecessary() = { - if (executionList.size > retainedStatements) { - val toRemove = math.max(retainedStatements / 10, 1) - executionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s => - executionList.remove(s._1) - } - } - } - - private def trimSessionIfNecessary() = { - if (sessionList.size > retainedSessions) { - val toRemove = math.max(retainedSessions / 10, 1) - sessionList.filter(_._2.finishTimestamp != 0).take(toRemove).foreach { s => - sessionList.remove(s._1) - } - } - - } - } } private[hive] class HiveThriftServer2(sqlContext: SQLContext) @@ -303,7 +132,7 @@ private[hive] class HiveThriftServer2(sqlContext: SQLContext) // started, and then once only. 
private val started = new AtomicBoolean(false) - override def init(hiveConf: HiveConf) { + override def init(hiveConf: HiveConf): Unit = { val sparkSqlCliService = new SparkSQLCLIService(this, sqlContext) setSuperField(this, "cliService", sparkSqlCliService) addService(sparkSqlCliService) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ReflectionUtils.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ReflectionUtils.scala index 599294dfbb7d7..a4024be67ac9c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ReflectionUtils.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ReflectionUtils.scala @@ -18,11 +18,11 @@ package org.apache.spark.sql.hive.thriftserver private[hive] object ReflectionUtils { - def setSuperField(obj : Object, fieldName: String, fieldValue: Object) { + def setSuperField(obj : Object, fieldName: String, fieldValue: Object): Unit = { setAncestorField(obj, 1, fieldName, fieldValue) } - def setAncestorField(obj: AnyRef, level: Int, fieldName: String, fieldValue: AnyRef) { + def setAncestorField(obj: AnyRef, level: Int, fieldName: String, fieldValue: AnyRef): Unit = { val ancestor = Iterator.iterate[Class[_]](obj.getClass)(_.getSuperclass).drop(level).next() val field = ancestor.getDeclaredField(fieldName) field.setAccessible(true) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 69e85484ccf8e..cf0e5ebf3a2b1 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -26,6 +26,7 @@ import scala.collection.JavaConverters._ 
import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.shims.Utils import org.apache.hive.service.cli._ @@ -39,6 +40,7 @@ import org.apache.spark.sql.execution.HiveResult import org.apache.spark.sql.execution.command.SetCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.{Utils => SparkUtils} private[hive] class SparkExecuteStatementOperation( @@ -56,7 +58,8 @@ private[hive] class SparkExecuteStatementOperation( // This is only used when `spark.sql.thriftServer.incrementalCollect` is set to `false`. // In case of `true`, this will be `None` and FETCH_FIRST will trigger re-execution. private var resultList: Option[Array[SparkRow]] = _ - + private var previousFetchEndOffset: Long = 0 + private var previousFetchStartOffset: Long = 0 private var iter: Iterator[SparkRow] = _ private var dataTypes: Array[DataType] = _ private var statementId: String = _ @@ -74,10 +77,10 @@ private[hive] class SparkExecuteStatementOperation( // RDDs will be cleaned automatically upon garbage collection. 
logInfo(s"Close statement with $statementId") cleanup(OperationState.CLOSED) - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } - def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any], ordinal: Int) { + def addNonNullColumnValue(from: SparkRow, to: ArrayBuffer[Any], ordinal: Int): Unit = { dataTypes(ordinal) match { case StringType => to += from.getString(ordinal) @@ -103,6 +106,8 @@ private[hive] class SparkExecuteStatementOperation( to += from.getAs[Timestamp](ordinal) case BinaryType => to += from.getAs[Array[Byte]](ordinal) + case CalendarIntervalType => + to += HiveResult.toHiveString((from.getAs[CalendarInterval](ordinal), CalendarIntervalType)) case _: ArrayType | _: StructType | _: MapType | _: UserDefinedType[_] => val hiveString = HiveResult.toHiveString((from.get(ordinal), dataTypes(ordinal))) to += hiveString @@ -110,14 +115,18 @@ private[hive] class SparkExecuteStatementOperation( } def getNextRowSet(order: FetchOrientation, maxRowsL: Long): RowSet = withSchedulerPool { + log.info(s"Received getNextRowSet request order=${order} and maxRowsL=${maxRowsL} " + + s"with ${statementId}") validateDefaultFetchOrientation(order) assertState(OperationState.FINISHED) setHasResultSet(true) val resultRowSet: RowSet = ThriftserverShimUtils.resultRowSet(getResultSetSchema, getProtocolVersion) - // Reset iter to header when fetching start from first row - if (order.equals(FetchOrientation.FETCH_FIRST)) { + // Reset iter when FETCH_FIRST or FETCH_PRIOR + if ((order.equals(FetchOrientation.FETCH_FIRST) || + order.equals(FetchOrientation.FETCH_PRIOR)) && previousFetchEndOffset != 0) { + // Reset the iterator to the beginning of the query. 
iter = if (sqlContext.getConf(SQLConf.THRIFTSERVER_INCREMENTAL_COLLECT.key).toBoolean) { resultList = None result.toLocalIterator.asScala @@ -129,6 +138,28 @@ private[hive] class SparkExecuteStatementOperation( } } + var resultOffset = { + if (order.equals(FetchOrientation.FETCH_FIRST)) { + logInfo(s"FETCH_FIRST request with $statementId. Resetting to resultOffset=0") + 0 + } else if (order.equals(FetchOrientation.FETCH_PRIOR)) { + // TODO: FETCH_PRIOR should be handled more efficiently than rewinding to beginning and + // reiterating. + val targetOffset = math.max(previousFetchStartOffset - maxRowsL, 0) + logInfo(s"FETCH_PRIOR request with $statementId. Resetting to resultOffset=$targetOffset") + var off = 0 + while (off < targetOffset && iter.hasNext) { + iter.next() + off += 1 + } + off + } else { // FETCH_NEXT + previousFetchEndOffset + } + } + + resultRowSet.setStartOffset(resultOffset) + previousFetchStartOffset = resultOffset if (!iter.hasNext) { resultRowSet } else { @@ -149,7 +180,11 @@ private[hive] class SparkExecuteStatementOperation( } resultRowSet.addRow(row.toArray.asInstanceOf[Array[Object]]) curRow += 1 + resultOffset += 1 } + previousFetchEndOffset = resultOffset + log.info(s"Returning result set with ${curRow} rows from offsets " + + s"[$previousFetchStartOffset, $previousFetchEndOffset) with $statementId") resultRowSet } } @@ -160,7 +195,7 @@ private[hive] class SparkExecuteStatementOperation( setState(OperationState.PENDING) statementId = UUID.randomUUID().toString logInfo(s"Submitting query '$statement' with $statementId") - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, statement, @@ -210,14 +245,14 @@ private[hive] class SparkExecuteStatementOperation( case rejected: RejectedExecutionException => logError("Error submitting query in background, query rejected", rejected) setState(OperationState.ERROR) - 
HiveThriftServer2.listener.onStatementError( + HiveThriftServer2.eventManager.onStatementError( statementId, rejected.getMessage, SparkUtils.exceptionString(rejected)) throw new HiveSQLException("The background threadpool cannot accept" + " new task for execution, please retry the operation", rejected) case NonFatal(e) => logError(s"Error executing query in background", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( + HiveThriftServer2.eventManager.onStatementError( statementId, e.getMessage, SparkUtils.exceptionString(e)) throw new HiveSQLException(e) } @@ -249,7 +284,8 @@ private[hive] class SparkExecuteStatementOperation( "in this session.") case _ => } - HiveThriftServer2.listener.onStatementParsed(statementId, result.queryExecution.toString()) + HiveThriftServer2.eventManager.onStatementParsed(statementId, + result.queryExecution.toString()) iter = { if (sqlContext.getConf(SQLConf.THRIFTSERVER_INCREMENTAL_COLLECT.key).toBoolean) { resultList = None @@ -259,11 +295,18 @@ private[hive] class SparkExecuteStatementOperation( resultList.get.iterator } } - dataTypes = result.queryExecution.analyzed.output.map(_.dataType).toArray + dataTypes = result.schema.fields.map(_.dataType) } catch { // Actually do need to catch Throwable as some failures don't inherit from Exception and // HiveServer will silently swallow them. case e: Throwable => + // When cancel() or close() is called very quickly after the query is started, + // then they may both call cleanup() before Spark Jobs are started. But before background + // task interrupted, it may have start some spark job, so we need to cancel again to + // make sure job was cancelled when background thread was interrupted + if (statementId != null) { + sqlContext.sparkContext.cancelJobGroup(statementId) + } val currentState = getStatus().getState() if (currentState.isTerminal) { // This may happen if the execution was cancelled, and then closed from another thread. 
@@ -271,19 +314,23 @@ private[hive] class SparkExecuteStatementOperation( } else { logError(s"Error executing query with $statementId, currentState $currentState, ", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - if (e.isInstanceOf[HiveSQLException]) { - throw e.asInstanceOf[HiveSQLException] - } else { - throw new HiveSQLException("Error running query: " + e.toString, e) + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error running query: " + root.toString, root) } } } finally { synchronized { if (!getStatus.getState.isTerminal) { setState(OperationState.FINISHED) - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } } sqlContext.sparkContext.clearJobGroup() @@ -295,12 +342,12 @@ private[hive] class SparkExecuteStatementOperation( if (!getStatus.getState.isTerminal) { logInfo(s"Cancel query with $statementId") cleanup(OperationState.CANCELED) - HiveThriftServer2.listener.onStatementCanceled(statementId) + HiveThriftServer2.eventManager.onStatementCanceled(statementId) } } } - private def cleanup(state: OperationState) { + private def cleanup(state: OperationState): Unit = { setState(state) if (runInBackground) { val backgroundHandle = getBackgroundHandle() @@ -331,7 +378,11 @@ private[hive] class SparkExecuteStatementOperation( object SparkExecuteStatementOperation { def getTableSchema(structType: StructType): TableSchema = { val schema = structType.map { field => - val attrTypeString = if (field.dataType == NullType) "void" else 
field.dataType.catalogString + val attrTypeString = field.dataType match { + case NullType => "void" + case CalendarIntervalType => StringType.catalogString + case other => other.catalogString + } new FieldSchema(field.name, attrTypeString, field.getComment.getOrElse("")) } new TableSchema(schema.asJava) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala index cde99fd35bd59..2945cfd200e46 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetCatalogsOperation.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.thriftserver import java.util.UUID +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType import org.apache.hive.service.cli.{HiveSQLException, OperationState} import org.apache.hive.service.cli.operation.GetCatalogsOperation @@ -43,7 +44,7 @@ private[hive] class SparkGetCatalogsOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit = { @@ -55,7 +56,7 @@ private[hive] class SparkGetCatalogsOperation( val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader Thread.currentThread().setContextClassLoader(executionHiveClassLoader) - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -68,12 +69,21 @@ private[hive] class SparkGetCatalogsOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + 
logError(s"Error executing get catalogs operation with $statementId", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting catalogs: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala index 89faff2f6f913..ff7cbfeae13be 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetColumnsOperation.scala @@ -22,6 +22,7 @@ import java.util.regex.Pattern import scala.collection.JavaConverters.seqAsJavaListConverter +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.{HiveOperationType, HivePrivilegeObject} import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObject.HivePrivilegeObjectType import org.apache.hive.service.cli._ @@ -62,7 +63,7 @@ private[hive] class SparkGetColumnsOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit 
= { @@ -77,7 +78,7 @@ private[hive] class SparkGetColumnsOperation( val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader Thread.currentThread().setContextClassLoader(executionHiveClassLoader) - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -129,13 +130,22 @@ private[hive] class SparkGetColumnsOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + logError(s"Error executing get columns operation with $statementId", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting columns: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } private def addToRowSet( diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala index 462e57300e82b..d9c12b6ca9e64 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetFunctionsOperation.scala @@ -22,6 +22,7 @@ import java.util.UUID import scala.collection.JavaConverters.seqAsJavaListConverter 
+import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.{HiveOperationType, HivePrivilegeObjectUtils} import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetFunctionsOperation @@ -53,7 +54,7 @@ private[hive] class SparkGetFunctionsOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit = { @@ -80,7 +81,7 @@ private[hive] class SparkGetFunctionsOperation( authorizeMetaGets(HiveOperationType.GET_FUNCTIONS, privObjs, cmdStr) } - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -104,12 +105,21 @@ private[hive] class SparkGetFunctionsOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + logError(s"Error executing get functions operation with $statementId", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting functions: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala index 87ef154bcc8ab..db19880d1b99f 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetSchemasOperation.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive.thriftserver import java.util.UUID import java.util.regex.Pattern +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetSchemasOperation @@ -49,7 +50,7 @@ private[hive] class SparkGetSchemasOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit = { @@ -67,7 +68,7 @@ private[hive] class SparkGetSchemasOperation( authorizeMetaGets(HiveOperationType.GET_TABLES, null, cmdStr) } - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -87,12 +88,21 @@ private[hive] class SparkGetSchemasOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + logError(s"Error executing get schemas operation with $statementId", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + 
HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting schemas: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala index 8f2257f77d2a0..b4093e58d3c07 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTableTypesOperation.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.thriftserver import java.util.UUID +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.GetTableTypesOperation @@ -44,7 +45,7 @@ private[hive] class SparkGetTableTypesOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit = { @@ -60,7 +61,7 @@ private[hive] class SparkGetTableTypesOperation( authorizeMetaGets(HiveOperationType.GET_TABLETYPES, null) } - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -74,12 +75,21 @@ private[hive] class SparkGetTableTypesOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + logError(s"Error executing get table types operation with $statementId", e) 
setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting table types: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala index 6441dc50f49fe..45c6d980aac47 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTablesOperation.scala @@ -22,6 +22,7 @@ import java.util.regex.Pattern import scala.collection.JavaConverters._ +import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType import org.apache.hadoop.hive.ql.security.authorization.plugin.HivePrivilegeObjectUtils import org.apache.hive.service.cli._ @@ -30,7 +31,6 @@ import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext -import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.catalyst.catalog.CatalogTableType._ import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.util.{Utils => SparkUtils} @@ -59,7 +59,7 @@ 
private[hive] class SparkGetTablesOperation( override def close(): Unit = { super.close() - HiveThriftServer2.listener.onOperationClosed(statementId) + HiveThriftServer2.eventManager.onOperationClosed(statementId) } override def runInternal(): Unit = { @@ -85,7 +85,7 @@ private[hive] class SparkGetTablesOperation( authorizeMetaGets(HiveOperationType.GET_TABLES, privObjs, cmdStr) } - HiveThriftServer2.listener.onStatementStart( + HiveThriftServer2.eventManager.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, logMsg, @@ -119,13 +119,22 @@ private[hive] class SparkGetTablesOperation( } setState(OperationState.FINISHED) } catch { - case e: HiveSQLException => + case e: Throwable => + logError(s"Error executing get tables operation with $statementId", e) setState(OperationState.ERROR) - HiveThriftServer2.listener.onStatementError( - statementId, e.getMessage, SparkUtils.exceptionString(e)) - throw e + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) + throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting tables: " + root.toString, root) + } } - HiveThriftServer2.listener.onStatementFinish(statementId) + HiveThriftServer2.eventManager.onStatementFinish(statementId) } private def addToRowSet( diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala new file mode 100644 index 0000000000000..dd5668a93f82d --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkGetTypeInfoOperation.scala @@ -0,0 +1,113 @@ +/* 
+ * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver + +import java.util.UUID + +import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.hadoop.hive.ql.security.authorization.plugin.HiveOperationType +import org.apache.hive.service.cli.{HiveSQLException, OperationState} +import org.apache.hive.service.cli.operation.GetTypeInfoOperation +import org.apache.hive.service.cli.session.HiveSession + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SQLContext +import org.apache.spark.util.{Utils => SparkUtils} + +/** + * Spark's own GetTypeInfoOperation + * + * @param sqlContext SQLContext to use + * @param parentSession a HiveSession from SessionManager + */ +private[hive] class SparkGetTypeInfoOperation( + sqlContext: SQLContext, + parentSession: HiveSession) + extends GetTypeInfoOperation(parentSession) with Logging { + + private var statementId: String = _ + + override def close(): Unit = { + super.close() + HiveThriftServer2.eventManager.onOperationClosed(statementId) + } + + override def runInternal(): Unit = { + statementId = UUID.randomUUID().toString + val logMsg = "Listing type info" + logInfo(s"$logMsg with $statementId") + 
setState(OperationState.RUNNING) + // Always use the latest class loader provided by executionHive's state. + val executionHiveClassLoader = sqlContext.sharedState.jarClassLoader + Thread.currentThread().setContextClassLoader(executionHiveClassLoader) + + if (isAuthV2Enabled) { + authorizeMetaGets(HiveOperationType.GET_TYPEINFO, null) + } + + HiveThriftServer2.eventManager.onStatementStart( + statementId, + parentSession.getSessionHandle.getSessionId.toString, + logMsg, + statementId, + parentSession.getUsername) + + try { + ThriftserverShimUtils.supportedType().foreach(typeInfo => { + val rowData = Array[AnyRef]( + typeInfo.getName, // TYPE_NAME + typeInfo.toJavaSQLType.asInstanceOf[AnyRef], // DATA_TYPE + typeInfo.getMaxPrecision.asInstanceOf[AnyRef], // PRECISION + typeInfo.getLiteralPrefix, // LITERAL_PREFIX + typeInfo.getLiteralSuffix, // LITERAL_SUFFIX + typeInfo.getCreateParams, // CREATE_PARAMS + typeInfo.getNullable.asInstanceOf[AnyRef], // NULLABLE + typeInfo.isCaseSensitive.asInstanceOf[AnyRef], // CASE_SENSITIVE + typeInfo.getSearchable.asInstanceOf[AnyRef], // SEARCHABLE + typeInfo.isUnsignedAttribute.asInstanceOf[AnyRef], // UNSIGNED_ATTRIBUTE + typeInfo.isFixedPrecScale.asInstanceOf[AnyRef], // FIXED_PREC_SCALE + typeInfo.isAutoIncrement.asInstanceOf[AnyRef], // AUTO_INCREMENT + typeInfo.getLocalizedName, // LOCAL_TYPE_NAME + typeInfo.getMinimumScale.asInstanceOf[AnyRef], // MINIMUM_SCALE + typeInfo.getMaximumScale.asInstanceOf[AnyRef], // MAXIMUM_SCALE + null, // SQL_DATA_TYPE, unused + null, // SQL_DATETIME_SUB, unused + typeInfo.getNumPrecRadix // NUM_PREC_RADIX + ) + rowSet.addRow(rowData) + }) + setState(OperationState.FINISHED) + } catch { + case e: Throwable => + logError(s"Error executing get type info with $statementId", e) + setState(OperationState.ERROR) + e match { + case hiveException: HiveSQLException => + HiveThriftServer2.eventManager.onStatementError( + statementId, hiveException.getMessage, SparkUtils.exceptionString(hiveException)) 
+ throw hiveException + case _ => + val root = ExceptionUtils.getRootCause(e) + HiveThriftServer2.eventManager.onStatementError( + statementId, root.getMessage, SparkUtils.exceptionString(root)) + throw new HiveSQLException("Error getting type info: " + root.toString, root) + } + } + HiveThriftServer2.eventManager.onStatementFinish(statementId) + } +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala index b9614d49eadbd..b665d4a31b9b1 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIDriver.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.hive.thriftserver import java.io._ import java.nio.charset.StandardCharsets.UTF_8 -import java.util.{ArrayList => JArrayList, Locale} +import java.util.{ArrayList => JArrayList, List => JList, Locale} import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ @@ -37,6 +37,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.log4j.Level import org.apache.thrift.transport.TSocket +import sun.misc.{Signal, SignalHandler} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil @@ -63,7 +64,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { * a signal handler will invoke this registered callback if a Ctrl+C signal is detected while * a command is being processed by the current thread. 
*/ - def installSignalHandler() { + def installSignalHandler(): Unit = { HiveInterruptUtils.add(() => { // Handle remote execution mode if (SparkSQLEnv.sparkContext != null) { @@ -77,7 +78,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { }) } - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { val oproc = new OptionsProcessor() if (!oproc.process_stage1(args)) { System.exit(1) @@ -111,6 +112,11 @@ private[hive] object SparkSQLCLIDriver extends Logging { // Set all properties specified via command line. val conf: HiveConf = sessionState.getConf + // Hive 2.0.0 onwards HiveConf.getClassLoader returns the UDFClassLoader (created by Hive). + // Because of this spark cannot find the jars as class loader got changed + // Hive changed the class loader because of HIVE-11878, so it is required to use old + // classLoader as sparks loaded all the jars in this classLoader + conf.setClassLoader(Thread.currentThread().getContextClassLoader) sessionState.cmdProperties.entrySet().asScala.foreach { item => val key = item.getKey.toString val value = item.getValue.toString @@ -133,20 +139,7 @@ private[hive] object SparkSQLCLIDriver extends Logging { // Clean up after we exit ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() } - val remoteMode = isRemoteMode(sessionState) - // "-h" option has been passed, so connect to Hive thrift server. - if (!remoteMode) { - // Hadoop-20 and above - we need to augment classpath using hiveconf - // components. 
- // See also: code in ExecDriver.java - var loader = conf.getClassLoader - val auxJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS) - if (StringUtils.isNotBlank(auxJars)) { - loader = ThriftserverShimUtils.addToClassPath(loader, StringUtils.split(auxJars, ",")) - } - conf.setClassLoader(loader) - Thread.currentThread().setContextClassLoader(loader) - } else { + if (isRemoteMode(sessionState)) { // Hive 1.2 + not supported in CLI throw new RuntimeException("Remote operations not supported") } @@ -164,6 +157,22 @@ private[hive] object SparkSQLCLIDriver extends Logging { val cli = new SparkSQLCLIDriver cli.setHiveVariables(oproc.getHiveVariables) + // In SparkSQL CLI, we may want to use jars augmented by hiveconf + // hive.aux.jars.path, here we add jars augmented by hiveconf to + // Spark's SessionResourceLoader to obtain these jars. + val auxJars = HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS) + if (StringUtils.isNotBlank(auxJars)) { + val resourceLoader = SparkSQLEnv.sqlContext.sessionState.resourceLoader + StringUtils.split(auxJars, ",").foreach(resourceLoader.addJar(_)) + } + + // The class loader of CliSessionState's conf is current main thread's class loader + // used to load jars passed by --jars. One class loader used by AddJarCommand is + // sharedState.jarClassLoader which contain jar path passed by --jars in main thread. + // We set CliSessionState's conf class loader to sharedState.jarClassLoader. + // Thus we can load all jars passed by --jars and AddJarCommand. + sessionState.getConf.setClassLoader(SparkSQLEnv.sqlContext.sharedState.jarClassLoader) + // TODO work around for set the log output to console, because the HiveContext // will set the output into an invalid buffer. sessionState.in = System.in @@ -430,5 +439,112 @@ private[hive] class SparkSQLCLIDriver extends CliDriver with Logging { ret } } + + // Adapted processLine from Hive 2.3's CliDriver.processLine. 
+ override def processLine(line: String, allowInterrupting: Boolean): Int = { + var oldSignal: SignalHandler = null + var interruptSignal: Signal = null + + if (allowInterrupting) { + // Remember all threads that were running at the time we started line processing. + // Hook up the custom Ctrl+C handler while processing this line + interruptSignal = new Signal("INT") + oldSignal = Signal.handle(interruptSignal, new SignalHandler() { + private var interruptRequested: Boolean = false + + override def handle(signal: Signal) { + val initialRequest = !interruptRequested + interruptRequested = true + + // Kill the VM on second ctrl+c + if (!initialRequest) { + console.printInfo("Exiting the JVM") + System.exit(127) + } + + // Interrupt the CLI thread to stop the current statement and return + // to prompt + console.printInfo("Interrupting... Be patient, this might take some time.") + console.printInfo("Press Ctrl+C again to kill JVM") + + HiveInterruptUtils.interrupt() + } + }) + } + + try { + var lastRet: Int = 0 + + // we can not use "split" function directly as ";" may be quoted + val commands = splitSemiColon(line).asScala + var command: String = "" + for (oneCmd <- commands) { + if (StringUtils.endsWith(oneCmd, "\\")) { + command += StringUtils.chop(oneCmd) + ";" + } else { + command += oneCmd + if (!StringUtils.isBlank(command)) { + val ret = processCmd(command) + command = "" + lastRet = ret + val ignoreErrors = HiveConf.getBoolVar(conf, HiveConf.ConfVars.CLIIGNOREERRORS) + if (ret != 0 && !ignoreErrors) { + CommandProcessorFactory.clean(conf.asInstanceOf[HiveConf]) + ret + } + } + } + } + CommandProcessorFactory.clean(conf.asInstanceOf[HiveConf]) + lastRet + } finally { + // Once we are done processing the line, restore the old handler + if (oldSignal != null && interruptSignal != null) { + Signal.handle(interruptSignal, oldSignal) + } + } + } + + // Adapted splitSemiColon from Hive 2.3's CliDriver.splitSemiColon. 
+ private def splitSemiColon(line: String): JList[String] = { + var insideSingleQuote = false + var insideDoubleQuote = false + var escape = false + var beginIndex = 0 + val ret = new JArrayList[String] + for (index <- 0 until line.length) { + if (line.charAt(index) == '\'') { + // take a look to see if it is escaped + if (!escape) { + // flip the boolean variable + insideSingleQuote = !insideSingleQuote + } + } else if (line.charAt(index) == '\"') { + // take a look to see if it is escaped + if (!escape) { + // flip the boolean variable + insideDoubleQuote = !insideDoubleQuote + } + } else if (line.charAt(index) == ';') { + if (insideSingleQuote || insideDoubleQuote) { + // do not split + } else { + // split, do not include ; itself + ret.add(line.substring(beginIndex, index)) + beginIndex = index + 1 + } + } else { + // nothing to do + } + // set the escape + if (escape) { + escape = false + } else if (line.charAt(index) == '\\') { + escape = true + } + } + ret.add(line.substring(beginIndex)) + ret + } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala index c32d908ad1bba..1644ecb2453be 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLCLIService.scala @@ -43,7 +43,7 @@ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLC extends CLIService(hiveServer) with ReflectedCompositeService { - override def init(hiveConf: HiveConf) { + override def init(hiveConf: HiveConf): Unit = { setSuperField(this, "hiveConf", hiveConf) val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, sqlContext) @@ -105,7 +105,7 @@ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLC } private[thriftserver] 
trait ReflectedCompositeService { this: AbstractService => - def initCompositeService(hiveConf: HiveConf) { + def initCompositeService(hiveConf: HiveConf): Unit = { // Emulating `CompositeService.init(hiveConf)` val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList") serviceList.asScala.foreach(_.init(hiveConf)) diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala index 960fdd11db15d..12fba0eae6dce 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLDriver.scala @@ -61,7 +61,7 @@ private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlCont try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) - hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { + hiveResponse = SQLExecution.withNewExecutionId(execution) { hiveResultString(execution.executedPlan) } tableSchema = getResultSetSchema(execution) @@ -94,7 +94,7 @@ private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlCont override def getSchema: Schema = tableSchema - override def destroy() { + override def destroy(): Unit = { super.destroy() hiveResponse = null tableSchema = null diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala index 674da18ca1803..8944b93d9b697 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnv.scala @@ -33,7 +33,7 @@ private[hive] 
object SparkSQLEnv extends Logging { var sqlContext: SQLContext = _ var sparkContext: SparkContext = _ - def init() { + def init(): Unit = { if (sqlContext == null) { val sparkConf = new SparkConf(loadDefaults = true) // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of @@ -50,6 +50,11 @@ private[hive] object SparkSQLEnv extends Logging { sparkContext = sparkSession.sparkContext sqlContext = sparkSession.sqlContext + // SPARK-29604: force initialization of the session state with the Spark class loader, + // instead of having it happen during the initialization of the Hive client (which may use a + // different class loader). + sparkSession.sessionState + val metadataHive = sparkSession .sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog].client metadataHive.setOut(new PrintStream(System.out, true, UTF_8.name())) @@ -60,7 +65,7 @@ private[hive] object SparkSQLEnv extends Logging { } /** Cleans up and shuts down the Spark SQL environments. 
*/ - def stop() { + def stop(): Unit = { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 13055e0ae1394..b3171897141c2 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -38,7 +38,7 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() - override def init(hiveConf: HiveConf) { + override def init(hiveConf: HiveConf): Unit = { setSuperField(this, "operationManager", sparkSqlOperationManager) super.init(hiveConf) } @@ -55,7 +55,7 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken) val session = super.getSession(sessionHandle) - HiveThriftServer2.listener.onSessionCreated( + HiveThriftServer2.eventManager.onSessionCreated( session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) val ctx = if (sqlContext.conf.hiveThriftServerSingleSession) { sqlContext @@ -63,6 +63,9 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: sqlContext.newSession() } ctx.setConf(HiveUtils.FAKE_HIVE_VERSION.key, HiveUtils.builtinHiveVersion) + val hiveSessionState = session.getSessionState + setConfMap(ctx, hiveSessionState.getOverriddenConfigurations) + setConfMap(ctx, hiveSessionState.getHiveVariables) if (sessionConf != null && sessionConf.containsKey("use:database")) { ctx.sql(s"use ${sessionConf.get("use:database")}") } @@ -70,10 
+73,20 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: sessionHandle } - override def closeSession(sessionHandle: SessionHandle) { - HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString) + override def closeSession(sessionHandle: SessionHandle): Unit = { + HiveThriftServer2.eventManager.onSessionClosed(sessionHandle.getSessionId.toString) + val ctx = sparkSqlOperationManager.sessionToContexts.getOrDefault(sessionHandle, sqlContext) + ctx.sparkSession.sessionState.catalog.getTempViewNames().foreach(ctx.uncacheTable) super.closeSession(sessionHandle) sparkSqlOperationManager.sessionToActivePool.remove(sessionHandle) sparkSqlOperationManager.sessionToContexts.remove(sessionHandle) } + + def setConfMap(conf: SQLContext, confMap: java.util.Map[String, String]): Unit = { + val iterator = confMap.entrySet().iterator() + while (iterator.hasNext) { + val kv = iterator.next() + conf.setConf(kv.getKey, kv.getValue) + } + } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala index 35f92547e7815..3396560f43502 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala @@ -28,7 +28,6 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver._ -import org.apache.spark.sql.internal.SQLConf /** * Executes queries using Spark SQL, and maintains a list of handles to active queries. 
@@ -51,9 +50,6 @@ private[thriftserver] class SparkSQLOperationManager() require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val conf = sqlContext.sessionState.conf - val hiveSessionState = parentSession.getSessionState - setConfMap(conf, hiveSessionState.getOverriddenConfigurations) - setConfMap(conf, hiveSessionState.getHiveVariables) val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) @@ -145,11 +141,14 @@ private[thriftserver] class SparkSQLOperationManager() operation } - def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = { - val iterator = confMap.entrySet().iterator() - while (iterator.hasNext) { - val kv = iterator.next() - conf.setConfString(kv.getKey, kv.getValue) - } + override def newGetTypeInfoOperation( + parentSession: HiveSession): GetTypeInfoOperation = synchronized { + val sqlContext = sessionToContexts.get(parentSession.getSessionHandle) + require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + + " initialized or had already closed.") + val operation = new SparkGetTypeInfoOperation(sqlContext, parentSession) + handleToOperation.put(operation.getHandle, operation) + logDebug(s"Created GetTypeInfoOperation with session=$parentSession.") + operation } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2AppStatusStore.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2AppStatusStore.scala new file mode 100644 index 0000000000000..5cb78f6e64650 --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2AppStatusStore.scala @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver.ui + +import com.fasterxml.jackson.annotation.JsonIgnore +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.ExecutionState +import org.apache.spark.status.KVUtils.KVIndexParam +import org.apache.spark.util.kvstore.{KVIndex, KVStore} + +/** + * Provides a view of a KVStore with methods that make it easy to query SQL-specific state. There's + * no state kept in this class, so it's ok to have multiple instances of it in an application. 
+ */ +class HiveThriftServer2AppStatusStore( + store: KVStore, + val listener: Option[HiveThriftServer2Listener] = None) { + + def getSessionList: Seq[SessionInfo] = { + store.view(classOf[SessionInfo]).asScala.toSeq + } + + def getExecutionList: Seq[ExecutionInfo] = { + store.view(classOf[ExecutionInfo]).asScala.toSeq + } + + def getOnlineSessionNum: Int = { + store.view(classOf[SessionInfo]).asScala.count(_.finishTimestamp == 0) + } + + def getSession(sessionId: String): Option[SessionInfo] = { + try { + Some(store.read(classOf[SessionInfo], sessionId)) + } catch { + case _: NoSuchElementException => None + } + } + + def getExecution(executionId: String): Option[ExecutionInfo] = { + try { + Some(store.read(classOf[ExecutionInfo], executionId)) + } catch { + case _: NoSuchElementException => None + } + } + + /** + * When an error or a cancellation occurs, we set the finishTimestamp of the statement. + * Therefore, when we count the number of running statements, we need to exclude errors and + * cancellations and count all statements that have not been closed so far. 
+ */ + def getTotalRunning: Int = { + store.view(classOf[ExecutionInfo]).asScala.count(_.isExecutionActive) + } + + def getSessionCount: Long = { + store.count(classOf[SessionInfo]) + } + + def getExecutionCount: Long = { + store.count(classOf[ExecutionInfo]) + } +} + +private[thriftserver] class SessionInfo( + @KVIndexParam val sessionId: String, + val startTimestamp: Long, + val ip: String, + val userName: String, + val finishTimestamp: Long, + val totalExecution: Long) { + @JsonIgnore @KVIndex("finishTime") + private def finishTimeIndex: Long = if (finishTimestamp > 0L ) finishTimestamp else -1L + def totalTime: Long = { + if (finishTimestamp == 0L) { + System.currentTimeMillis - startTimestamp + } else { + finishTimestamp - startTimestamp + } + } +} + +private[thriftserver] class ExecutionInfo( + @KVIndexParam val execId: String, + val statement: String, + val sessionId: String, + val startTimestamp: Long, + val userName: String, + val finishTimestamp: Long, + val closeTimestamp: Long, + val executePlan: String, + val detail: String, + val state: ExecutionState.Value, + val jobId: ArrayBuffer[String], + val groupId: String) { + @JsonIgnore @KVIndex("finishTime") + private def finishTimeIndex: Long = if (finishTimestamp > 0L && !isExecutionActive) { + finishTimestamp + } else -1L + + @JsonIgnore @KVIndex("isExecutionActive") + def isExecutionActive: Boolean = { + !(state == ExecutionState.FAILED || + state == ExecutionState.CANCELED || + state == ExecutionState.CLOSED) + } + + def totalTime(endTime: Long): Long = { + if (endTime == 0L) { + System.currentTimeMillis - startTimestamp + } else { + endTime - startTimestamp + } + } +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2EventManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2EventManager.scala new file mode 100644 index 0000000000000..fa04c67896a69 --- /dev/null +++ 
b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2EventManager.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.hive.thriftserver.ui + +import org.apache.spark.SparkContext +import org.apache.spark.scheduler.SparkListenerEvent + +/** + * This class manages events generated by the thriftserver application. It converts the + * operation and session events to listener events and post it into the live listener bus. 
+ */ +private[thriftserver] class HiveThriftServer2EventManager(sc: SparkContext) { + + def postLiveListenerBus(event: SparkListenerEvent): Unit = { + sc.listenerBus.post(event) + } + + def onSessionCreated(ip: String, sessionId: String, userName: String = "UNKNOWN"): Unit = { + postLiveListenerBus(SparkListenerThriftServerSessionCreated(ip, sessionId, + userName, System.currentTimeMillis())) + } + + def onSessionClosed(sessionId: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerSessionClosed(sessionId, + System.currentTimeMillis())) + } + + def onStatementStart( + id: String, + sessionId: String, + statement: String, + groupId: String, + userName: String = "UNKNOWN"): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationStart(id, sessionId, statement, groupId, + System.currentTimeMillis(), userName)) + } + + def onStatementParsed(id: String, executionPlan: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationParsed(id, executionPlan)) + } + + def onStatementCanceled(id: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationCanceled(id, System.currentTimeMillis())) + } + + def onStatementError(id: String, errorMsg: String, errorTrace: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationError(id, errorMsg, errorTrace, + System.currentTimeMillis())) + } + + def onStatementFinish(id: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationFinish(id, System.currentTimeMillis())) + + } + + def onOperationClosed(id: String): Unit = { + postLiveListenerBus(SparkListenerThriftServerOperationClosed(id, System.currentTimeMillis())) + } +} + +private[thriftserver] case class SparkListenerThriftServerSessionCreated( + ip: String, + sessionId: String, + userName: String, + startTime: Long) extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerSessionClosed( + sessionId: String, finishTime: Long) extends SparkListenerEvent + 
+private[thriftserver] case class SparkListenerThriftServerOperationStart( + id: String, + sessionId: String, + statement: String, + groupId: String, + startTime: Long, + userName: String = "UNKNOWN") extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerOperationParsed( + id: String, + executionPlan: String) extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerOperationCanceled( + id: String, finishTime: Long) extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerOperationError( + id: String, + errorMsg: String, + errorTrace: String, + finishTime: Long) extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerOperationFinish( + id: String, + finishTime: Long) extends SparkListenerEvent + +private[thriftserver] case class SparkListenerThriftServerOperationClosed( + id: String, + closeTime: Long) extends SparkListenerEvent + + diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2HistoryServerPlugin.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2HistoryServerPlugin.scala new file mode 100644 index 0000000000000..aec4125801f68 --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2HistoryServerPlugin.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver.ui + +import org.apache.spark.SparkConf +import org.apache.spark.scheduler.SparkListener +import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore} +import org.apache.spark.ui.SparkUI + +class HiveThriftServer2HistoryServerPlugin extends AppHistoryServerPlugin { + + override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = { + Seq(new HiveThriftServer2Listener(store, conf, None, false)) + } + + override def setupUI(ui: SparkUI): Unit = { + val store = new HiveThriftServer2AppStatusStore(ui.store.store) + if (store.getSessionCount > 0) { + new ThriftServerTab(store, ui) + } + } + + override def displayOrder: Int = 1 +} + diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala new file mode 100644 index 0000000000000..6d0a506fa94dc --- /dev/null +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2Listener.scala @@ -0,0 +1,315 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver.ui + +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + +import org.apache.hive.service.server.HiveServer2 + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.internal.config.Status.LIVE_ENTITY_UPDATE_PERIOD +import org.apache.spark.scheduler._ +import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.ExecutionState +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.status.{ElementTrackingStore, KVUtils, LiveEntity} + +/** + * An inner sparkListener called in sc.stop to clean up the HiveThriftServer2 + */ +private[thriftserver] class HiveThriftServer2Listener( + kvstore: ElementTrackingStore, + sparkConf: SparkConf, + server: Option[HiveServer2], + live: Boolean = true) extends SparkListener { + + private val sessionList = new ConcurrentHashMap[String, LiveSessionData]() + private val executionList = new ConcurrentHashMap[String, LiveExecutionData]() + + private val (retainedStatements: Int, retainedSessions: Int) = { + (sparkConf.get(SQLConf.THRIFTSERVER_UI_STATEMENT_LIMIT), + sparkConf.get(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT)) + } + + // How often to update live entities. -1 means "never update" when replaying applications, + // meaning only the last write will happen. For live applications, this avoids a few + // operations that we can live without when rapidly processing incoming events. 
+ private val liveUpdatePeriodNs = if (live) sparkConf.get(LIVE_ENTITY_UPDATE_PERIOD) else -1L + + // Returns true if this listener has no live data. Exposed for tests only. + private[thriftserver] def noLiveData(): Boolean = { + sessionList.isEmpty && executionList.isEmpty + } + + kvstore.addTrigger(classOf[SessionInfo], retainedSessions) { count => + cleanupSession(count) + } + + kvstore.addTrigger(classOf[ExecutionInfo], retainedStatements) { count => + cleanupExecutions(count) + } + + kvstore.onFlush { + if (!live) { + flush((entity: LiveEntity) => updateStoreWithTriggerEnabled(entity)) + } + } + + override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { + if (live) { + server.foreach(_.stop()) + } + } + + override def onJobStart(jobStart: SparkListenerJobStart): Unit = { + val properties = jobStart.properties + if (properties != null) { + val groupId = properties.getProperty(SparkContext.SPARK_JOB_GROUP_ID) + if (groupId != null) { + updateJobDetails(jobStart.jobId.toString, groupId) + } + } + } + + private def updateJobDetails(jobId: String, groupId: String): Unit = { + val execList = executionList.values().asScala.filter(_.groupId == groupId).toSeq + if (execList.nonEmpty) { + execList.foreach { exec => + exec.jobId += jobId.toString + updateLiveStore(exec) + } + } else { + // It may possible that event reordering happens, such a way that JobStart event come after + // Execution end event (Refer SPARK-27019). To handle that situation, if occurs in + // Thriftserver, following code will take care. Here will come only if JobStart event comes + // after Execution End event. 
+ val storeExecInfo = kvstore.view(classOf[ExecutionInfo]).asScala.filter(_.groupId == groupId) + storeExecInfo.foreach { exec => + val liveExec = getOrCreateExecution(exec.execId, exec.statement, exec.sessionId, + exec.startTimestamp, exec.userName) + liveExec.jobId += jobId.toString + updateStoreWithTriggerEnabled(liveExec) + executionList.remove(liveExec.execId) + } + } + } + + override def onOtherEvent(event: SparkListenerEvent): Unit = { + event match { + case e: SparkListenerThriftServerSessionCreated => onSessionCreated(e) + case e: SparkListenerThriftServerSessionClosed => onSessionClosed(e) + case e: SparkListenerThriftServerOperationStart => onOperationStart(e) + case e: SparkListenerThriftServerOperationParsed => onOperationParsed(e) + case e: SparkListenerThriftServerOperationCanceled => onOperationCanceled(e) + case e: SparkListenerThriftServerOperationError => onOperationError(e) + case e: SparkListenerThriftServerOperationFinish => onOperationFinished(e) + case e: SparkListenerThriftServerOperationClosed => onOperationClosed(e) + case _ => // Ignore + } + } + + private def onSessionCreated(e: SparkListenerThriftServerSessionCreated): Unit = { + val session = getOrCreateSession(e.sessionId, e.startTime, e.ip, e.userName) + sessionList.put(e.sessionId, session) + updateLiveStore(session) + } + + private def onSessionClosed(e: SparkListenerThriftServerSessionClosed): Unit = { + val session = sessionList.get(e.sessionId) + session.finishTimestamp = e.finishTime + updateStoreWithTriggerEnabled(session) + sessionList.remove(e.sessionId) + } + + private def onOperationStart(e: SparkListenerThriftServerOperationStart): Unit = { + val info = getOrCreateExecution( + e.id, + e.statement, + e.sessionId, + e.startTime, + e.userName) + + info.state = ExecutionState.STARTED + executionList.put(e.id, info) + sessionList.get(e.sessionId).totalExecution += 1 + executionList.get(e.id).groupId = e.groupId + updateLiveStore(executionList.get(e.id)) + 
updateLiveStore(sessionList.get(e.sessionId)) + } + + private def onOperationParsed(e: SparkListenerThriftServerOperationParsed): Unit = { + executionList.get(e.id).executePlan = e.executionPlan + executionList.get(e.id).state = ExecutionState.COMPILED + updateLiveStore(executionList.get(e.id)) + } + + private def onOperationCanceled(e: SparkListenerThriftServerOperationCanceled): Unit = { + executionList.get(e.id).finishTimestamp = e.finishTime + executionList.get(e.id).state = ExecutionState.CANCELED + updateLiveStore(executionList.get(e.id)) + } + + private def onOperationError(e: SparkListenerThriftServerOperationError): Unit = { + executionList.get(e.id).finishTimestamp = e.finishTime + executionList.get(e.id).detail = e.errorMsg + executionList.get(e.id).state = ExecutionState.FAILED + updateLiveStore(executionList.get(e.id)) + } + + private def onOperationFinished(e: SparkListenerThriftServerOperationFinish): Unit = { + executionList.get(e.id).finishTimestamp = e.finishTime + executionList.get(e.id).state = ExecutionState.FINISHED + updateLiveStore(executionList.get(e.id)) + } + + private def onOperationClosed(e: SparkListenerThriftServerOperationClosed): Unit = { + executionList.get(e.id).closeTimestamp = e.closeTime + executionList.get(e.id).state = ExecutionState.CLOSED + updateStoreWithTriggerEnabled(executionList.get(e.id)) + executionList.remove(e.id) + } + + // Update both live and history stores. Trigger is enabled by default, hence + // it will cleanup the entity which exceeds the threshold. + def updateStoreWithTriggerEnabled(entity: LiveEntity): Unit = { + entity.write(kvstore, System.nanoTime(), checkTriggers = true) + } + + // Update only live stores. If trigger is enabled, it will cleanup entity + // which exceeds the threshold. 
+ def updateLiveStore(entity: LiveEntity, trigger: Boolean = false): Unit = { + val now = System.nanoTime() + if (live && liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { + entity.write(kvstore, now, checkTriggers = trigger) + } + } + + /** Go through all `LiveEntity`s and use `entityFlushFunc(entity)` to flush them. */ + private def flush(entityFlushFunc: LiveEntity => Unit): Unit = { + sessionList.values.asScala.foreach(entityFlushFunc) + executionList.values.asScala.foreach(entityFlushFunc) + } + + private def getOrCreateSession( + sessionId: String, + startTime: Long, + ip: String, + username: String): LiveSessionData = { + sessionList.computeIfAbsent(sessionId, + (_: String) => new LiveSessionData(sessionId, startTime, ip, username)) + } + + private def getOrCreateExecution( + execId: String, statement: String, + sessionId: String, startTimestamp: Long, + userName: String): LiveExecutionData = { + executionList.computeIfAbsent(execId, + (_: String) => new LiveExecutionData(execId, statement, sessionId, startTimestamp, userName)) + } + + private def cleanupExecutions(count: Long): Unit = { + val countToDelete = calculateNumberToRemove(count, retainedStatements) + if (countToDelete <= 0L) { + return + } + val view = kvstore.view(classOf[ExecutionInfo]).index("finishTime").first(0L) + val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt) { j => + j.finishTimestamp != 0 + } + toDelete.foreach { j => kvstore.delete(j.getClass, j.execId) } + } + + private def cleanupSession(count: Long): Unit = { + val countToDelete = calculateNumberToRemove(count, retainedSessions) + if (countToDelete <= 0L) { + return + } + val view = kvstore.view(classOf[SessionInfo]).index("finishTime").first(0L) + val toDelete = KVUtils.viewToSeq(view, countToDelete.toInt) { j => + j.finishTimestamp != 0L + } + + toDelete.foreach { j => kvstore.delete(j.getClass, j.sessionId) } + } + + /** + * Remove at least (retainedSize / 10) items to reduce friction. 
Because tracking may be done + * asynchronously, this method may return 0 in case enough items have been deleted already. + */ + private def calculateNumberToRemove(dataSize: Long, retainedSize: Long): Long = { + if (dataSize > retainedSize) { + math.max(retainedSize / 10L, dataSize - retainedSize) + } else { + 0L + } + } +} + +private[thriftserver] class LiveExecutionData( + val execId: String, + val statement: String, + val sessionId: String, + val startTimestamp: Long, + val userName: String) extends LiveEntity { + + var finishTimestamp: Long = 0L + var closeTimestamp: Long = 0L + var executePlan: String = "" + var detail: String = "" + var state: ExecutionState.Value = ExecutionState.STARTED + val jobId: ArrayBuffer[String] = ArrayBuffer[String]() + var groupId: String = "" + + override protected def doUpdate(): Any = { + new ExecutionInfo( + execId, + statement, + sessionId, + startTimestamp, + userName, + finishTimestamp, + closeTimestamp, + executePlan, + detail, + state, + jobId, + groupId) + } +} + +private[thriftserver] class LiveSessionData( + val sessionId: String, + val startTimeStamp: Long, + val ip: String, + val username: String) extends LiveEntity { + + var finishTimestamp: Long = 0L + var totalExecution: Int = 0 + + override protected def doUpdate(): Any = { + new SessionInfo( + sessionId, + startTimeStamp, + ip, + username, + finishTimestamp, + totalExecution) + } +} diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala index 261e8fc912eb9..890a668275b81 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPage.scala @@ -17,39 +17,42 @@ package org.apache.spark.sql.hive.thriftserver.ui +import java.net.URLEncoder +import 
java.nio.charset.StandardCharsets.UTF_8 import java.util.Calendar import javax.servlet.http.HttpServletRequest -import scala.xml.Node +import scala.collection.JavaConverters._ +import scala.xml.{Node, Unparsed} import org.apache.commons.text.StringEscapeUtils import org.apache.spark.internal.Logging -import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.{ExecutionInfo, ExecutionState, SessionInfo} +import org.apache.spark.sql.hive.thriftserver.ui.ToolTips._ import org.apache.spark.ui._ import org.apache.spark.ui.UIUtils._ - +import org.apache.spark.util.Utils /** Page for Spark Web UI that shows statistics of the thrift server */ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("") with Logging { - - private val listener = parent.listener - private val startTime = Calendar.getInstance().getTime() - private val emptyCell = "-" + private val store = parent.store + private val startTime = parent.startTime /** Render the page */ def render(request: HttpServletRequest): Seq[Node] = { - val content = - listener.synchronized { // make sure all parts in this page are consistent - generateBasicStats() ++ -
    ++ + val content = store.synchronized { // make sure all parts in this page are consistent + generateBasicStats() ++ +
    ++

    - {listener.getOnlineSessionNum} session(s) are online, - running {listener.getTotalRunning} SQL statement(s) + {store.getOnlineSessionNum} + session(s) are online, + running + {store.getTotalRunning} + SQL statement(s)

    ++ generateSessionStatsTable(request) ++ generateSQLStatsTable(request) - } + } UIUtils.headerSparkPage(request, "JDBC/ODBC Server", content, parent) } @@ -68,53 +71,301 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" /** Generate stats of batch statements of the thrift server program */ private def generateSQLStatsTable(request: HttpServletRequest): Seq[Node] = { - val numStatement = listener.getExecutionList.size + + val numStatement = store.getExecutionList.size + val table = if (numStatement > 0) { - val headerRow = Seq("User", "JobID", "GroupID", "Start Time", "Finish Time", "Close Time", - "Execution Time", "Duration", "Statement", "State", "Detail") - val dataRows = listener.getExecutionList.sortBy(_.startTimestamp).reverse - - def generateDataRow(info: ExecutionInfo): Seq[Node] = { - val jobLink = info.jobId.map { id: String => - - [{id}] - + + val sqlTableTag = "sqlstat" + + val parameterOtherTable = request.getParameterMap().asScala + .filterNot(_._1.startsWith(sqlTableTag)) + .map { case (name, vals) => + name + "=" + vals(0) } - val detail = Option(info.detail).filter(!_.isEmpty).getOrElse(info.executePlan) - - {info.userName} - - {jobLink} - - {info.groupId} - {formatDate(info.startTimestamp)} - {if (info.finishTimestamp > 0) formatDate(info.finishTimestamp)} - {if (info.closeTimestamp > 0) formatDate(info.closeTimestamp)} - {formatDurationOption(Some(info.totalTime(info.finishTimestamp)))} - {formatDurationOption(Some(info.totalTime(info.closeTimestamp)))} - {info.statement} - {info.state} - {errorMessageCell(detail)} - + + val parameterSqlTablePage = request.getParameter(s"$sqlTableTag.page") + val parameterSqlTableSortColumn = request.getParameter(s"$sqlTableTag.sort") + val parameterSqlTableSortDesc = request.getParameter(s"$sqlTableTag.desc") + val parameterSqlPageSize = request.getParameter(s"$sqlTableTag.pageSize") + + val sqlTablePage = Option(parameterSqlTablePage).map(_.toInt).getOrElse(1) + val 
sqlTableSortColumn = Option(parameterSqlTableSortColumn).map { sortColumn => + UIUtils.decodeURLParameter(sortColumn) + }.getOrElse("Start Time") + val sqlTableSortDesc = Option(parameterSqlTableSortDesc).map(_.toBoolean).getOrElse( + // New executions should be shown above old executions by default. + sqlTableSortColumn == "Start Time" + ) + val sqlTablePageSize = Option(parameterSqlPageSize).map(_.toInt).getOrElse(100) + + try { + Some(new SqlStatsPagedTable( + request, + parent, + store.getExecutionList, + "sqlserver", + UIUtils.prependBaseUri(request, parent.basePath), + parameterOtherTable, + sqlTableTag, + pageSize = sqlTablePageSize, + sortColumn = sqlTableSortColumn, + desc = sqlTableSortDesc + ).table(sqlTablePage)) + } catch { + case e@(_: IllegalArgumentException | _: IndexOutOfBoundsException) => + Some(
    +

    Error while rendering job table:

    +
    +              {Utils.exceptionString(e)}
    +            
    +
    ) } + } else { + None + } + val content = + +

    + + SQL Statistics ({numStatement}) +

    +
    ++ +
    + {table.getOrElse("No statistics have been generated yet.")} +
    + content + } + + /** Generate stats of batch sessions of the thrift server program */ + private def generateSessionStatsTable(request: HttpServletRequest): Seq[Node] = { + val numSessions = store.getSessionList.size + val table = if (numSessions > 0) { + + val sessionTableTag = "sessionstat" + + val parameterOtherTable = request.getParameterMap().asScala + .filterNot(_._1.startsWith(sessionTableTag)) + .map { case (name, vals) => + name + "=" + vals(0) + } + + val parameterSessionTablePage = request.getParameter(s"$sessionTableTag.page") + val parameterSessionTableSortColumn = request.getParameter(s"$sessionTableTag.sort") + val parameterSessionTableSortDesc = request.getParameter(s"$sessionTableTag.desc") + val parameterSessionPageSize = request.getParameter(s"$sessionTableTag.pageSize") + + val sessionTablePage = Option(parameterSessionTablePage).map(_.toInt).getOrElse(1) + val sessionTableSortColumn = Option(parameterSessionTableSortColumn).map { sortColumn => + UIUtils.decodeURLParameter(sortColumn) + }.getOrElse("Start Time") + val sessionTableSortDesc = Option(parameterSessionTableSortDesc).map(_.toBoolean).getOrElse( + // New session should be shown above old session by default. + (sessionTableSortColumn == "Start Time") + ) + val sessionTablePageSize = Option(parameterSessionPageSize).map(_.toInt).getOrElse(100) - Some(UIUtils.listingTable(headerRow, generateDataRow, - dataRows, false, None, Seq(null), false)) + try { + Some(new SessionStatsPagedTable( + request, + parent, + store.getSessionList, + "sqlserver", + UIUtils.prependBaseUri(request, parent.basePath), + parameterOtherTable, + sessionTableTag, + pageSize = sessionTablePageSize, + sortColumn = sessionTableSortColumn, + desc = sessionTableSortDesc + ).table(sessionTablePage)) + } catch { + case e@(_: IllegalArgumentException | _: IndexOutOfBoundsException) => + Some(
    +

    Error while rendering job table:

    +
    +              {Utils.exceptionString(e)}
    +            
    +
    ) + } } else { None } val content = -
    SQL Statistics ({numStatement})
    ++ -
    -
      - {table.getOrElse("No statistics have been generated yet.")} -
    -
    + +

    + + Session Statistics ({numSessions}) +

    +
    ++ +
    + {table.getOrElse("No statistics have been generated yet.")} +
    content } +} + +private[ui] class SqlStatsPagedTable( + request: HttpServletRequest, + parent: ThriftServerTab, + data: Seq[ExecutionInfo], + subPath: String, + basePath: String, + parameterOtherTable: Iterable[String], + sqlStatsTableTag: String, + pageSize: Int, + sortColumn: String, + desc: Boolean) extends PagedTable[SqlStatsTableRow] { + + override val dataSource = new SqlStatsTableDataSource(data, pageSize, sortColumn, desc) + + private val parameterPath = s"$basePath/$subPath/?${parameterOtherTable.mkString("&")}" + + override def tableId: String = sqlStatsTableTag + + override def tableCssClass: String = + "table table-bordered table-condensed table-striped " + + "table-head-clickable table-cell-width-limited" + + override def pageLink(page: Int): String = { + val encodedSortColumn = URLEncoder.encode(sortColumn, UTF_8.name()) + parameterPath + + s"&$pageNumberFormField=$page" + + s"&$sqlStatsTableTag.sort=$encodedSortColumn" + + s"&$sqlStatsTableTag.desc=$desc" + + s"&$pageSizeFormField=$pageSize" + } + + override def pageSizeFormField: String = s"$sqlStatsTableTag.pageSize" + + override def pageNumberFormField: String = s"$sqlStatsTableTag.page" + + override def goButtonFormPath: String = { + val encodedSortColumn = URLEncoder.encode(sortColumn, UTF_8.name()) + s"$parameterPath&$sqlStatsTableTag.sort=$encodedSortColumn&$sqlStatsTableTag.desc=$desc" + } + + override def headers: Seq[Node] = { + val sqlTableHeaders = Seq("User", "JobID", "GroupID", "Start Time", "Finish Time", + "Close Time", "Execution Time", "Duration", "Statement", "State", "Detail") + + val tooltips = Seq(None, None, None, None, Some(THRIFT_SERVER_FINISH_TIME), + Some(THRIFT_SERVER_CLOSE_TIME), Some(THRIFT_SERVER_EXECUTION), + Some(THRIFT_SERVER_DURATION), None, None, None) + + assert(sqlTableHeaders.length == tooltips.length) + + val headerRow: Seq[Node] = { + sqlTableHeaders.zip(tooltips).map { case (header, tooltip) => + if (header == sortColumn) { + val headerLink = Unparsed( + 
parameterPath + + s"&$sqlStatsTableTag.sort=${URLEncoder.encode(header, UTF_8.name())}" + + s"&$sqlStatsTableTag.desc=${!desc}" + + s"&$sqlStatsTableTag.pageSize=$pageSize" + + s"#$sqlStatsTableTag") + val arrow = if (desc) "▾" else "▴" // UP or DOWN + + if (tooltip.nonEmpty) { + + + + {header} {Unparsed(arrow)} + + + + } else { + + + {header} {Unparsed(arrow)} + + + } + } else { + val headerLink = Unparsed( + parameterPath + + s"&$sqlStatsTableTag.sort=${URLEncoder.encode(header, UTF_8.name())}" + + s"&$sqlStatsTableTag.pageSize=$pageSize" + + s"#$sqlStatsTableTag") + + if(tooltip.nonEmpty) { + + + + {header} + + + + } else { + + + {header} + + + } + } + } + } + + {headerRow} + + } + + override def row(sqlStatsTableRow: SqlStatsTableRow): Seq[Node] = { + val info = sqlStatsTableRow.executionInfo + val startTime = info.startTimestamp + val executionTime = sqlStatsTableRow.executionTime + val duration = sqlStatsTableRow.duration + + def jobLinks(jobData: Seq[String]): Seq[Node] = { + jobData.map { jobId => + [{jobId.toString}] + } + } + + + + {info.userName} + + + {jobLinks(sqlStatsTableRow.jobId)} + + + {info.groupId} + + + {UIUtils.formatDate(startTime)} + + + {if (info.finishTimestamp > 0) formatDate(info.finishTimestamp)} + + + {if (info.closeTimestamp > 0) formatDate(info.closeTimestamp)} + + + + {formatDurationVerbose(executionTime)} + + + {formatDurationVerbose(duration)} + + + + {info.statement} + + + + {info.state} + + {errorMessageCell(sqlStatsTableRow.detail)} + + } + private def errorMessageCell(errorMessage: String): Seq[Node] = { val isMultiline = errorMessage.indexOf('\n') >= 0 @@ -124,73 +375,236 @@ private[ui] class ThriftServerPage(parent: ThriftServerTab) extends WebUIPage("" } else { errorMessage }) - val details = if (isMultiline) { - // scalastyle:off - - + details - ++ - - // scalastyle:on - } else { - "" - } - {errorSummary}{details} + val details = detailsUINode(isMultiline, errorMessage) + + {errorSummary}{details} + } - /** Generate stats 
of batch sessions of the thrift server program */ - private def generateSessionStatsTable(request: HttpServletRequest): Seq[Node] = { - val sessionList = listener.getSessionList - val numBatches = sessionList.size - val table = if (numBatches > 0) { - val dataRows = sessionList.sortBy(_.startTimestamp).reverse - val headerRow = Seq("User", "IP", "Session ID", "Start Time", "Finish Time", "Duration", - "Total Execute") - def generateDataRow(session: SessionInfo): Seq[Node] = { - val sessionLink = "%s/%s/session/?id=%s".format( - UIUtils.prependBaseUri(request, parent.basePath), parent.prefix, session.sessionId) - - {session.userName} - {session.ip} - {session.sessionId} - {formatDate(session.startTimestamp)} - {if (session.finishTimestamp > 0) formatDate(session.finishTimestamp)} - {formatDurationOption(Some(session.totalTime))} - {session.totalExecution.toString} - - } - Some(UIUtils.listingTable(headerRow, generateDataRow, dataRows, true, None, Seq(null), false)) - } else { - None - } + private def jobURL(request: HttpServletRequest, jobId: String): String = + "%s/jobs/job/?id=%s".format(UIUtils.prependBaseUri(request, parent.basePath), jobId) +} - val content = -
    Session Statistics ({numBatches})
    ++ -
    -
      - {table.getOrElse("No statistics have been generated yet.")} -
    -
    +private[ui] class SessionStatsPagedTable( + request: HttpServletRequest, + parent: ThriftServerTab, + data: Seq[SessionInfo], + subPath: String, + basePath: String, + parameterOtherTable: Iterable[String], + sessionStatsTableTag: String, + pageSize: Int, + sortColumn: String, + desc: Boolean) extends PagedTable[SessionInfo] { - content + override val dataSource = new SessionStatsTableDataSource(data, pageSize, sortColumn, desc) + + private val parameterPath = s"$basePath/$subPath/?${parameterOtherTable.mkString("&")}" + + override def tableId: String = sessionStatsTableTag + + override def tableCssClass: String = + "table table-bordered table-condensed table-striped " + + "table-head-clickable table-cell-width-limited" + + override def pageLink(page: Int): String = { + val encodedSortColumn = URLEncoder.encode(sortColumn, UTF_8.name()) + parameterPath + + s"&$pageNumberFormField=$page" + + s"&$sessionStatsTableTag.sort=$encodedSortColumn" + + s"&$sessionStatsTableTag.desc=$desc" + + s"&$pageSizeFormField=$pageSize" } + override def pageSizeFormField: String = s"$sessionStatsTableTag.pageSize" - /** - * Returns a human-readable string representing a duration such as "5 second 35 ms" - */ - private def formatDurationOption(msOption: Option[Long]): String = { - msOption.map(formatDurationVerbose).getOrElse(emptyCell) + override def pageNumberFormField: String = s"$sessionStatsTableTag.page" + + override def goButtonFormPath: String = { + val encodedSortColumn = URLEncoder.encode(sortColumn, UTF_8.name()) + s"$parameterPath&$sessionStatsTableTag.sort=$encodedSortColumn&$sessionStatsTableTag.desc=$desc" } - /** Generate HTML table from string data */ - private def listingTable(headers: Seq[String], data: Seq[Seq[String]]) = { - def generateDataRow(data: Seq[String]): Seq[Node] = { - {data.map(d => {d})} + override def headers: Seq[Node] = { + val sessionTableHeaders = + Seq("User", "IP", "Session ID", "Start Time", "Finish Time", "Duration", "Total Execute") + + 
val tooltips = Seq(None, None, None, None, None, Some(THRIFT_SESSION_DURATION), + Some(THRIFT_SESSION_TOTAL_EXECUTE)) + assert(sessionTableHeaders.length == tooltips.length) + val colWidthAttr = s"${100.toDouble / sessionTableHeaders.size}%" + + val headerRow: Seq[Node] = { + sessionTableHeaders.zip(tooltips).map { case (header, tooltip) => + if (header == sortColumn) { + val headerLink = Unparsed( + parameterPath + + s"&$sessionStatsTableTag.sort=${URLEncoder.encode(header, UTF_8.name())}" + + s"&$sessionStatsTableTag.desc=${!desc}" + + s"&$sessionStatsTableTag.pageSize=$pageSize" + + s"#$sessionStatsTableTag") + val arrow = if (desc) "▾" else "▴" // UP or DOWN + + + { + if (tooltip.nonEmpty) { + + {header} {Unparsed(arrow)} + + } else { + + {header} {Unparsed(arrow)} + + } + } + + + + } else { + val headerLink = Unparsed( + parameterPath + + s"&$sessionStatsTableTag.sort=${URLEncoder.encode(header, UTF_8.name())}" + + s"&$sessionStatsTableTag.pageSize=$pageSize" + + s"#$sessionStatsTableTag") + + + + { + if (tooltip.nonEmpty) { + + {header} + + } else { + {header} + } + } + + + } + } } - UIUtils.listingTable(headers, generateDataRow, data, fixedWidth = true) + + {headerRow} + + } + + override def row(session: SessionInfo): Seq[Node] = { + val sessionLink = "%s/%s/session/?id=%s".format( + UIUtils.prependBaseUri(request, parent.basePath), parent.prefix, session.sessionId) + + {session.userName} + {session.ip} + {session.sessionId} + {formatDate(session.startTimestamp)} + {if (session.finishTimestamp > 0) formatDate(session.finishTimestamp)} + {formatDurationVerbose(session.totalTime)} + {session.totalExecution.toString} + } } + private[ui] class SqlStatsTableRow( + val jobId: Seq[String], + val duration: Long, + val executionTime: Long, + val executionInfo: ExecutionInfo, + val detail: String) + + private[ui] class SqlStatsTableDataSource( + info: Seq[ExecutionInfo], + pageSize: Int, + sortColumn: String, + desc: Boolean) extends 
PagedDataSource[SqlStatsTableRow](pageSize) { + + // Convert ExecutionInfo to SqlStatsTableRow which contains the final contents to show in + // the table so that we can avoid creating duplicate contents during sorting the data + private val data = info.map(sqlStatsTableRow).sorted(ordering(sortColumn, desc)) + + private var _slicedStartTime: Set[Long] = null + + override def dataSize: Int = data.size + + override def sliceData(from: Int, to: Int): Seq[SqlStatsTableRow] = { + val r = data.slice(from, to) + _slicedStartTime = r.map(_.executionInfo.startTimestamp).toSet + r + } + + private def sqlStatsTableRow(executionInfo: ExecutionInfo): SqlStatsTableRow = { + val duration = executionInfo.totalTime(executionInfo.closeTimestamp) + val executionTime = executionInfo.totalTime(executionInfo.finishTimestamp) + val detail = Option(executionInfo.detail).filter(!_.isEmpty) + .getOrElse(executionInfo.executePlan) + val jobId = executionInfo.jobId.toSeq.sorted + + new SqlStatsTableRow(jobId, duration, executionTime, executionInfo, detail) + + } + + /** + * Return Ordering according to sortColumn and desc. 
+ */ + private def ordering(sortColumn: String, desc: Boolean): Ordering[SqlStatsTableRow] = { + val ordering: Ordering[SqlStatsTableRow] = sortColumn match { + case "User" => Ordering.by(_.executionInfo.userName) + case "JobID" => Ordering by (_.jobId.headOption) + case "GroupID" => Ordering.by(_.executionInfo.groupId) + case "Start Time" => Ordering.by(_.executionInfo.startTimestamp) + case "Finish Time" => Ordering.by(_.executionInfo.finishTimestamp) + case "Close Time" => Ordering.by(_.executionInfo.closeTimestamp) + case "Execution Time" => Ordering.by(_.executionTime) + case "Duration" => Ordering.by(_.duration) + case "Statement" => Ordering.by(_.executionInfo.statement) + case "State" => Ordering.by(_.executionInfo.state) + case "Detail" => Ordering.by(_.detail) + case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") + } + if (desc) { + ordering.reverse + } else { + ordering + } + } + + } + + private[ui] class SessionStatsTableDataSource( + info: Seq[SessionInfo], + pageSize: Int, + sortColumn: String, + desc: Boolean) extends PagedDataSource[SessionInfo](pageSize) { + + // Sorting SessionInfo data + private val data = info.sorted(ordering(sortColumn, desc)) + + private var _slicedStartTime: Set[Long] = null + + override def dataSize: Int = data.size + + override def sliceData(from: Int, to: Int): Seq[SessionInfo] = { + val r = data.slice(from, to) + _slicedStartTime = r.map(_.startTimestamp).toSet + r + } + + /** + * Return Ordering according to sortColumn and desc. 
+ */ + private def ordering(sortColumn: String, desc: Boolean): Ordering[SessionInfo] = { + val ordering: Ordering[SessionInfo] = sortColumn match { + case "User" => Ordering.by(_.userName) + case "IP" => Ordering.by(_.ip) + case "Session ID" => Ordering.by(_.sessionId) + case "Start Time" => Ordering by (_.startTimestamp) + case "Finish Time" => Ordering.by(_.finishTimestamp) + case "Duration" => Ordering.by(_.totalTime) + case "Total Execute" => Ordering.by(_.totalExecution) + case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") + } + if (desc) { + ordering.reverse + } else { + ordering + } + } + } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala index 81df1304085e8..c46c3d6b68a43 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerSessionPage.scala @@ -17,34 +17,29 @@ package org.apache.spark.sql.hive.thriftserver.ui -import java.util.Calendar import javax.servlet.http.HttpServletRequest +import scala.collection.JavaConverters._ import scala.xml.Node -import org.apache.commons.text.StringEscapeUtils - import org.apache.spark.internal.Logging -import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2.{ExecutionInfo, ExecutionState} import org.apache.spark.ui._ import org.apache.spark.ui.UIUtils._ +import org.apache.spark.util.Utils /** Page for Spark Web UI that shows statistics of jobs running in the thrift server */ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) extends WebUIPage("session") with Logging { - - private val listener = parent.listener - private val startTime = Calendar.getInstance().getTime() - private val emptyCell = "-" + val store = 
parent.store + private val startTime = parent.startTime /** Render the page */ def render(request: HttpServletRequest): Seq[Node] = { val parameterId = request.getParameter("id") require(parameterId != null && parameterId.nonEmpty, "Missing id parameter") - val content = - listener.synchronized { // make sure all parts in this page are consistent - val sessionStat = listener.getSession(parameterId).getOrElse(null) + val content = store.synchronized { // make sure all parts in this page are consistent + val sessionStat = store.getSession(parameterId).getOrElse(null) require(sessionStat != null, "Invalid sessionID[" + parameterId + "]") generateBasicStats() ++ @@ -75,92 +70,72 @@ private[ui] class ThriftServerSessionPage(parent: ThriftServerTab) /** Generate stats of batch statements of the thrift server program */ private def generateSQLStatsTable(request: HttpServletRequest, sessionID: String): Seq[Node] = { - val executionList = listener.getExecutionList + val executionList = store.getExecutionList .filter(_.sessionId == sessionID) val numStatement = executionList.size val table = if (numStatement > 0) { - val headerRow = Seq("User", "JobID", "GroupID", "Start Time", "Finish Time", "Close Time", - "Execution Time", "Duration", "Statement", "State", "Detail") - val dataRows = executionList.sortBy(_.startTimestamp).reverse - - def generateDataRow(info: ExecutionInfo): Seq[Node] = { - val jobLink = info.jobId.map { id: String => - - [{id}] - + + val sqlTableTag = "sqlsessionstat" + + val parameterOtherTable = request.getParameterMap().asScala + .filterNot(_._1.startsWith(sqlTableTag)) + .map { case (name, vals) => + name + "=" + vals(0) } - val detail = Option(info.detail).filter(!_.isEmpty).getOrElse(info.executePlan) - - {info.userName} - - {jobLink} - - {info.groupId} - {formatDate(info.startTimestamp)} - {formatDate(info.finishTimestamp)} - {formatDate(info.closeTimestamp)} - {formatDurationOption(Some(info.totalTime(info.finishTimestamp)))} - 
{formatDurationOption(Some(info.totalTime(info.closeTimestamp)))} - {info.statement} - {info.state} - {errorMessageCell(detail)} - - } - Some(UIUtils.listingTable(headerRow, generateDataRow, - dataRows, false, None, Seq(null), false)) + val parameterSqlTablePage = request.getParameter(s"$sqlTableTag.page") + val parameterSqlTableSortColumn = request.getParameter(s"$sqlTableTag.sort") + val parameterSqlTableSortDesc = request.getParameter(s"$sqlTableTag.desc") + val parameterSqlPageSize = request.getParameter(s"$sqlTableTag.pageSize") + + val sqlTablePage = Option(parameterSqlTablePage).map(_.toInt).getOrElse(1) + val sqlTableSortColumn = Option(parameterSqlTableSortColumn).map { sortColumn => + UIUtils.decodeURLParameter(sortColumn) + }.getOrElse("Start Time") + val sqlTableSortDesc = Option(parameterSqlTableSortDesc).map(_.toBoolean).getOrElse( + // New executions should be shown above old executions by default. + sqlTableSortColumn == "Start Time" + ) + val sqlTablePageSize = Option(parameterSqlPageSize).map(_.toInt).getOrElse(100) + + try { + Some(new SqlStatsPagedTable( + request, + parent, + executionList, + "sqlserver/session", + UIUtils.prependBaseUri(request, parent.basePath), + parameterOtherTable, + sqlTableTag, + pageSize = sqlTablePageSize, + sortColumn = sqlTableSortColumn, + desc = sqlTableSortDesc + ).table(sqlTablePage)) + } catch { + case e@(_: IllegalArgumentException | _: IndexOutOfBoundsException) => + Some(
    +

    Error while rendering job table:

    +
    +              {Utils.exceptionString(e)}
    +            
    +
    ) + } } else { None } - val content = -
    SQL Statistics
    ++ -
    -
      - {table.getOrElse("No statistics have been generated yet.")} -
    + +

    + + SQL Statistics +

    +
    ++ +
    + {table.getOrElse("No statistics have been generated yet.")}
    content } - - private def errorMessageCell(errorMessage: String): Seq[Node] = { - val isMultiline = errorMessage.indexOf('\n') >= 0 - val errorSummary = StringEscapeUtils.escapeHtml4( - if (isMultiline) { - errorMessage.substring(0, errorMessage.indexOf('\n')) - } else { - errorMessage - }) - val details = if (isMultiline) { - // scalastyle:off - - + details - ++ - - // scalastyle:on - } else { - "" - } - {errorSummary}{details} - } - - /** - * Returns a human-readable string representing a duration such as "5 second 35 ms" - */ - private def formatDurationOption(msOption: Option[Long]): String = { - msOption.map(formatDurationVerbose).getOrElse(emptyCell) - } - - /** Generate HTML table from string data */ - private def listingTable(headers: Seq[String], data: Seq[Seq[String]]) = { - def generateDataRow(data: Seq[String]): Seq[Node] = { - {data.map(d => {d})} - } - UIUtils.listingTable(headers, generateDataRow, data, fixedWidth = true) - } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala index db2066009b351..6d783b1c555a7 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerTab.scala @@ -19,28 +19,25 @@ package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging -import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 -import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} /** * Spark Web UI tab that shows statistics of jobs running in the thrift server. * This assumes the given SparkContext has enabled its SparkUI. 
*/ -private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) - extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { - +private[thriftserver] class ThriftServerTab( + val store: HiveThriftServer2AppStatusStore, + sparkUI: SparkUI) extends SparkUITab(sparkUI, "sqlserver") with Logging { override val name = "JDBC/ODBC Server" - val parent = getSparkUI(sparkContext) - val listener = HiveThriftServer2.listener + val parent = sparkUI + val startTime = sparkUI.store.applicationInfo().attempts.head.startTime attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) - - def detach() { - getSparkUI(sparkContext).detachTab(this) + def detach(): Unit = { + sparkUI.detachTab(this) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateControlMicroBatchStream.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ToolTips.scala similarity index 56% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateControlMicroBatchStream.scala rename to sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ToolTips.scala index 6a66f52c8f732..56ab766f4aabd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateControlMicroBatchStream.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/ui/ToolTips.scala @@ -15,17 +15,25 @@ * limitations under the License. */ -package org.apache.spark.sql.execution.streaming.sources +package org.apache.spark.sql.hive.thriftserver.ui -import org.apache.spark.sql.sources.v2.reader.streaming.{MicroBatchStream, Offset} +private[ui] object ToolTips { + val THRIFT_SERVER_FINISH_TIME = + "Execution finish time, before fetching the results" -// A special `MicroBatchStream` that can get latestOffset with a start offset. 
-trait RateControlMicroBatchStream extends MicroBatchStream { + val THRIFT_SERVER_CLOSE_TIME = + "Operation close time after fetching the results" - override def latestOffset(): Offset = { - throw new IllegalAccessException( - "latestOffset should not be called for RateControlMicroBatchReadSupport") - } + val THRIFT_SERVER_EXECUTION = + "Difference between start time and finish time" + + val THRIFT_SERVER_DURATION = + "Difference between start time and close time" + + val THRIFT_SESSION_TOTAL_EXECUTE = + "Number of operations submitted in this session" + + val THRIFT_SESSION_DURATION = + "Elapsed time since session start, or until closed if the session was closed" - def latestOffset(start: Offset): Offset } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 6e042ac41d9da..6609701be0ede 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -27,12 +27,11 @@ import scala.concurrent.Promise import scala.concurrent.duration._ import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging -import org.apache.spark.sql.hive.test.HiveTestUtils +import org.apache.spark.sql.hive.test.HiveTestJars import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.util.{ThreadUtils, Utils} @@ -165,7 +164,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt") runCliWithin(3.minute)( - "CREATE TABLE hive_test(key INT, val STRING);" + "CREATE TABLE hive_test(key INT, val 
STRING) USING hive;" -> "", "SHOW TABLES;" -> "hive_test", @@ -202,7 +201,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { } test("Commands using SerDe provided in --jars") { - val jarFile = HiveTestUtils.getHiveHcatalogCoreJar.getCanonicalPath + val jarFile = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath val dataFilePath = Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt") @@ -212,14 +211,14 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; """.stripMargin -> "", - "CREATE TABLE sourceTable (key INT, val STRING);" + "CREATE TABLE sourceTable (key INT, val STRING) USING hive;" -> "", s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTable;" -> "", "INSERT INTO TABLE t1 SELECT key, val FROM sourceTable;" -> "", - "SELECT count(key) FROM t1;" - -> "5", + "SELECT collect_list(array(val)) FROM t1;" + -> """[["val_238"],["val_86"],["val_311"],["val_27"],["val_165"]]""", "DROP TABLE t1;" -> "", "DROP TABLE sourceTable;" @@ -227,6 +226,32 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { ) } + test("SPARK-29022: Commands using SerDe provided in --hive.aux.jars.path") { + val dataFilePath = + Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt") + val hiveContribJar = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath + runCliWithin( + 3.minute, + Seq("--conf", s"spark.hadoop.${ConfVars.HIVEAUXJARS}=$hiveContribJar"))( + """CREATE TABLE addJarWithHiveAux(key string, val string) + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; + """.stripMargin + -> "", + "CREATE TABLE sourceTableForWithHiveAux (key INT, val STRING) USING hive;" + -> "", + s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTableForWithHiveAux;" + -> "", + "INSERT INTO TABLE addJarWithHiveAux SELECT key, val FROM sourceTableForWithHiveAux;" 
+ -> "", + "SELECT collect_list(array(val)) FROM addJarWithHiveAux;" + -> """[["val_238"],["val_86"],["val_311"],["val_27"],["val_165"]]""", + "DROP TABLE addJarWithHiveAux;" + -> "", + "DROP TABLE sourceTableForWithHiveAux;" + -> "" + ) + } + test("SPARK-11188 Analysis error reporting") { runCliWithin(timeout = 2.minute, errorResponses = Seq("AnalysisException"))( @@ -297,12 +322,82 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { } test("Support hive.aux.jars.path") { - val hiveContribJar = HiveTestUtils.getHiveContribJar.getCanonicalPath + val hiveContribJar = HiveTestJars.getHiveContribJar().getCanonicalPath runCliWithin( 1.minute, Seq("--conf", s"spark.hadoop.${ConfVars.HIVEAUXJARS}=$hiveContribJar"))( - s"CREATE TEMPORARY FUNCTION example_max AS '${classOf[UDAFExampleMax].getName}';" -> "", - "SELECT example_max(1);" -> "1" + "CREATE TEMPORARY FUNCTION example_format AS " + + "'org.apache.hadoop.hive.contrib.udf.example.UDFExampleFormat';" -> "", + "SELECT example_format('%o', 93);" -> "135" + ) + } + + test("SPARK-28840 test --jars command") { + val jarFile = new File("../../sql/hive/src/test/resources/SPARK-21101-1.0.jar").getCanonicalPath + runCliWithin( + 1.minute, + Seq("--jars", s"$jarFile"))( + "CREATE TEMPORARY FUNCTION testjar AS" + + " 'org.apache.spark.sql.hive.execution.UDTFStack';" -> "", + "SELECT testjar(1,'TEST-SPARK-TEST-jar', 28840);" -> "TEST-SPARK-TEST-jar\t28840" + ) + } + + test("SPARK-28840 test --jars and hive.aux.jars.path command") { + val jarFile = new File("../../sql/hive/src/test/resources/SPARK-21101-1.0.jar").getCanonicalPath + val hiveContribJar = HiveTestJars.getHiveContribJar().getCanonicalPath + runCliWithin( + 1.minute, + Seq("--jars", s"$jarFile", "--conf", + s"spark.hadoop.${ConfVars.HIVEAUXJARS}=$hiveContribJar"))( + "CREATE TEMPORARY FUNCTION testjar AS" + + " 'org.apache.spark.sql.hive.execution.UDTFStack';" -> "", + "SELECT testjar(1,'TEST-SPARK-TEST-jar', 28840);" -> 
"TEST-SPARK-TEST-jar\t28840", + "CREATE TEMPORARY FUNCTION example_max AS " + + "'org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax';" -> "", + "SELECT concat_ws(',', 'First', example_max(1234321), 'Third');" -> "First,1234321,Third" + ) + } + + test("SPARK-29022 Commands using SerDe provided in ADD JAR sql") { + val dataFilePath = + Thread.currentThread().getContextClassLoader.getResource("data/files/small_kv.txt") + val hiveContribJar = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath + runCliWithin( + 3.minute)( + s"ADD JAR ${hiveContribJar};" -> "", + """CREATE TABLE addJarWithSQL(key string, val string) + |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; + """.stripMargin + -> "", + "CREATE TABLE sourceTableForWithSQL(key INT, val STRING) USING hive;" + -> "", + s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTableForWithSQL;" + -> "", + "INSERT INTO TABLE addJarWithSQL SELECT key, val FROM sourceTableForWithSQL;" + -> "", + "SELECT collect_list(array(val)) FROM addJarWithSQL;" + -> """[["val_238"],["val_86"],["val_311"],["val_27"],["val_165"]]""", + "DROP TABLE addJarWithSQL;" + -> "", + "DROP TABLE sourceTableForWithSQL;" + -> "" + ) + } + + test("SPARK-26321 Should not split semicolon within quoted string literals") { + runCliWithin(3.minute)( + """select 'Test1', "^;^";""" -> "Test1\t^;^", + """select 'Test2', "\";";""" -> "Test2\t\";", + """select 'Test3', "\';";""" -> "Test3\t';", + "select concat('Test4', ';');" -> "Test4;" + ) + } + + test("Pad Decimal numbers with trailing zeros to the scale of the column") { + runCliWithin(1.minute)( + "SELECT CAST(1 AS DECIMAL(38, 18));" + -> "1.000000000000000000" ) } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala new file mode 100644 index 0000000000000..d056b3b2153cf --- /dev/null +++ 
b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/DummyListeners.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * These classes in this package are intentionally placed to the outer package of spark, + * because IsolatedClientLoader leverages Spark classloader for shared classess including + * spark package, and the test should fail if Spark initializes these listeners with + * IsolatedClientLoader. 
+ */ +package test.custom.listener + +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.streaming.StreamingQueryListener +import org.apache.spark.sql.util.QueryExecutionListener + +class DummyQueryExecutionListener extends QueryExecutionListener { + override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {} + override def onFailure(funcName: String, qe: QueryExecution, error: Throwable): Unit = {} +} + +class DummyStreamingQueryListener extends StreamingQueryListener { + override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {} + override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {} + override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {} +} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index b7185db2f2ae7..84eed7b2eda22 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets import java.sql.{Date, DriverManager, SQLException, Statement} import java.util.{Locale, UUID} +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future, Promise} @@ -34,7 +35,7 @@ import com.google.common.io.Files import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.jdbc.HiveDriver import org.apache.hive.service.auth.PlainSaslHelper -import org.apache.hive.service.cli.{FetchOrientation, FetchType, GetInfoType} +import org.apache.hive.service.cli.{FetchOrientation, FetchType, GetInfoType, 
RowSet} import org.apache.hive.service.cli.thrift.ThriftCLIServiceClient import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.transport.TSocket @@ -43,7 +44,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.HiveUtils -import org.apache.spark.sql.hive.test.HiveTestUtils +import org.apache.spark.sql.hive.test.HiveTestJars import org.apache.spark.sql.internal.StaticSQLConf.HIVE_THRIFT_SERVER_SINGLESESSION import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.util.{ThreadUtils, Utils} @@ -100,7 +101,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { withJdbcStatement("test_16563") { statement => val queries = Seq( - "CREATE TABLE test_16563(key INT, val STRING)", + "CREATE TABLE test_16563(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_16563") queries.foreach(statement.execute) @@ -144,10 +145,17 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { def executeTest(hiveList: String): Unit = { hiveList.split(";").foreach{ m => val kv = m.split("=") - // select "${a}"; ---> avalue - val resultSet = statement.executeQuery("select \"${" + kv(0) + "}\"") + val k = kv(0) + val v = kv(1) + val modValue = s"${v}_MOD_VALUE" + // select '${a}'; ---> avalue + val resultSet = statement.executeQuery(s"select '$${$k}'") resultSet.next() - assert(resultSet.getString(1) === kv(1)) + assert(resultSet.getString(1) === v) + statement.executeQuery(s"set $k=$modValue") + val modResultSet = statement.executeQuery(s"select '$${$k}'") + modResultSet.next() + assert(modResultSet.getString(1) === s"$modValue") } } } @@ -157,7 +165,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { withJdbcStatement("test") { statement => val queries = Seq( "SET spark.sql.shuffle.partitions=3", - "CREATE TABLE test(key INT, 
val STRING)", + "CREATE TABLE test(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", "CACHE TABLE test") @@ -183,7 +191,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { test("SPARK-3004 regression: result set containing NULL") { withJdbcStatement("test_null") { statement => val queries = Seq( - "CREATE TABLE test_null(key INT, val STRING)", + "CREATE TABLE test_null(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKvWithNull}' OVERWRITE INTO TABLE test_null") queries.foreach(statement.execute) @@ -203,7 +211,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { test("SPARK-4292 regression: result set iterator issue") { withJdbcStatement("test_4292") { statement => val queries = Seq( - "CREATE TABLE test_4292(key INT, val STRING)", + "CREATE TABLE test_4292(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_4292") queries.foreach(statement.execute) @@ -220,7 +228,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { test("SPARK-4309 regression: Date type support") { withJdbcStatement("test_date") { statement => val queries = Seq( - "CREATE TABLE test_date(key INT, value STRING)", + "CREATE TABLE test_date(key INT, value STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_date") queries.foreach(statement.execute) @@ -237,7 +245,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { test("SPARK-4407 regression: Complex type support") { withJdbcStatement("test_map") { statement => val queries = Seq( - "CREATE TABLE test_map(key INT, value STRING)", + "CREATE TABLE test_map(key INT, value STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map") queries.foreach(statement.execute) @@ -260,7 +268,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { test("SPARK-12143 
regression: Binary type support") { withJdbcStatement("test_binary") { statement => val queries = Seq( - "CREATE TABLE test_binary(key INT, value STRING)", + "CREATE TABLE test_binary(key INT, value STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_binary") queries.foreach(statement.execute) @@ -286,7 +294,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { { statement => val queries = Seq( - "CREATE TABLE test_map(key INT, value STRING)", + "CREATE TABLE test_map(key INT, value STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map", "CACHE TABLE test_table AS SELECT key FROM test_map ORDER BY key DESC", "CREATE DATABASE db1") @@ -485,7 +493,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { withMultipleConnectionJdbcStatement("smallKV", "addJar")( { statement => - val jarFile = HiveTestUtils.getHiveHcatalogCoreJar.getCanonicalPath + val jarFile = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath statement.executeQuery(s"ADD JAR $jarFile") }, @@ -493,7 +501,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { { statement => val queries = Seq( - "CREATE TABLE smallKV(key INT, val STRING)", + "CREATE TABLE smallKV(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE smallKV", """CREATE TABLE addJar(key string) |ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' @@ -590,7 +598,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { val dataPath = "../hive/src/test/resources/data/files/kv1.txt" Seq( - "CREATE TABLE test_udtf(key INT, value STRING)", + "CREATE TABLE test_udtf(key INT, value STRING) USING hive", s"LOAD DATA LOCAL INPATH '$dataPath' OVERWRITE INTO TABLE test_udtf" ).foreach(statement.execute) @@ -662,6 +670,107 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { assert(rs.getBigDecimal(1) === new 
java.math.BigDecimal("1.000000000000000000")) } } + + test("Support interval type") { + withJdbcStatement() { statement => + val rs = statement.executeQuery("SELECT interval 3 months 1 hours") + assert(rs.next()) + assert(rs.getString(1) === "3 months 1 hours") + } + // Invalid interval value + withJdbcStatement() { statement => + val e = intercept[SQLException] { + statement.executeQuery("SELECT interval 3 months 1 hou") + } + assert(e.getMessage.contains("org.apache.spark.sql.catalyst.parser.ParseException")) + } + } + + test("ThriftCLIService FetchResults FETCH_FIRST, FETCH_NEXT, FETCH_PRIOR") { + def checkResult(rows: RowSet, start: Long, end: Long): Unit = { + assert(rows.getStartOffset() == start) + assert(rows.numRows() == end - start) + rows.iterator.asScala.zip((start until end).iterator).foreach { case (row, v) => + assert(row(0).asInstanceOf[Long] === v) + } + } + + withCLIServiceClient { client => + val user = System.getProperty("user.name") + val sessionHandle = client.openSession(user, "") + + val confOverlay = new java.util.HashMap[java.lang.String, java.lang.String] + val operationHandle = client.executeStatement( + sessionHandle, + "SELECT * FROM range(10)", + confOverlay) // 10 rows result with sequence 0, 1, 2, ..., 9 + var rows: RowSet = null + + // Fetch 5 rows with FETCH_NEXT + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 5, FetchType.QUERY_OUTPUT) + checkResult(rows, 0, 5) // fetched [0, 5) + + // Fetch another 2 rows with FETCH_NEXT + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 2, FetchType.QUERY_OUTPUT) + checkResult(rows, 5, 7) // fetched [5, 7) + + // FETCH_PRIOR 3 rows + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_PRIOR, 3, FetchType.QUERY_OUTPUT) + checkResult(rows, 2, 5) // fetched [2, 5) + + // FETCH_PRIOR again will scroll back to 0, and then the returned result + // may overlap the results of previous FETCH_PRIOR + rows = client.fetchResults( + 
operationHandle, FetchOrientation.FETCH_PRIOR, 3, FetchType.QUERY_OUTPUT) + checkResult(rows, 0, 3) // fetched [0, 3) + + // FETCH_PRIOR again will stay at 0 + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_PRIOR, 4, FetchType.QUERY_OUTPUT) + checkResult(rows, 0, 4) // fetched [0, 4) + + // FETCH_NEXT will continue moving forward from offset 4 + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 10, FetchType.QUERY_OUTPUT) + checkResult(rows, 4, 10) // fetched [4, 10) until the end of results + + // FETCH_NEXT is at end of results + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 5, FetchType.QUERY_OUTPUT) + checkResult(rows, 10, 10) // fetched empty [10, 10) (at end of results) + + // FETCH_NEXT is at end of results again + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 2, FetchType.QUERY_OUTPUT) + checkResult(rows, 10, 10) // fetched empty [10, 10) (at end of results) + + // FETCH_PRIOR 1 row yet again + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_PRIOR, 1, FetchType.QUERY_OUTPUT) + checkResult(rows, 9, 10) // fetched [9, 10) + + // FETCH_NEXT will return 0 rows yet again + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 5, FetchType.QUERY_OUTPUT) + checkResult(rows, 10, 10) // fetched empty [10, 10) (at end of results) + + // FETCH_FIRST restarts from the first row + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_FIRST, 3, FetchType.QUERY_OUTPUT) + checkResult(rows, 0, 3) // fetched [0, 3) + + // Fetch till the end of rows with FETCH_NEXT + rows = client.fetchResults( + operationHandle, FetchOrientation.FETCH_NEXT, 1000, FetchType.QUERY_OUTPUT) + checkResult(rows, 3, 10) // fetched [3, 10) + + client.closeOperation(operationHandle) + client.closeSession(sessionHandle) + } + } } class SingleSessionSuite extends HiveThriftJdbcTest { @@ -681,6 +790,8 @@ class SingleSessionSuite extends
HiveThriftJdbcTest { Seq( "SET foo=bar", s"ADD JAR $jarURL", + "CREATE TABLE test_udtf(key INT, value STRING) USING hive", + s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_udtf", s"""CREATE TEMPORARY FUNCTION udtf_count2 |AS 'org.apache.spark.sql.hive.execution.GenericUDTFCount2' """.stripMargin @@ -707,6 +818,16 @@ class SingleSessionSuite extends HiveThriftJdbcTest { assert(rs2.next()) assert(rs2.getString(1) === "Usage: N/A.") + + val rs3 = statement.executeQuery( + "SELECT key, cc FROM test_udtf LATERAL VIEW udtf_count2(value) dd AS cc") + assert(rs3.next()) + assert(rs3.getInt(1) === 165) + assert(rs3.getInt(2) === 5) + + assert(rs3.next()) + assert(rs3.getInt(1) === 165) + assert(rs3.getInt(2) === 5) } finally { statement.executeQuery("DROP TEMPORARY FUNCTION udtf_count2") } @@ -770,7 +891,7 @@ class HiveThriftHttpServerSuite extends HiveThriftJdbcTest { withJdbcStatement("test") { statement => val queries = Seq( "SET spark.sql.shuffle.partitions=3", - "CREATE TABLE test(key INT, val STRING)", + "CREATE TABLE test(key INT, val STRING) USING hive", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test", "CACHE TABLE test") @@ -820,7 +941,7 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test { s"jdbc:hive2://localhost:$serverPort/?${hiveConfList}#${hiveVarList}" } - def withMultipleConnectionJdbcStatement(tableNames: String*)(fs: (Statement => Unit)*) { + def withMultipleConnectionJdbcStatement(tableNames: String*)(fs: (Statement => Unit)*): Unit = { val user = System.getProperty("user.name") val connections = fs.map { _ => DriverManager.getConnection(jdbcUri, user, "") } val statements = connections.map(_.createStatement()) @@ -841,7 +962,7 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test { } } - def withDatabase(dbNames: String*)(fs: (Statement => Unit)*) { + def withDatabase(dbNames: String*)(fs: (Statement => Unit)*): Unit = { val user = System.getProperty("user.name") val 
connections = fs.map { _ => DriverManager.getConnection(jdbcUri, user, "") } val statements = connections.map(_.createStatement()) @@ -857,7 +978,7 @@ abstract class HiveThriftJdbcTest extends HiveThriftServer2Test { } } - def withJdbcStatement(tableNames: String*)(f: Statement => Unit) { + def withJdbcStatement(tableNames: String*)(f: Statement => Unit): Unit = { withMultipleConnectionJdbcStatement(tableNames: _*)(f) } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala new file mode 100644 index 0000000000000..ce610098156f3 --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SharedThriftServer.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.thriftserver + +import java.sql.{DriverManager, Statement} + +import scala.concurrent.duration._ +import scala.util.{Random, Try} + +import org.apache.hadoop.hive.conf.HiveConf.ConfVars + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.test.SharedSparkSession + +trait SharedThriftServer extends SharedSparkSession { + + private var hiveServer2: HiveThriftServer2 = _ + + override def beforeAll(): Unit = { + super.beforeAll() + // Chooses a random port between 10000 and 19999 + var listeningPort = 10000 + Random.nextInt(10000) + + // Retries up to 3 times with different port numbers if the server fails to start + (1 to 3).foldLeft(Try(startThriftServer(listeningPort, 0))) { case (started, attempt) => + started.orElse { + listeningPort += 1 + Try(startThriftServer(listeningPort, attempt)) + } + }.recover { + case cause: Throwable => + throw cause + }.get + logInfo("HiveThriftServer2 started successfully") + } + + override def afterAll(): Unit = { + try { + hiveServer2.stop() + } finally { + super.afterAll() + } + } + + protected def withJdbcStatement(fs: (Statement => Unit)*): Unit = { + val user = System.getProperty("user.name") + + val serverPort = hiveServer2.getHiveConf.get(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname) + val connections = + fs.map { _ => DriverManager.getConnection(s"jdbc:hive2://localhost:$serverPort", user, "") } + val statements = connections.map(_.createStatement()) + + try { + statements.zip(fs).foreach { case (s, f) => f(s) } + } finally { + statements.foreach(_.close()) + connections.foreach(_.close()) + } + } + + private def startThriftServer(port: Int, attempt: Int): Unit = { + logInfo(s"Trying to start HiveThriftServer2: port=$port, attempt=$attempt") + val sqlContext = spark.newSession().sqlContext + sqlContext.setConf(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, port.toString) + hiveServer2 = HiveThriftServer2.startWithContext(sqlContext) + + // Wait for thrift server to be ready 
to serve the query, via executing simple query + // till the query succeeds. See SPARK-30345 for more details. + eventually(timeout(30.seconds), interval(1.seconds)) { + withJdbcStatement { _.execute("SELECT 1") } + } + } +} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkMetadataOperationSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkMetadataOperationSuite.scala index 21870ffd463ec..f7ee3e0a46cd1 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkMetadataOperationSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkMetadataOperationSuite.scala @@ -231,4 +231,20 @@ class SparkMetadataOperationSuite extends HiveThriftJdbcTest { assert(!rs.next()) } } + + test("GetTypeInfo Thrift API") { + def checkResult(rs: ResultSet, typeNames: Seq[String]): Unit = { + for (i <- typeNames.indices) { + assert(rs.next()) + assert(rs.getString("TYPE_NAME") === typeNames(i)) + } + // Make sure there are no more elements + assert(!rs.next()) + } + + withJdbcStatement() { statement => + val metaData = statement.getConnection.getMetaData + checkResult(metaData.getTypeInfo, ThriftserverShimUtils.supportedType().map(_.getName)) + } + } } diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala new file mode 100644 index 0000000000000..ffd1fc48f19fe --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLEnvSuite.scala @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver + +import test.custom.listener.{DummyQueryExecutionListener, DummyStreamingQueryListener} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.launcher.SparkLauncher +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.hive.HiveUtils.{HIVE_METASTORE_JARS, HIVE_METASTORE_VERSION} +import org.apache.spark.sql.hive.test.TestHiveContext +import org.apache.spark.sql.internal.StaticSQLConf.{QUERY_EXECUTION_LISTENERS, STREAMING_QUERY_LISTENERS, WAREHOUSE_PATH} + +class SparkSQLEnvSuite extends SparkFunSuite { + test("SPARK-29604 external listeners should be initialized with Spark classloader") { + withSystemProperties( + QUERY_EXECUTION_LISTENERS.key -> classOf[DummyQueryExecutionListener].getCanonicalName, + STREAMING_QUERY_LISTENERS.key -> classOf[DummyStreamingQueryListener].getCanonicalName, + WAREHOUSE_PATH.key -> TestHiveContext.makeWarehouseDir().toURI.getPath, + // The issue occured from "maven" and list of custom jars, but providing list of custom + // jars to initialize HiveClient isn't trivial, so just use "maven". 
+ HIVE_METASTORE_JARS.key -> "maven", + HIVE_METASTORE_VERSION.key -> null, + SparkLauncher.SPARK_MASTER -> "local[2]", + "spark.app.name" -> "testApp") { + + try { + SparkSQLEnv.init() + + val session = SparkSession.getActiveSession + assert(session.isDefined) + assert(session.get.listenerManager.listListeners() + .exists(_.isInstanceOf[DummyQueryExecutionListener])) + assert(session.get.streams.listListeners() + .exists(_.isInstanceOf[DummyStreamingQueryListener])) + } finally { + SparkSQLEnv.stop() + } + } + } + + private def withSystemProperties(pairs: (String, String)*)(f: => Unit): Unit = { + def setProperties(properties: Seq[(String, String)]): Unit = { + properties.foreach { case (key, value) => + if (value != null) { + System.setProperty(key, value) + } else { + System.clearProperty(key) + } + } + } + + val oldValues = pairs.map { kv => kv._1 -> System.getProperty(kv._1) } + try { + setProperties(pairs) + f + } finally { + setProperties(oldValues) + } + } +} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala index f198372a4c998..a63b5dac0aac3 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/SparkThriftServerProtocolVersionsSuite.scala @@ -261,10 +261,10 @@ class SparkThriftServerProtocolVersionsSuite extends HiveThriftJdbcTest { } } - // We do not fully support interval type - ignore(s"$version get interval type") { + test(s"$version get interval type") { testExecuteStatementWithProtocolVersion(version, "SELECT interval '1' year '2' day") { rs => assert(rs.next()) + assert(rs.getString(1) === "1 years 2 days") } } diff --git 
a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index 1f7b3feae47b5..d9ac9ab441f0c 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -18,17 +18,16 @@ package org.apache.spark.sql.hive.thriftserver import java.io.File -import java.sql.{DriverManager, SQLException, Statement, Timestamp} -import java.util.Locale +import java.sql.{SQLException, Statement, Timestamp} +import java.util.{Locale, MissingFormatArgumentException} -import scala.util.{Random, Try} import scala.util.control.NonFatal -import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hive.service.cli.HiveSQLException -import org.scalatest.Ignore +import org.apache.commons.lang3.exception.ExceptionUtils -import org.apache.spark.sql.{AnalysisException, SQLQueryTestSuite} +import org.apache.spark.SparkException +import org.apache.spark.sql.SQLQueryTestSuite +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.util.fileToString import org.apache.spark.sql.execution.HiveResult import org.apache.spark.sql.internal.SQLConf @@ -36,88 +35,67 @@ import org.apache.spark.sql.types._ /** * Re-run all the tests in SQLQueryTestSuite via Thrift Server. - * Note that this TestSuite does not support maven. + * + * To run the entire test suite: + * {{{ + * build/sbt "hive-thriftserver/test-only *ThriftServerQueryTestSuite" -Phive-thriftserver + * }}} + * + * This test suite won't generate golden files. To re-generate golden files for entire suite, run: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/test-only *SQLQueryTestSuite" + * }}} * * TODO: * 1. Support UDF testing. * 2. 
Support DESC command. * 3. Support SHOW command. */ -@Ignore -class ThriftServerQueryTestSuite extends SQLQueryTestSuite { - - private var hiveServer2: HiveThriftServer2 = _ - - override def beforeEach(): Unit = { - // Chooses a random port between 10000 and 19999 - var listeningPort = 10000 + Random.nextInt(10000) - - // Retries up to 3 times with different port numbers if the server fails to start - (1 to 3).foldLeft(Try(startThriftServer(listeningPort, 0))) { case (started, attempt) => - started.orElse { - listeningPort += 1 - Try(startThriftServer(listeningPort, attempt)) - } - }.recover { - case cause: Throwable => - throw cause - }.get - logInfo("HiveThriftServer2 started successfully") - } - - override def afterEach(): Unit = { - hiveServer2.stop() - } - - override val isTestWithConfigSets = false +class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServer { /** List of test cases to ignore, in lower cases. */ - override def blackList: Set[String] = Set( - "blacklist.sql", // Do NOT remove this one. It is here to test the blacklist functionality. 
+ override def blackList: Set[String] = super.blackList ++ Set( // Missing UDF - "pgSQL/boolean.sql", - "pgSQL/case.sql", + "postgreSQL/boolean.sql", + "postgreSQL/case.sql", // SPARK-28624 "date.sql", - // SPARK-28619 - "pgSQL/aggregates_part1.sql", - "group-by.sql", // SPARK-28620 - "pgSQL/float4.sql", + "postgreSQL/float4.sql", // SPARK-28636 "decimalArithmeticOperations.sql", "literals.sql", "subquery/scalar-subquery/scalar-subquery-predicate.sql", "subquery/in-subquery/in-limit.sql", + "subquery/in-subquery/in-group-by.sql", "subquery/in-subquery/simple-in.sql", "subquery/in-subquery/in-order-by.sql", - "subquery/in-subquery/in-set-operations.sql", - // SPARK-28637 - "cast.sql", - "ansi/interval.sql" + "subquery/in-subquery/in-set-operations.sql" ) override def runQueries( queries: Seq[String], testCase: TestCase, - configSet: Option[Seq[(String, String)]]): Unit = { + configSet: Seq[(String, String)]): Unit = { // We do not test with configSet. withJdbcStatement { statement => loadTestData(statement) + configSet.foreach { case (k, v) => + statement.execute(s"SET $k = $v") + } + testCase match { - case _: PgSQLTest => - // PostgreSQL enabled cartesian product by default. - statement.execute(s"SET ${SQLConf.CROSS_JOINS_ENABLED.key} = true") - statement.execute(s"SET ${SQLConf.ANSI_SQL_PARSER.key} = true") - statement.execute(s"SET ${SQLConf.PREFER_INTEGRAL_DIVISION.key} = true") + case _: PgSQLTest | _: AnsiTest => + statement.execute(s"SET ${SQLConf.ANSI_ENABLED.key} = true") case _ => + statement.execute(s"SET ${SQLConf.ANSI_ENABLED.key} = false") } // Run the SQL queries preparing them for comparison. val outputs: Seq[QueryOutput] = queries.map { sql => - val output = getNormalizedResult(statement, sql) + val (_, output) = handleExceptions(getNormalizedResult(statement, sql)) // We might need to do some query canonicalization in the future. 
QueryOutput( sql = sql, @@ -128,7 +106,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { // Read back the golden file. val expectedOutputs: Seq[QueryOutput] = { val goldenOutput = fileToString(new File(testCase.resultFile)) - val segments = goldenOutput.split("-- !query.+\n") + val segments = goldenOutput.split("-- !query.*\n") // each query has 3 segments, plus the header assert(segments.size == outputs.size * 3 + 1, @@ -136,8 +114,9 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { "Try regenerate the result files.") Seq.tabulate(outputs.size) { i => val sql = segments(i * 3 + 1).trim + val schema = segments(i * 3 + 2).trim val originalOut = segments(i * 3 + 3) - val output = if (isNeedSort(sql)) { + val output = if (schema != emptySchema && isNeedSort(sql)) { originalOut.split("\n").sorted.mkString("\n") } else { originalOut @@ -166,19 +145,48 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { || d.sql.toUpperCase(Locale.ROOT).startsWith("DESC\n") || d.sql.toUpperCase(Locale.ROOT).startsWith("DESCRIBE ") || d.sql.toUpperCase(Locale.ROOT).startsWith("DESCRIBE\n") => + // Skip show command, see HiveResult.hiveResultString case s if s.sql.toUpperCase(Locale.ROOT).startsWith("SHOW ") || s.sql.toUpperCase(Locale.ROOT).startsWith("SHOW\n") => - // AnalysisException should exactly match. 
+ + case _ if output.output.startsWith(classOf[NoSuchTableException].getPackage.getName) => + assert(expected.output.startsWith(classOf[NoSuchTableException].getPackage.getName), + s"Exception did not match for query #$i\n${expected.sql}, " + + s"expected: ${expected.output}, but got: ${output.output}") + + case _ if output.output.startsWith(classOf[SparkException].getName) && + output.output.contains("overflow") => + assert(expected.output.contains(classOf[ArithmeticException].getName) && + expected.output.contains("overflow"), + s"Exception did not match for query #$i\n${expected.sql}, " + + s"expected: ${expected.output}, but got: ${output.output}") + + case _ if output.output.startsWith(classOf[RuntimeException].getName) => + assert(expected.output.contains("Exception"), + s"Exception did not match for query #$i\n${expected.sql}, " + + s"expected: ${expected.output}, but got: ${output.output}") + + case _ if output.output.startsWith(classOf[ArithmeticException].getName) && + output.output.contains("causes overflow") => + assert(expected.output.contains(classOf[ArithmeticException].getName) && + expected.output.contains("causes overflow"), + s"Exception did not match for query #$i\n${expected.sql}, " + + s"expected: ${expected.output}, but got: ${output.output}") + + case _ if output.output.startsWith(classOf[MissingFormatArgumentException].getName) && + output.output.contains("Format specifier") => + assert(expected.output.contains(classOf[MissingFormatArgumentException].getName) && + expected.output.contains("Format specifier"), + s"Exception did not match for query #$i\n${expected.sql}, " + + s"expected: ${expected.output}, but got: ${output.output}") + // SQLException should not exactly match. We only assert the result contains Exception. 
case _ if output.output.startsWith(classOf[SQLException].getName) => assert(expected.output.contains("Exception"), s"Exception did not match for query #$i\n${expected.sql}, " + s"expected: ${expected.output}, but got: ${output.output}") - // HiveSQLException is usually a feature that our ThriftServer cannot support. - // Please add SQL to blackList. - case _ if output.output.startsWith(classOf[HiveSQLException].getName) => - assert(false, s"${output.output} for query #$i\n${expected.sql}") + case _ => assertResult(expected.output, s"Result did not match for query #$i\n${expected.sql}") { output.output @@ -201,7 +209,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { } } - override def listTestCases(): Seq[TestCase] = { + override lazy val listTestCases: Seq[TestCase] = { listFilesRecursively(new File(inputFilePath)).flatMap { file => val resultFile = file.getAbsolutePath.replace(inputFilePath, goldenFilePath) + ".out" val absPath = file.getAbsolutePath @@ -209,8 +217,10 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}udf")) { Seq.empty - } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}pgSQL")) { + } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}postgreSQL")) { PgSQLTestCase(testCaseName, absPath, resultFile) :: Nil + } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}ansi")) { + AnsiTestCase(testCaseName, absPath, resultFile) :: Nil } else { RegularTestCase(testCaseName, absPath, resultFile) :: Nil } @@ -225,54 +235,30 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { } } - private def getNormalizedResult(statement: Statement, sql: String): Seq[String] = { - try { - val rs = statement.executeQuery(sql) - val cols = rs.getMetaData.getColumnCount - val buildStr = () => (for (i <- 1 to cols) yield { - getHiveResult(rs.getObject(i)) - }).mkString("\t") - - val answer = 
Iterator.continually(rs.next()).takeWhile(identity).map(_ => buildStr()).toSeq - .map(replaceNotIncludedMsg) - if (isNeedSort(sql)) { - answer.sorted - } else { - answer + /** ThriftServer wraps the root exception, so it needs to be extracted. */ + override def handleExceptions(result: => (String, Seq[String])): (String, Seq[String]) = { + super.handleExceptions { + try { + result + } catch { + case NonFatal(e) => throw ExceptionUtils.getRootCause(e) } - } catch { - case a: AnalysisException => - // Do not output the logical plan tree which contains expression IDs. - // Also implement a crude way of masking expression IDs in the error message - // with a generic pattern "###". - val msg = if (a.plan.nonEmpty) a.getSimpleMessage else a.getMessage - Seq(a.getClass.getName, msg.replaceAll("#\\d+", "#x")).sorted - case NonFatal(e) => - // If there is an exception, put the exception class followed by the message. - Seq(e.getClass.getName, e.getMessage) } } - private def startThriftServer(port: Int, attempt: Int): Unit = { - logInfo(s"Trying to start HiveThriftServer2: port=$port, attempt=$attempt") - val sqlContext = spark.newSession().sqlContext - sqlContext.setConf(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, port.toString) - hiveServer2 = HiveThriftServer2.startWithContext(sqlContext) - } - - private def withJdbcStatement(fs: (Statement => Unit)*) { - val user = System.getProperty("user.name") - - val serverPort = hiveServer2.getHiveConf.get(ConfVars.HIVE_SERVER2_THRIFT_PORT.varname) - val connections = - fs.map { _ => DriverManager.getConnection(s"jdbc:hive2://localhost:$serverPort", user, "") } - val statements = connections.map(_.createStatement()) - - try { - statements.zip(fs).foreach { case (s, f) => f(s) } - } finally { - statements.foreach(_.close()) - connections.foreach(_.close()) + private def getNormalizedResult(statement: Statement, sql: String): (String, Seq[String]) = { + val rs = statement.executeQuery(sql) + val cols = rs.getMetaData.getColumnCount + 
val buildStr = () => (for (i <- 1 to cols) yield { + getHiveResult(rs.getObject(i)) + }).mkString("\t") + + val answer = Iterator.continually(rs.next()).takeWhile(identity).map(_ => buildStr()).toSeq + .map(replaceNotIncludedMsg) + if (isNeedSort(sql)) { + ("", answer.sorted) + } else { + ("", answer) } } @@ -337,7 +323,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { upperCase.startsWith("SELECT ") || upperCase.startsWith("SELECT\n") || upperCase.startsWith("WITH ") || upperCase.startsWith("WITH\n") || upperCase.startsWith("VALUES ") || upperCase.startsWith("VALUES\n") || - // pgSQL/union.sql + // postgreSQL/union.sql upperCase.startsWith("(") } @@ -350,7 +336,7 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite { case t: Timestamp => HiveResult.toHiveString((t, TimestampType)) case d: java.math.BigDecimal => - HiveResult.toHiveString((d, DecimalType.fromBigDecimal(d))) + HiveResult.toHiveString((d, DecimalType.fromDecimal(Decimal(d)))) case bin: Array[Byte] => HiveResult.toHiveString((bin, BinaryType)) case other => diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala new file mode 100644 index 0000000000000..3e1fce78ae71c --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerWithSparkContextSuite.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver + +class ThriftServerWithSparkContextSuite extends SharedThriftServer { + + test("SPARK-29911: Uncache cached tables when session closed") { + val cacheManager = spark.sharedState.cacheManager + val globalTempDB = spark.sharedState.globalTempViewManager.database + withJdbcStatement { statement => + statement.execute("CACHE TABLE tempTbl AS SELECT 1") + } + // the cached data of local temporary view should be uncached + assert(cacheManager.isEmpty) + try { + withJdbcStatement { statement => + statement.execute("CREATE GLOBAL TEMP VIEW globalTempTbl AS SELECT 1, 2") + statement.execute(s"CACHE TABLE $globalTempDB.globalTempTbl") + } + // the cached data of global temporary view shouldn't be uncached + assert(!cacheManager.isEmpty) + } finally { + withJdbcStatement { statement => + statement.execute(s"UNCACHE TABLE IF EXISTS $globalTempDB.globalTempTbl") + } + assert(cacheManager.isEmpty) + } + } +} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala index 47cf4f104d204..7f731f3d05e51 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/UISeleniumSuite.scala @@ -24,8 +24,8 @@ import org.openqa.selenium.WebDriver import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.scalatest.{BeforeAndAfterAll, 
Matchers} import org.scalatest.concurrent.Eventually._ -import org.scalatest.selenium.WebBrowser import org.scalatest.time.SpanSugar._ +import org.scalatestplus.selenium.WebBrowser import org.apache.spark.ui.SparkUICssErrorHandler diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2ListenerSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2ListenerSuite.scala new file mode 100644 index 0000000000000..075032fa5d099 --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/HiveThriftServer2ListenerSuite.scala @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.thriftserver.ui + +import java.util.Properties + +import org.mockito.Mockito.{mock, RETURNS_SMART_NULLS} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.internal.config.Status.{ASYNC_TRACKING_ENABLED, LIVE_ENTITY_UPDATE_PERIOD} +import org.apache.spark.scheduler.SparkListenerJobStart +import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.status.ElementTrackingStore +import org.apache.spark.util.kvstore.InMemoryStore + +class HiveThriftServer2ListenerSuite extends SparkFunSuite with BeforeAndAfter { + + private var kvstore: ElementTrackingStore = _ + + after { + if (kvstore != null) { + kvstore.close() + kvstore = null + } + } + + Seq(true, false).foreach { live => + test(s"listener events should store successfully (live = $live)") { + val (statusStore: HiveThriftServer2AppStatusStore, + listener: HiveThriftServer2Listener) = createAppStatusStore(live) + + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionId", + "user", System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationStart("id", "sessionId", + "dummy query", "groupId", System.currentTimeMillis(), "user")) + listener.onOtherEvent(SparkListenerThriftServerOperationParsed("id", "dummy plan")) + listener.onJobStart(SparkListenerJobStart( + 0, + System.currentTimeMillis(), + Nil, + createProperties)) + listener.onOtherEvent(SparkListenerThriftServerOperationFinish("id", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationClosed("id", + System.currentTimeMillis())) + + if (live) { + assert(statusStore.getOnlineSessionNum === 1) + } + + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionId", + System.currentTimeMillis())) + + if (!live) { + // To update history store + kvstore.close(false) + } + 
assert(statusStore.getOnlineSessionNum === 0) + assert(statusStore.getExecutionList.size === 1) + + val storeExecData = statusStore.getExecutionList.head + + assert(storeExecData.execId === "id") + assert(storeExecData.sessionId === "sessionId") + assert(storeExecData.executePlan === "dummy plan") + assert(storeExecData.jobId === Seq("0")) + assert(listener.noLiveData()) + } + } + + Seq(true, false).foreach { live => + test(s"cleanup session if exceeds the threshold (live = $live)") { + val (statusStore: HiveThriftServer2AppStatusStore, + listener: HiveThriftServer2Listener) = createAppStatusStore(true) + var time = 0 + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionId1", + "user", time)) + time += 1 + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionId2", + "user", time)) + time += 1 + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionId1", time)) + time += 1 + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionId2", time)) + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionId3", + "user", time)) + time += 1 + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionId3", time)) + + if (!live) { + kvstore.close(false) + } + assert(statusStore.getOnlineSessionNum === 0) + assert(statusStore.getSessionCount === 1) + assert(statusStore.getSession("sessionId1") === None) + assert(listener.noLiveData()) + } + } + + test("update execution info when jobstart event come after execution end event") { + val (statusStore: HiveThriftServer2AppStatusStore, + listener: HiveThriftServer2Listener) = createAppStatusStore(true) + + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionId", "user", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationStart("id", "sessionId", "dummy query", + "groupId", System.currentTimeMillis(), "user")) + 
listener.onOtherEvent(SparkListenerThriftServerOperationParsed("id", "dummy plan")) + listener.onOtherEvent(SparkListenerThriftServerOperationFinish("id", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationClosed("id", + System.currentTimeMillis())) + listener.onJobStart(SparkListenerJobStart( + 0, + System.currentTimeMillis(), + Nil, + createProperties)) + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionId", + System.currentTimeMillis())) + val exec = statusStore.getExecution("id") + assert(exec.isDefined) + assert(exec.get.jobId === Seq("0")) + assert(listener.noLiveData()) + } + + private def createProperties: Properties = { + val properties = new Properties() + properties.setProperty(SparkContext.SPARK_JOB_GROUP_ID, "groupId") + properties + } + + private def createAppStatusStore(live: Boolean) = { + val sparkConf = new SparkConf() + sparkConf.set(ASYNC_TRACKING_ENABLED, false) + .set(SQLConf.THRIFTSERVER_UI_SESSION_LIMIT, 1) + .set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + kvstore = new ElementTrackingStore(new InMemoryStore, sparkConf) + if (live) { + val server = mock(classOf[HiveThriftServer2], RETURNS_SMART_NULLS) + val listener = new HiveThriftServer2Listener(kvstore, sparkConf, Some(server)) + (new HiveThriftServer2AppStatusStore(kvstore, Some(listener)), listener) + } else { + (new HiveThriftServer2AppStatusStore(kvstore), + new HiveThriftServer2Listener(kvstore, sparkConf, None, false)) + } + } +} diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPageSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPageSuite.scala new file mode 100644 index 0000000000000..9f3c2957a182d --- /dev/null +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ui/ThriftServerPageSuite.scala @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.thriftserver.ui + +import java.util.{Calendar, Locale} +import javax.servlet.http.HttpServletRequest + +import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.scheduler.SparkListenerJobStart +import org.apache.spark.sql.hive.thriftserver._ +import org.apache.spark.status.ElementTrackingStore +import org.apache.spark.util.kvstore.InMemoryStore + + +class ThriftServerPageSuite extends SparkFunSuite with BeforeAndAfter { + + private var kvstore: ElementTrackingStore = _ + + after { + if (kvstore != null) { + kvstore.close() + kvstore = null + } + } + + /** + * Run a dummy session and return the store + */ + private def getStatusStore: HiveThriftServer2AppStatusStore = { + kvstore = new ElementTrackingStore(new InMemoryStore, new SparkConf()) + val server = mock(classOf[HiveThriftServer2], RETURNS_SMART_NULLS) + val sparkConf = new SparkConf + + val listener = new HiveThriftServer2Listener(kvstore, sparkConf, Some(server)) + val statusStore = new HiveThriftServer2AppStatusStore(kvstore, Some(listener)) + + listener.onOtherEvent(SparkListenerThriftServerSessionCreated("localhost", "sessionid", 
"user", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationStart("id", "sessionid", + "dummy query", "groupid", System.currentTimeMillis(), "user")) + listener.onOtherEvent(SparkListenerThriftServerOperationParsed("id", "dummy plan")) + listener.onOtherEvent(SparkListenerJobStart(0, System.currentTimeMillis(), Seq())) + listener.onOtherEvent(SparkListenerThriftServerOperationFinish("id", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerOperationClosed("id", + System.currentTimeMillis())) + listener.onOtherEvent(SparkListenerThriftServerSessionClosed("sessionid", + System.currentTimeMillis())) + + statusStore + } + + test("thriftserver page should load successfully") { + val store = getStatusStore + + val request = mock(classOf[HttpServletRequest]) + val tab = mock(classOf[ThriftServerTab], RETURNS_SMART_NULLS) + when(tab.startTime).thenReturn(Calendar.getInstance().getTime) + when(tab.store).thenReturn(store) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + val page = new ThriftServerPage(tab) + val html = page.render(request).toString().toLowerCase(Locale.ROOT) + + // session statistics and sql statistics tables should load successfully + assert(html.contains("session statistics (1)")) + assert(html.contains("sql statistics (1)")) + assert(html.contains("dummy query")) + assert(html.contains("dummy plan")) + + // Pagination support + assert(html.contains("")) + + // Hiding table support + assert(html.contains("class=\"collapse-aggregated-sessionstat" + + " collapse-table\" onclick=\"collapsetable")) + } + + test("thriftserver session page should load successfully") { + val store = getStatusStore + + val request = mock(classOf[HttpServletRequest]) + when(request.getParameter("id")).thenReturn("sessionid") + val tab = mock(classOf[ThriftServerTab], RETURNS_SMART_NULLS) + when(tab.startTime).thenReturn(Calendar.getInstance().getTime) + 
when(tab.store).thenReturn(store) + when(tab.appName).thenReturn("testing") + when(tab.headerTabs).thenReturn(Seq.empty) + val page = new ThriftServerSessionPage(tab) + val html = page.render(request).toString().toLowerCase(Locale.ROOT) + + // session sql statistics table should load successfully + assert(html.contains("sql statistics")) + assert(html.contains("user")) + assert(html.contains("groupid")) + + // Pagination support + assert(html.contains("")) + + // Hiding table support + assert(html.contains("collapse-aggregated-sqlsessionstat collapse-table\"" + + " onclick=\"collapsetable")) + } +} + diff --git a/sql/hive-thriftserver/v1.2.1/if/TCLIService.thrift b/sql/hive-thriftserver/v1.2/if/TCLIService.thrift similarity index 99% rename from sql/hive-thriftserver/v1.2.1/if/TCLIService.thrift rename to sql/hive-thriftserver/v1.2/if/TCLIService.thrift index 7cd6fa37cec37..225e319737811 100644 --- a/sql/hive-thriftserver/v1.2.1/if/TCLIService.thrift +++ b/sql/hive-thriftserver/v1.2/if/TCLIService.thrift @@ -1028,7 +1028,6 @@ enum TFetchOrientation { FETCH_NEXT, // Get the previous rowset. The fetch offset is ignored. - // NOT SUPPORTED FETCH_PRIOR, // Return the rowset at the given fetch offset relative @@ -1056,8 +1055,8 @@ struct TFetchResultsReq { // Operation from which to fetch results. 1: required TOperationHandle operationHandle - // The fetch orientation. For V1 this must be either - // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT. + // The fetch orientation. This must be either + // FETCH_NEXT, FETCH_PRIOR or FETCH_FIRST. Defaults to FETCH_NEXT. 
2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT // Max number of rows that should be returned in diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TArrayTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TArrayTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TArrayTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TArrayTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBinaryColumn.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBinaryColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBinaryColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBinaryColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBoolColumn.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBoolColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBoolColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBoolColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBoolValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBoolValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TBoolValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TBoolValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TByteColumn.java 
b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TByteColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TByteColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TByteColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TByteValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TByteValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TByteValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TByteValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCLIService.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCLIService.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCLIServiceConstants.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCLIServiceConstants.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCLIServiceConstants.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCLIServiceConstants.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenReq.java similarity index 100% rename from 
sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCancelOperationResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationReq.java 
rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseOperationResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TCloseSessionResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumn.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumnDesc.java 
b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumnDesc.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumnDesc.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumnDesc.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumnValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumnValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TColumnValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TColumnValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleColumn.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TDoubleValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementReq.java rename to 
sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TExecuteStatementResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchOrientation.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchOrientation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchOrientation.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchOrientation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TFetchResultsResp.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetCatalogsResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetColumnsResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenReq.java 
similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetFunctionsResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoReq.java rename to 
sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoType.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoType.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoType.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoType.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetInfoValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusResp.java 
b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetOperationStatusResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetResultSetMetadataResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasResp.java similarity index 
100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetSchemasResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTableTypesResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesResp.java rename to 
sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTablesResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TGetTypeInfoResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/THandleIdentifier.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/THandleIdentifier.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/THandleIdentifier.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/THandleIdentifier.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI16Column.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI16Column.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI16Column.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI16Column.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI16Value.java 
b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI16Value.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI16Value.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI16Value.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI32Column.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI32Column.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI32Column.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI32Column.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI32Value.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI32Value.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI32Value.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI32Value.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI64Column.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI64Column.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI64Column.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI64Column.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI64Value.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI64Value.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TI64Value.java rename to 
sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TI64Value.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TMapTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TMapTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TMapTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TMapTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionReq.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOpenSessionResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationHandle.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationHandle.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationHandle.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationHandle.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationState.java 
b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationState.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationState.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationState.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationType.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationType.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TOperationType.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TOperationType.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TPrimitiveTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TPrimitiveTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TPrimitiveTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TPrimitiveTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TProtocolVersion.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TProtocolVersion.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TProtocolVersion.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TProtocolVersion.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenReq.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenReq.java similarity index 100% rename from 
sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenReq.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenResp.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenResp.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRenewDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRow.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRow.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRow.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRow.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRowSet.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRowSet.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TRowSet.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TRowSet.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TSessionHandle.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TSessionHandle.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TSessionHandle.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TSessionHandle.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStatus.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStatus.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStatus.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStatus.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStatusCode.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStatusCode.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStatusCode.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStatusCode.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStringColumn.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStringColumn.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStringColumn.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStringColumn.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStringValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStringValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStringValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStringValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStructTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStructTypeEntry.java similarity index 100% rename from 
sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TStructTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TStructTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTableSchema.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTableSchema.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTableSchema.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTableSchema.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeDesc.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeDesc.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeDesc.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeDesc.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeId.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeId.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeId.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeId.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifierValue.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifierValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifierValue.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifierValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifiers.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifiers.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifiers.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TTypeQualifiers.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TUnionTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TUnionTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TUnionTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TUnionTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TUserDefinedTypeEntry.java b/sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TUserDefinedTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/gen/java/org/apache/hive/service/cli/thrift/TUserDefinedTypeEntry.java rename to sql/hive-thriftserver/v1.2/src/gen/java/org/apache/hive/service/cli/thrift/TUserDefinedTypeEntry.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/AbstractService.java 
b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/AbstractService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/AbstractService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/AbstractService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/CompositeService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/CompositeService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/CompositeService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/CompositeService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/CookieSigner.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/CookieSigner.java similarity index 97% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/CookieSigner.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/CookieSigner.java index ee51c24351c3d..f2a80c9d5ffbc 100644 --- a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/CookieSigner.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/CookieSigner.java @@ -81,7 +81,7 @@ public String verifyAndExtract(String signedStr) { if (LOG.isDebugEnabled()) { LOG.debug("Signature generated for " + rawValue + " inside verify is " + currentSignature); } - if (!originalSignature.equals(currentSignature)) { + if (!MessageDigest.isEqual(originalSignature.getBytes(), currentSignature.getBytes())) { throw new IllegalArgumentException("Invalid sign, original = " + originalSignature + " current = " + currentSignature); } diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/ServiceOperations.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/ServiceOperations.java similarity index 
100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/ServiceOperations.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/ServiceOperations.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/ServiceUtils.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/ServiceUtils.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/ServiceUtils.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/ServiceUtils.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java 
b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/CLIService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/CLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/CLIService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/CLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Column.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Column.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Column.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Column.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnValue.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ColumnValue.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ColumnValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/EmbeddedCLIServiceClient.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/EmbeddedCLIServiceClient.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/EmbeddedCLIServiceClient.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/EmbeddedCLIServiceClient.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/FetchOrientation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/FetchOrientation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/FetchOrientation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/FetchOrientation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/GetInfoType.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/GetInfoType.java similarity index 100% rename from 
sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/GetInfoType.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/GetInfoType.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/GetInfoValue.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/GetInfoValue.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/GetInfoValue.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/GetInfoValue.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Handle.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Handle.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Handle.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Handle.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/HiveSQLException.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/HiveSQLException.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/HiveSQLException.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/HiveSQLException.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ICLIService.java 
b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ICLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/ICLIService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/ICLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationHandle.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationHandle.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationHandle.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationHandle.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationState.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationState.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationState.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationState.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationType.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationType.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/OperationType.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/OperationType.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/PatternOrIdentifier.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/PatternOrIdentifier.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/PatternOrIdentifier.java rename to 
sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/PatternOrIdentifier.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowBasedSet.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowBasedSet.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowBasedSet.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowBasedSet.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowSet.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowSet.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowSet.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowSet.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowSetFactory.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowSetFactory.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/RowSetFactory.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/RowSetFactory.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/SessionHandle.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/SessionHandle.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/SessionHandle.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/SessionHandle.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TableSchema.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TableSchema.java similarity index 100% rename from 
sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TableSchema.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TableSchema.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Type.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Type.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/Type.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/Type.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java similarity index 99% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java index 0f72071d7e7d1..3e81f8afbd85f 100644 --- 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java @@ -73,7 +73,7 @@ public class GetTypeInfoOperation extends MetadataOperation { .addPrimitiveColumn("NUM_PREC_RADIX", Type.INT_TYPE, "Usually 2 or 10"); - private final RowSet rowSet; + protected final RowSet rowSet; protected GetTypeInfoOperation(HiveSession parentSession) { super(parentSession, OperationType.GET_TYPE_INFO); diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/Operation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java similarity index 98% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/Operation.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java index 19153b654b08a..51bb28748d9e2 100644 --- a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/Operation.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/Operation.java @@ -58,7 +58,10 @@ public abstract class Operation { private long lastAccessTime; protected static final EnumSet DEFAULT_FETCH_ORIENTATION_SET = - EnumSet.of(FetchOrientation.FETCH_NEXT,FetchOrientation.FETCH_FIRST); + EnumSet.of( + FetchOrientation.FETCH_NEXT, + FetchOrientation.FETCH_FIRST, + FetchOrientation.FETCH_PRIOR); protected Operation(HiveSession parentSession, OperationType opType, boolean runInBackground) { this.parentSession = parentSession; diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java similarity index 99% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java rename to 
sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index 18652f17aa926..c7726f1fac07a 100644 --- a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.Serializable; -import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.security.PrivilegedExceptionAction; import java.sql.SQLException; diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSession.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSession.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSession.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSession.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java diff --git 
a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/SessionManager.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/session/SessionManager.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/session/SessionManager.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java 
b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/server/HiveServer2.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/server/HiveServer2.java similarity index 89% rename from 
sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/server/HiveServer2.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/server/HiveServer2.java index a30be2bc06b9e..95233996cbbcb 100644 --- a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/server/HiveServer2.java +++ b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/server/HiveServer2.java @@ -31,8 +31,6 @@ import org.apache.commons.cli.ParseException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.common.LogUtils; -import org.apache.hadoop.hive.common.LogUtils.LogInitializationException; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.hive.common.util.HiveStringUtils; @@ -153,25 +151,13 @@ private static void startHiveServer2() throws Throwable { public static void main(String[] args) { HiveConf.setLoadHiveServer2Config(true); - try { - ServerOptionsProcessor oproc = new ServerOptionsProcessor("hiveserver2"); - ServerOptionsProcessorResponse oprocResponse = oproc.parse(args); + ServerOptionsProcessor oproc = new ServerOptionsProcessor("hiveserver2"); + ServerOptionsProcessorResponse oprocResponse = oproc.parse(args); - // NOTE: It is critical to do this here so that log4j is reinitialized - // before any of the other core hive classes are loaded - String initLog4jMessage = LogUtils.initHiveLog4j(); - LOG.debug(initLog4jMessage); - HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG); + HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG); - // Log debug message from "oproc" after log4j initialize properly - LOG.debug(oproc.getDebugMessage().toString()); - - // Call the executor which will execute the appropriate command based on the parsed options - oprocResponse.getServerOptionsExecutor().execute(); - } catch (LogInitializationException e) { - LOG.error("Error 
initializing log: " + e.getMessage(), e); - System.exit(-1); - } + // Call the executor which will execute the appropriate command based on the parsed options + oprocResponse.getServerOptionsExecutor().execute(); } /** diff --git a/sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java b/sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java similarity index 100% rename from sql/hive-thriftserver/v1.2.1/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java rename to sql/hive-thriftserver/v1.2/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java diff --git a/sql/hive-thriftserver/v1.2.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala b/sql/hive-thriftserver/v1.2/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala similarity index 89% rename from sql/hive-thriftserver/v1.2.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala rename to sql/hive-thriftserver/v1.2/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala index 87c0f8f6a571a..fbfc698ecb4bf 100644 --- a/sql/hive-thriftserver/v1.2.1/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala +++ b/sql/hive-thriftserver/v1.2/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala @@ -18,9 +18,9 @@ package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory -import org.apache.hadoop.hive.ql.exec.Utilities import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hive.service.cli.{RowSet, RowSetFactory, TableSchema, Type} +import org.apache.hive.service.cli.Type._ import org.apache.hive.service.cli.thrift.TProtocolVersion._ /** @@ -51,10 +51,12 @@ private[thriftserver] object ThriftserverShimUtils { private[thriftserver] def toJavaSQLType(s: String): Int = 
Type.getType(s).toJavaSQLType - private[thriftserver] def addToClassPath( - loader: ClassLoader, - auxJars: Array[String]): ClassLoader = { - Utilities.addToClassPath(loader, auxJars) + private[thriftserver] def supportedType(): Seq[Type] = { + Seq(NULL_TYPE, BOOLEAN_TYPE, STRING_TYPE, BINARY_TYPE, + TINYINT_TYPE, SMALLINT_TYPE, INT_TYPE, BIGINT_TYPE, + FLOAT_TYPE, DOUBLE_TYPE, DECIMAL_TYPE, + DATE_TYPE, TIMESTAMP_TYPE, + ARRAY_TYPE, MAP_TYPE, STRUCT_TYPE) } private[thriftserver] val testedProtocolVersions = Seq( diff --git a/sql/hive-thriftserver/v2.3.5/if/TCLIService.thrift b/sql/hive-thriftserver/v2.3/if/TCLIService.thrift similarity index 99% rename from sql/hive-thriftserver/v2.3.5/if/TCLIService.thrift rename to sql/hive-thriftserver/v2.3/if/TCLIService.thrift index 824b04919073a..9026cd25df5b3 100644 --- a/sql/hive-thriftserver/v2.3.5/if/TCLIService.thrift +++ b/sql/hive-thriftserver/v2.3/if/TCLIService.thrift @@ -1105,7 +1105,6 @@ enum TFetchOrientation { FETCH_NEXT, // Get the previous rowset. The fetch offset is ignored. - // NOT SUPPORTED FETCH_PRIOR, // Return the rowset at the given fetch offset relative @@ -1133,8 +1132,8 @@ struct TFetchResultsReq { // Operation from which to fetch results. 1: required TOperationHandle operationHandle - // The fetch orientation. For V1 this must be either - // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT. + // The fetch orientation. This must be either + // FETCH_NEXT, FETCH_PRIOR or FETCH_FIRST. Defaults to FETCH_NEXT. 
2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT // Max number of rows that should be returned in diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TArrayTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TArrayTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TArrayTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TArrayTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBinaryColumn.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBinaryColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBinaryColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBinaryColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolColumn.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TBoolValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TByteColumn.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TByteColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TByteColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TByteColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TByteValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TByteValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TByteValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TByteValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIService.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIService.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIServiceConstants.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIServiceConstants.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIServiceConstants.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCLIServiceConstants.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenReq.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCancelOperationResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationReq.java 
rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseOperationResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TCloseSessionResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumn.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnDesc.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnDesc.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnDesc.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnDesc.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TColumnValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleColumn.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TDoubleValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementReq.java rename to 
sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TExecuteStatementResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchOrientation.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchOrientation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchOrientation.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchOrientation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TFetchResultsResp.java diff --git 
a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCatalogsResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetColumnsResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceReq.java 
similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetCrossReferenceResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsReq.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetFunctionsResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoType.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoType.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoType.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoType.java diff --git 
a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetInfoValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetOperationStatusResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysResp.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetPrimaryKeysResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetResultSetMetadataResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasResp.java similarity index 100% rename 
from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetSchemasResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTableTypesResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesResp.java rename to 
sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTablesResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TGetTypeInfoResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/THandleIdentifier.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/THandleIdentifier.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/THandleIdentifier.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/THandleIdentifier.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Column.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Column.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Column.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Column.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Value.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Value.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Value.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI16Value.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Column.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Column.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Column.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Column.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Value.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Value.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Value.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI32Value.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Column.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Column.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Column.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Column.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Value.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Value.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Value.java rename to 
sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TI64Value.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TJobExecutionStatus.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TJobExecutionStatus.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TJobExecutionStatus.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TJobExecutionStatus.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TMapTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TMapTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TMapTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TMapTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOpenSessionResp.java diff --git 
a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationHandle.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationHandle.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationHandle.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationHandle.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationState.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationState.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationState.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationState.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationType.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationType.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationType.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TOperationType.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TPrimitiveTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TPrimitiveTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TPrimitiveTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TPrimitiveTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TProgressUpdateResp.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TProgressUpdateResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TProgressUpdateResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TProgressUpdateResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TProtocolVersion.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TProtocolVersion.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TProtocolVersion.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TProtocolVersion.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenReq.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenReq.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenReq.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenReq.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenResp.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenResp.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenResp.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRenewDelegationTokenResp.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRow.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRow.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRow.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRow.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRowSet.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRowSet.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TRowSet.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TRowSet.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TSessionHandle.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TSessionHandle.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TSessionHandle.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TSessionHandle.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStatus.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStatus.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStatus.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStatus.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStatusCode.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStatusCode.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStatusCode.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStatusCode.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStringColumn.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStringColumn.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStringColumn.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStringColumn.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStringValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStringValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStringValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStringValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStructTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStructTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TStructTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TStructTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTableSchema.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTableSchema.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTableSchema.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTableSchema.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeDesc.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeDesc.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeDesc.java rename to 
sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeDesc.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeId.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeId.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeId.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeId.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifierValue.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifierValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifierValue.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifierValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifiers.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifiers.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifiers.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TTypeQualifiers.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TUnionTypeEntry.java 
b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TUnionTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TUnionTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TUnionTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TUserDefinedTypeEntry.java b/sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TUserDefinedTypeEntry.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/gen/java/org/apache/hive/service/rpc/thrift/TUserDefinedTypeEntry.java rename to sql/hive-thriftserver/v2.3/src/gen/java/org/apache/hive/service/rpc/thrift/TUserDefinedTypeEntry.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/AbstractService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/AbstractService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/AbstractService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/AbstractService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/CompositeService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/CompositeService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/CompositeService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/CompositeService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/CookieSigner.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/CookieSigner.java similarity index 97% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/CookieSigner.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/CookieSigner.java index 9c8bd563268bc..593abd2e153a0 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/CookieSigner.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/CookieSigner.java @@ -81,7 +81,7 @@ public String verifyAndExtract(String signedStr) { if (LOG.isDebugEnabled()) { LOG.debug("Signature generated for " + rawValue + " inside verify is " + currentSignature); } - if (!originalSignature.equals(currentSignature)) { + if (!MessageDigest.isEqual(originalSignature.getBytes(), currentSignature.getBytes())) { throw new IllegalArgumentException("Invalid sign, original = " + originalSignature + " current = " + currentSignature); } diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/ServiceOperations.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/ServiceOperations.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/ServiceOperations.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/ServiceOperations.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/ServiceUtils.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/ServiceUtils.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/ServiceUtils.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/ServiceUtils.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/HiveAuthFactory.java 
diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/HttpAuthUtils.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/KerberosSaslHelper.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/PlainSaslHelper.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/auth/TSetIpAddressProcessor.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/CLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/CLIService.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/CLIService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/CLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnBasedSet.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnDescriptor.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnValue.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ColumnValue.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ColumnValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/FetchOrientation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/FetchOrientation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/FetchOrientation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/FetchOrientation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/GetInfoType.java 
b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/GetInfoType.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/GetInfoType.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/GetInfoType.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/GetInfoValue.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/GetInfoValue.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/GetInfoValue.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/GetInfoValue.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/Handle.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/Handle.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/Handle.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/Handle.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/HandleIdentifier.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/HiveSQLException.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/HiveSQLException.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/HiveSQLException.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/HiveSQLException.java diff --git 
a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ICLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ICLIService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/ICLIService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/ICLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationHandle.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationHandle.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationHandle.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationHandle.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationState.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationState.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationState.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationState.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationType.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationType.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/OperationType.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/OperationType.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowBasedSet.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowBasedSet.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowBasedSet.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowBasedSet.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowSet.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowSet.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowSet.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowSet.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowSetFactory.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowSetFactory.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/RowSetFactory.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/RowSetFactory.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/SessionHandle.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/SessionHandle.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/SessionHandle.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/SessionHandle.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TableSchema.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TableSchema.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TableSchema.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TableSchema.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TypeDescriptor.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/TypeQualifiers.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/ClassicTableTypeMapping.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetCatalogsOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetColumnsOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetCrossReferenceOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetCrossReferenceOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetCrossReferenceOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetCrossReferenceOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetFunctionsOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetPrimaryKeysOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetPrimaryKeysOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetPrimaryKeysOperation.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetPrimaryKeysOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetSchemasOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTableTypesOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTablesOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java similarity index 99% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java index 9612eb145638c..0f57a72e2a1ce 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/GetTypeInfoOperation.java @@ -73,7 +73,7 @@ public class GetTypeInfoOperation extends MetadataOperation { .addPrimitiveColumn("NUM_PREC_RADIX", Type.INT_TYPE, "Usually 2 or 10"); - private final RowSet rowSet; + protected final RowSet rowSet; protected GetTypeInfoOperation(HiveSession parentSession) { super(parentSession, OperationType.GET_TYPE_INFO); diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/HiveCommandOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/HiveTableTypeMapping.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/MetadataOperation.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/Operation.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java similarity index 98% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/Operation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java index 788fcdee282ae..f26c715add987 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/Operation.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/Operation.java @@ -64,7 +64,10 @@ public abstract class Operation { protected final QueryState queryState; protected static final EnumSet DEFAULT_FETCH_ORIENTATION_SET = - EnumSet.of(FetchOrientation.FETCH_NEXT,FetchOrientation.FETCH_FIRST); + EnumSet.of( + FetchOrientation.FETCH_NEXT, + FetchOrientation.FETCH_FIRST, + FetchOrientation.FETCH_PRIOR); protected Operation(HiveSession parentSession, OperationType opType) { this(parentSession, null, opType); diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/OperationManager.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java 
b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java similarity index 99% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index ac5392cf42dbf..e2ac1ea78c1ab 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.io.Serializable; -import java.io.UnsupportedEncodingException; import java.security.PrivilegedExceptionAction; import java.sql.SQLException; import java.util.ArrayList; diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/operation/TableTypeMapping.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSession.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSession.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSession.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSession.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionBase.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContext.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContext.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContext.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContext.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContextImpl.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContextImpl.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContextImpl.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionHookContextImpl.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionImpl.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java similarity index 100% rename from 
sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/HiveSessionImplwithUGI.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/SessionManager.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/session/SessionManager.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/session/SessionManager.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftBinaryCLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java rename to 
sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftCLIServiceClient.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpCLIService.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/cli/thrift/ThriftHttpServlet.java diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/server/HiveServer2.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/server/HiveServer2.java similarity index 90% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/server/HiveServer2.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/server/HiveServer2.java index ae74641ef6805..b7da4e8fdf3f7 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/server/HiveServer2.java +++ b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/server/HiveServer2.java @@ -30,8 +30,6 @@ import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.hadoop.hive.common.JvmPauseMonitor; -import org.apache.hadoop.hive.common.LogUtils; -import org.apache.hadoop.hive.common.LogUtils.LogInitializationException; import 
org.apache.hadoop.hive.conf.HiveConf; import org.apache.hive.common.util.HiveStringUtils; import org.apache.hive.service.CompositeService; @@ -158,25 +156,13 @@ private static void startHiveServer2() throws Throwable { public static void main(String[] args) { HiveConf.setLoadHiveServer2Config(true); - try { - ServerOptionsProcessor oproc = new ServerOptionsProcessor("hiveserver2"); - ServerOptionsProcessorResponse oprocResponse = oproc.parse(args); + ServerOptionsProcessor oproc = new ServerOptionsProcessor("hiveserver2"); + ServerOptionsProcessorResponse oprocResponse = oproc.parse(args); - // NOTE: It is critical to do this here so that log4j is reinitialized - // before any of the other core hive classes are loaded - String initLog4jMessage = LogUtils.initHiveLog4j(); - LOG.debug(initLog4jMessage); - HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG); + HiveStringUtils.startupShutdownMessage(HiveServer2.class, args, LOG); - // Log debug message from "oproc" after log4j initialize properly - LOG.debug(oproc.getDebugMessage().toString()); - - // Call the executor which will execute the appropriate command based on the parsed options - oprocResponse.getServerOptionsExecutor().execute(); - } catch (LogInitializationException e) { - LOG.error("Error initializing log: " + e.getMessage(), e); - System.exit(-1); - } + // Call the executor which will execute the appropriate command based on the parsed options + oprocResponse.getServerOptionsExecutor().execute(); } /** diff --git a/sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java b/sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java similarity index 100% rename from sql/hive-thriftserver/v2.3.5/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java rename to sql/hive-thriftserver/v2.3/src/main/java/org/apache/hive/service/server/ThreadWithGarbageCleanup.java diff --git 
a/sql/hive-thriftserver/v2.3.5/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala b/sql/hive-thriftserver/v2.3/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala similarity index 88% rename from sql/hive-thriftserver/v2.3.5/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala rename to sql/hive-thriftserver/v2.3/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala index 124c9937c0fca..850382fe2bfd7 100644 --- a/sql/hive-thriftserver/v2.3.5/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala +++ b/sql/hive-thriftserver/v2.3/src/main/scala/org/apache/spark/sql/hive/thriftserver/ThriftserverShimUtils.scala @@ -17,13 +17,9 @@ package org.apache.spark.sql.hive.thriftserver -import java.security.AccessController - -import scala.collection.JavaConverters._ - -import org.apache.hadoop.hive.ql.exec.AddToClassPathAction import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde2.thrift.Type +import org.apache.hadoop.hive.serde2.thrift.Type._ import org.apache.hive.service.cli.{RowSet, RowSetFactory, TableSchema} import org.apache.hive.service.rpc.thrift.TProtocolVersion._ import org.slf4j.LoggerFactory @@ -56,11 +52,12 @@ private[thriftserver] object ThriftserverShimUtils { private[thriftserver] def toJavaSQLType(s: String): Int = Type.getType(s).toJavaSQLType - private[thriftserver] def addToClassPath( - loader: ClassLoader, - auxJars: Array[String]): ClassLoader = { - val addAction = new AddToClassPathAction(loader, auxJars.toList.asJava) - AccessController.doPrivileged(addAction) + private[thriftserver] def supportedType(): Seq[Type] = { + Seq(NULL_TYPE, BOOLEAN_TYPE, STRING_TYPE, BINARY_TYPE, + TINYINT_TYPE, SMALLINT_TYPE, INT_TYPE, BIGINT_TYPE, + FLOAT_TYPE, DOUBLE_TYPE, DECIMAL_TYPE, + DATE_TYPE, TIMESTAMP_TYPE, + ARRAY_TYPE, MAP_TYPE, STRUCT_TYPE) } private[thriftserver] val 
testedProtocolVersions = Seq( diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk11-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..4a8058766319f --- /dev/null +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-jdk11-results.txt @@ -0,0 +1,45 @@ +================================================================================================ +Hive UDAF vs Spark AF +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +hive udaf w/o group by 6492 7169 388 0.0 99066.1 1.0X +spark af w/o group by 58 88 24 1.1 890.2 111.3X +hive udaf w/ group by 4864 4888 33 0.0 74221.0 1.3X +spark af w/ group by w/o fallback 60 67 7 1.1 912.9 108.5X +spark af w/ group by w/ fallback 154 164 27 0.4 2348.2 42.2X + + +================================================================================================ +ObjectHashAggregateExec vs SortAggregateExec - typed_count +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +sort agg w/ group by 51728 51728 0 2.0 493.3 1.0X +object agg w/ group by w/o fallback 10174 10218 34 10.3 97.0 5.1X +object agg w/ group by w/ fallback 29341 29537 277 3.6 279.8 1.8X +sort agg w/o group by 7541 7577 28 13.9 71.9 6.9X +object agg w/o group by w/o fallback 5574 5620 38 18.8 53.2 9.3X + + +================================================================================================ +ObjectHashAggregateExec vs SortAggregateExec - percentile_approx +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +object agg v.s. sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +sort agg w/ group by 900 925 14 2.3 429.0 1.0X +object agg w/ group by w/o fallback 597 633 14 3.5 284.6 1.5X +object agg w/ group by w/ fallback 905 923 10 2.3 431.6 1.0X +sort agg w/o group by 611 631 10 3.4 291.4 1.5X +object agg w/o group by w/o fallback 559 576 11 3.8 266.5 1.6X + + diff --git a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt index f3044da972497..8c58a5a5fdf0b 100644 --- a/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt +++ b/sql/hive/benchmarks/ObjectHashAggregateExecBenchmark-results.txt @@ -2,44 +2,44 @@ Hive UDAF vs Spark AF ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 
1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -hive udaf vs spark af: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -hive udaf w/o group by 6370 / 6400 0.0 97193.6 1.0X -spark af w/o group by 54 / 63 1.2 820.8 118.4X -hive udaf w/ group by 4492 / 4507 0.0 68539.5 1.4X -spark af w/ group by w/o fallback 58 / 64 1.1 881.7 110.2X -spark af w/ group by w/ fallback 136 / 142 0.5 2075.0 46.8X +hive udaf vs spark af: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +hive udaf w/o group by 7014 7206 120 0.0 107031.0 1.0X +spark af w/o group by 47 59 11 1.4 716.9 149.3X +hive udaf w/ group by 4811 4831 28 0.0 73409.1 1.5X +spark af w/ group by w/o fallback 50 56 7 1.3 762.9 140.3X +spark af w/ group by w/ fallback 126 130 8 0.5 1916.6 55.8X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - typed_count ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -object agg v.s. sort agg: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -sort agg w/ group by 41500 / 41630 2.5 395.8 1.0X -object agg w/ group by w/o fallback 10075 / 10122 10.4 96.1 4.1X -object agg w/ group by w/ fallback 28131 / 28205 3.7 268.3 1.5X -sort agg w/o group by 6182 / 6221 17.0 59.0 6.7X -object agg w/o group by w/o fallback 5435 / 5468 19.3 51.8 7.6X +object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +sort agg w/ group by 42969 43306 476 2.4 409.8 1.0X +object agg w/ group by w/o fallback 9744 9844 145 10.8 92.9 4.4X +object agg w/ group by w/ fallback 26814 26960 206 3.9 255.7 1.6X +sort agg w/o group by 6278 6330 57 16.7 59.9 6.8X +object agg w/o group by w/o fallback 5433 5478 60 19.3 51.8 7.9X ================================================================================================ ObjectHashAggregateExec vs SortAggregateExec - percentile_approx ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -object agg v.s. sort agg: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -sort agg w/ group by 970 / 1025 2.2 462.5 1.0X -object agg w/ group by w/o fallback 772 / 798 2.7 368.1 1.3X -object agg w/ group by w/ fallback 1013 / 1044 2.1 483.1 1.0X -sort agg w/o group by 751 / 781 2.8 358.0 1.3X -object agg w/o group by w/o fallback 772 / 814 2.7 368.0 1.3X +object agg v.s. 
sort agg: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +sort agg w/ group by 756 773 9 2.8 360.3 1.0X +object agg w/ group by w/o fallback 548 560 7 3.8 261.3 1.4X +object agg w/ group by w/ fallback 759 773 7 2.8 362.0 1.0X +sort agg w/o group by 471 483 13 4.4 224.8 1.6X +object agg w/o group by w/o fallback 471 482 12 4.5 224.7 1.6X diff --git a/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..d516d3369ad05 --- /dev/null +++ b/sql/hive/benchmarks/OrcReadBenchmark-jdk11-results.txt @@ -0,0 +1,156 @@ +================================================================================================ +SQL Single Numeric Column Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1750 1872 173 9.0 111.2 1.0X +Native ORC Vectorized 433 499 68 36.3 27.5 4.0X +Hive built-in ORC 2540 2575 49 6.2 161.5 0.7X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1979 2001 31 7.9 125.8 1.0X +Native ORC Vectorized 261 303 42 60.3 16.6 7.6X +Hive built-in ORC 2559 2583 34 6.1 162.7 0.8X + +OpenJDK 64-Bit 
Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2094 2158 91 7.5 133.2 1.0X +Native ORC Vectorized 309 361 41 50.8 19.7 6.8X +Hive built-in ORC 2649 2744 135 5.9 168.4 0.8X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2256 2271 22 7.0 143.4 1.0X +Native ORC Vectorized 511 518 11 30.8 32.5 4.4X +Hive built-in ORC 2867 2880 19 5.5 182.3 0.8X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2270 2325 78 6.9 144.3 1.0X +Native ORC Vectorized 502 508 5 31.3 31.9 4.5X +Hive built-in ORC 2862 2880 24 5.5 182.0 0.8X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2376 2426 71 6.6 151.0 1.0X +Native ORC Vectorized 609 616 8 25.8 38.7 3.9X +Hive built-in ORC 2979 2991 17 5.3 189.4 0.8X + + 
+================================================================================================ +Int and String Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 4112 4232 170 2.6 392.1 1.0X +Native ORC Vectorized 2199 2223 35 4.8 209.7 1.9X +Hive built-in ORC 5150 5238 123 2.0 491.2 0.8X + + +================================================================================================ +Partitioned Table Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Data column - Native ORC MR 2398 2435 53 6.6 152.4 1.0X +Data column - Native ORC Vectorized 458 482 26 34.3 29.1 5.2X +Data column - Hive built-in ORC 3126 3171 64 5.0 198.8 0.8X +Partition column - Native ORC MR 1639 1680 58 9.6 104.2 1.5X +Partition column - Native ORC Vectorized 105 119 11 149.6 6.7 22.8X +Partition column - Hive built-in ORC 2223 2229 8 7.1 141.4 1.1X +Both columns - Native ORC MR 2588 2608 28 6.1 164.5 0.9X +Both columns - Native ORC Vectorized 489 522 49 32.2 31.1 4.9X +Both columns - Hive built-in ORC 3258 3292 48 4.8 207.1 0.7X + + +================================================================================================ +Repeated String Scan 
+================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1991 2028 52 5.3 189.9 1.0X +Native ORC Vectorized 392 398 8 26.7 37.4 5.1X +Hive built-in ORC 2810 2816 8 3.7 268.0 0.7X + + +================================================================================================ +String with Nulls Scan +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 3638 3647 13 2.9 346.9 1.0X +Native ORC Vectorized 1171 1181 14 9.0 111.7 3.1X +Hive built-in ORC 4847 4871 34 2.2 462.2 0.8X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 3280 3283 5 3.2 312.8 1.0X +Native ORC Vectorized 1199 1206 10 8.7 114.4 2.7X +Hive built-in ORC 4263 4273 14 2.5 406.5 0.8X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) 
Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1935 1950 21 5.4 184.6 1.0X +Native ORC Vectorized 451 459 10 23.2 43.1 4.3X +Hive built-in ORC 2542 2552 14 4.1 242.4 0.8X + + +================================================================================================ +Single Column Scan From Wide Columns +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 270 292 23 3.9 257.2 1.0X +Native ORC Vectorized 143 155 12 7.3 136.2 1.9X +Hive built-in ORC 1593 1627 48 0.7 1519.1 0.2X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 369 386 17 2.8 351.5 1.0X +Native ORC Vectorized 218 231 15 4.8 208.3 1.7X +Hive built-in ORC 3092 3101 12 0.3 2949.1 0.1X + +OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws +Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 498 531 33 2.1 475.0 1.0X +Native ORC Vectorized 360 376 18 2.9 342.9 1.4X +Hive built-in 
ORC 4786 4786 1 0.2 4564.1 0.1X + + diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt index caa78b9a8f102..c7d6c976192b2 100644 --- a/sql/hive/benchmarks/OrcReadBenchmark-results.txt +++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt @@ -2,155 +2,155 @@ SQL Single Numeric Column Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1725 / 1759 9.1 109.7 1.0X -Native ORC Vectorized 272 / 316 57.8 17.3 6.3X -Hive built-in ORC 1970 / 1987 8.0 125.3 0.9X +SQL Single TINYINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1844 1851 10 8.5 117.2 1.0X +Native ORC Vectorized 284 312 36 55.5 18.0 6.5X +Hive built-in ORC 2380 2380 1 6.6 151.3 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1633 / 1672 9.6 103.8 1.0X -Native ORC Vectorized 238 / 255 66.0 15.1 6.9X -Hive built-in ORC 2293 / 2305 6.9 145.8 0.7X +SQL Single SMALLINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1999 2031 45 7.9 127.1 1.0X +Native ORC Vectorized 252 264 15 62.5 16.0 7.9X +Hive built-in ORC 2483 2509 37 6.3 157.9 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1677 / 1699 9.4 106.6 1.0X -Native ORC Vectorized 325 / 342 48.3 20.7 5.2X -Hive built-in ORC 2561 / 2569 6.1 162.8 0.7X +SQL Single INT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2134 2135 2 7.4 135.7 1.0X +Native ORC Vectorized 329 351 34 47.8 20.9 6.5X +Hive built-in ORC 2672 2716 61 5.9 169.9 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1791 / 1795 8.8 113.9 1.0X -Native ORC Vectorized 400 / 408 39.3 25.4 4.5X -Hive built-in ORC 2713 / 2720 5.8 172.5 0.7X +SQL Single BIGINT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2172 2247 105 7.2 138.1 1.0X +Native ORC Vectorized 407 427 23 38.7 25.9 5.3X +Hive built-in ORC 
2806 2822 22 5.6 178.4 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1791 / 1805 8.8 113.8 1.0X -Native ORC Vectorized 433 / 438 36.3 27.5 4.1X -Hive built-in ORC 2690 / 2803 5.8 171.0 0.7X +SQL Single FLOAT Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2187 2200 19 7.2 139.0 1.0X +Native ORC Vectorized 451 457 5 34.9 28.7 4.8X +Hive built-in ORC 2886 2938 73 5.4 183.5 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1911 / 1930 8.2 121.5 1.0X -Native ORC Vectorized 543 / 552 29.0 34.5 3.5X -Hive built-in ORC 2967 / 3065 5.3 188.6 0.6X +SQL Single DOUBLE Column Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2313 2319 9 6.8 147.1 1.0X +Native ORC Vectorized 554 562 7 28.4 35.2 4.2X +Hive built-in ORC 2927 2933 8 5.4 186.1 0.8X ================================================================================================ Int and String Scan 
================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 4160 / 4188 2.5 396.7 1.0X -Native ORC Vectorized 2405 / 2406 4.4 229.4 1.7X -Hive built-in ORC 5514 / 5562 1.9 525.9 0.8X +Int and String Scan: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 4162 4294 186 2.5 397.0 1.0X +Native ORC Vectorized 2236 2258 32 4.7 213.2 1.9X +Hive built-in ORC 5054 5135 114 2.1 482.0 0.8X ================================================================================================ Partitioned Table Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Data column - Native ORC MR 1863 / 1867 8.4 118.4 1.0X -Data column - Native ORC Vectorized 411 / 418 38.2 26.2 4.5X -Data column - Hive built-in ORC 3297 / 3308 4.8 209.6 0.6X -Partition column - Native ORC MR 1505 / 1506 10.4 95.7 1.2X -Partition column - Native ORC Vectorized 80 / 93 195.6 5.1 23.2X -Partition column - Hive built-in ORC 1960 / 1979 8.0 124.6 1.0X -Both columns - Native ORC MR 2076 / 2090 7.6 132.0 0.9X -Both 
columns - Native ORC Vectorized 450 / 463 34.9 28.6 4.1X -Both columns - Hive built-in ORC 3528 / 3548 4.5 224.3 0.5X +Partitioned Table: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Data column - Native ORC MR 2436 2447 16 6.5 154.8 1.0X +Data column - Native ORC Vectorized 421 443 35 37.4 26.8 5.8X +Data column - Hive built-in ORC 3007 3026 27 5.2 191.2 0.8X +Partition column - Native ORC MR 1603 1630 39 9.8 101.9 1.5X +Partition column - Native ORC Vectorized 84 96 15 186.7 5.4 28.9X +Partition column - Hive built-in ORC 2174 2187 18 7.2 138.2 1.1X +Both columns - Native ORC MR 2609 2645 51 6.0 165.9 0.9X +Both columns - Native ORC Vectorized 460 470 9 34.2 29.3 5.3X +Both columns - Hive built-in ORC 3094 3099 8 5.1 196.7 0.8X ================================================================================================ Repeated String Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1727 / 1733 6.1 164.7 1.0X -Native ORC Vectorized 375 / 379 28.0 35.7 4.6X -Hive built-in ORC 2665 / 2666 3.9 254.2 0.6X +Repeated String: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 2036 2046 13 5.1 194.2 1.0X +Native ORC Vectorized 366 386 18 28.6 34.9 5.6X +Hive built-in ORC 2683 2686 4 3.9 255.9 0.8X 
================================================================================================ String with Nulls Scan ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 3324 / 3325 3.2 317.0 1.0X -Native ORC Vectorized 1085 / 1106 9.7 103.4 3.1X -Hive built-in ORC 5272 / 5299 2.0 502.8 0.6X +String with Nulls Scan (0.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 3614 3643 40 2.9 344.7 1.0X +Native ORC Vectorized 1072 1087 22 9.8 102.2 3.4X +Hive built-in ORC 4625 4636 15 2.3 441.1 0.8X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -String with Nulls Scan (50.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 3045 / 3046 3.4 290.4 1.0X -Native ORC Vectorized 1248 / 1260 8.4 119.0 2.4X -Hive built-in ORC 3989 / 3999 2.6 380.4 0.8X +String with Nulls Scan (50.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 3347 3376 42 3.1 319.2 1.0X +Native ORC Vectorized 1220 1225 7 8.6 116.3 2.7X +Hive built-in ORC 4168 4184 23 2.5 397.5 0.8X 
-OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -String with Nulls Scan (95.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1692 / 1694 6.2 161.3 1.0X -Native ORC Vectorized 471 / 493 22.3 44.9 3.6X -Hive built-in ORC 2398 / 2411 4.4 228.7 0.7X +String with Nulls Scan (95.0%): Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 1851 1862 16 5.7 176.5 1.0X +Native ORC Vectorized 466 471 7 22.5 44.4 4.0X +Hive built-in ORC 2523 2529 8 4.2 240.6 0.7X ================================================================================================ Single Column Scan From Wide Columns ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 1371 / 1379 0.8 1307.5 1.0X -Native ORC Vectorized 121 / 135 8.6 115.8 11.3X -Hive built-in ORC 521 / 561 2.0 497.1 2.6X +Single Column Scan from 100 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 250 264 15 4.2 238.1 1.0X +Native ORC Vectorized 121 138 24 8.7 115.5 2.1X +Hive built-in ORC 1761 1792 43 0.6 
1679.3 0.1X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 2711 / 2767 0.4 2585.5 1.0X -Native ORC Vectorized 210 / 232 5.0 200.5 12.9X -Hive built-in ORC 764 / 775 1.4 728.3 3.5X +Single Column Scan from 200 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 319 341 17 3.3 304.5 1.0X +Native ORC Vectorized 188 222 50 5.6 178.8 1.7X +Hive built-in ORC 3492 3508 24 0.3 3329.8 0.1X -OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64 +OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz -Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------- -Native ORC MR 3979 / 3988 0.3 3794.4 1.0X -Native ORC Vectorized 357 / 366 2.9 340.2 11.2X -Hive built-in ORC 1091 / 1095 1.0 1040.5 3.6X +Single Column Scan from 300 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Native ORC MR 443 456 12 2.4 422.9 1.0X +Native ORC Vectorized 306 321 23 3.4 292.0 1.4X +Hive built-in ORC 5295 5312 24 0.2 5049.9 0.1X diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala 
b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index e7ff3a5f4be2b..29825e5116ef9 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy /** * Runs the test cases that are included in the hive distribution. @@ -41,12 +42,13 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { private val originalInMemoryPartitionPruning = TestHive.conf.inMemoryPartitionPruning private val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled private val originalSessionLocalTimeZone = TestHive.conf.sessionLocalTimeZone + private val originalCreateHiveTable = TestHive.conf.createHiveTableByDefaultEnabled def testCases: Seq[(String, File)] = { hiveQueryDir.listFiles.map(f => f.getName.stripSuffix(".q") -> f) } - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) @@ -59,13 +61,16 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, true) // Ensures that cross joins are enabled so that we can test them TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) + // Ensures that the table insertion behaivor is consistent with Hive + TestHive.setConf(SQLConf.STORE_ASSIGNMENT_POLICY, StoreAssignmentPolicy.LEGACY.toString) // Fix session local timezone to America/Los_Angeles for those timezone sensitive tests // (timestamp_*) 
TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, "America/Los_Angeles") + TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, true) RuleExecutor.resetMetrics() } - override def afterAll() { + override def afterAll(): Unit = { try { TestHive.setCacheTables(false) TimeZone.setDefault(originalTimeZone) @@ -74,6 +79,8 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.IN_MEMORY_PARTITION_PRUNING, originalInMemoryPartitionPruning) TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) + TestHive.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, + originalCreateHiveTable) // For debugging dump some statistics about how much time was spent in various optimizer rules logWarning(RuleExecutor.dumpTimeSpent()) diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala index c7d953a731b9b..ed23f65815917 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveWindowFunctionQuerySuite.scala @@ -37,7 +37,7 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte private val originalLocale = Locale.getDefault private val testTempDir = Utils.createTempDir() - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) @@ -58,7 +58,7 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte | p_size INT, | p_container STRING, | p_retailprice DOUBLE, - | p_comment STRING) + | p_comment STRING) 
USING hive """.stripMargin) val testData1 = TestHive.getHiveFile("data/files/part_tiny.txt").getCanonicalPath sql( @@ -100,7 +100,7 @@ class HiveWindowFunctionQuerySuite extends HiveComparisonTest with BeforeAndAfte sql("set mapreduce.jobtracker.address=local") } - override def afterAll() { + override def afterAll(): Unit = { try { TestHive.setCacheTables(false) TimeZone.setDefault(originalTimeZone) @@ -751,7 +751,7 @@ class HiveWindowFunctionQueryFileSuite private val originalLocale = Locale.getDefault private val testTempDir = Utils.createTempDir() - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) @@ -769,7 +769,7 @@ class HiveWindowFunctionQueryFileSuite // sql("set mapreduce.jobtracker.address=local") } - override def afterAll() { + override def afterAll(): Unit = { try { TestHive.setCacheTables(false) TimeZone.setDefault(originalTimeZone) diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index d37f0c8573659..c37582386347b 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -88,12 +88,11 @@ ${protobuf.version} --> - ${hive.group} hive-exec @@ -105,22 +104,24 @@ ${hive.group} - hive-contrib + hive-serde + ${hive.serde.scope} + + + ${hive.group} + hive-shims + ${hive.shims.scope} - ${hive.group}.hcatalog - hive-hcatalog-core + org.apache.hive + hive-llap-common + ${hive.llap.scope} + + + org.apache.hive + hive-llap-client + ${hive.llap.scope} - org.apache.avro @@ -216,31 +217,6 @@ - - hadoop-3.2 - - - ${hive.group} - hive-common - - - ${hive.group} - hive-serde - - - ${hive.group} - hive-shims - - - org.apache.hive - hive-llap-common - - - org.apache.hive - hive-llap-client - - - @@ -252,7 +228,7 @@ scalatest-maven-plugin - -da -Xmx4g -XX:ReservedCodeCacheSize=${CodeCacheSize} + -da -Xmx4g -XX:ReservedCodeCacheSize=${CodeCacheSize} -Dio.netty.tryReflectionSetAccessible=true diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala deleted file mode 100644 index 02a5117f005e8..0000000000000 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive - -import org.apache.spark.SparkContext -import org.apache.spark.api.java.JavaSparkContext -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{SparkSession, SQLContext} - - -/** - * An instance of the Spark SQL execution engine that integrates with data stored in Hive. - * Configuration for Hive is read from hive-site.xml on the classpath. 
- */ -@deprecated("Use SparkSession.builder.enableHiveSupport instead", "2.0.0") -class HiveContext private[hive](_sparkSession: SparkSession) - extends SQLContext(_sparkSession) with Logging { - - self => - - def this(sc: SparkContext) = { - this(SparkSession.builder().sparkContext(HiveUtils.withHiveExternalCatalog(sc)).getOrCreate()) - } - - def this(sc: JavaSparkContext) = this(sc.sc) - - /** - * Returns a new HiveContext as new session, which will have separated SQLConf, UDF/UDAF, - * temporary tables and SessionState, but sharing the same CacheManager, IsolatedClientLoader - * and Hive client (both of execution and metadata) with existing HiveContext. - */ - override def newSession(): HiveContext = { - new HiveContext(sparkSession.newSession()) - } - - /** - * Invalidate and refresh all the cached the metadata of the given table. For performance reasons, - * Spark SQL or the external data source library it uses might cache certain metadata about a - * table, such as the location of blocks. When those change outside of Spark SQL, users should - * call this function to invalidate the cache. 
- * - * @since 1.3.0 - */ - def refreshTable(tableName: String): Unit = { - sparkSession.catalog.refreshTable(tableName) - } - -} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 03874d005a6e6..ca292f65efeee 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -40,8 +40,8 @@ import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans.logical.ColumnStat import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.connector.catalog.TableCatalog import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions} import org.apache.spark.sql.hive.client.HiveClient @@ -635,12 +635,16 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat k.startsWith(CREATED_SPARK_VERSION) } val newTableProps = propsFromOldTable ++ tableDefinition.properties + partitionProviderProp + + // // Add old table's owner if we need to restore + val owner = Option(tableDefinition.owner).filter(_.nonEmpty).getOrElse(oldTableDef.owner) val newDef = tableDefinition.copy( storage = newStorage, schema = oldTableDef.schema, partitionColumnNames = oldTableDef.partitionColumnNames, bucketSpec = oldTableDef.bucketSpec, - properties = newTableProps) + properties = newTableProps, + owner = owner) client.alterTable(newDef) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 33b5bcefd853f..0cd9b3641bd4a 100644 
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -305,12 +305,17 @@ private[hive] trait HiveInspectors { withNullSafe(o => getByteWritable(o)) case _: ByteObjectInspector => withNullSafe(o => o.asInstanceOf[java.lang.Byte]) - case _: JavaHiveVarcharObjectInspector => + // To spark HiveVarchar and HiveChar are same as string + case _: HiveVarcharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveVarcharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveVarchar(s, s.length) } - case _: JavaHiveCharObjectInspector => + case _: HiveCharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveCharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveChar(s, s.length) @@ -787,6 +792,9 @@ private[hive] trait HiveInspectors { ObjectInspectorFactory.getStandardStructObjectInspector( java.util.Arrays.asList(fields.map(f => f.name) : _*), java.util.Arrays.asList(fields.map(f => toInspector(f.dataType)) : _*)) + case _: UserDefinedType[_] => + val sqlType = dataType.asInstanceOf[UserDefinedType[_]].sqlType + toInspector(sqlType) } /** @@ -849,6 +857,8 @@ private[hive] trait HiveInspectors { } case Literal(_, dt: StructType) => toInspector(dt) + case Literal(_, dt: UserDefinedType[_]) => + toInspector(dt.sqlType) // We will enumerate all of the possible constant expressions, throw exception if we missed case Literal(_, dt) => sys.error(s"Hive doesn't support the constant type [$dt].") // ideally, we don't test the foldable here(but in optimizer), however, some of the diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 5ad2caba07fc0..2981e391c0439 100644 --- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -257,8 +257,20 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log } // The inferred schema may have different field names as the table schema, we should respect // it, but also respect the exprId in table relation output. - assert(result.output.length == relation.output.length && - result.output.zip(relation.output).forall { case (a1, a2) => a1.dataType == a2.dataType }) + if (result.output.length != relation.output.length) { + throw new AnalysisException( + s"Converted table has ${result.output.length} columns, " + + s"but source Hive table has ${relation.output.length} columns. " + + s"Set ${HiveUtils.CONVERT_METASTORE_PARQUET.key} to false, " + + s"or recreate table ${relation.tableMeta.identifier} to workaround.") + } + if (!result.output.zip(relation.output).forall { + case (a1, a2) => a1.dataType == a2.dataType }) { + throw new AnalysisException( + s"Column in converted table has different data type with source Hive table's. 
" + + s"Set ${HiveUtils.CONVERT_METASTORE_PARQUET.key} to false, " + + s"or recreate table ${relation.tableMeta.identifier} to workaround.") + } val newOutput = result.output.zip(relation.output).map { case (a1, a2) => a1.withExprId(a2.exprId) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala index 3f0a9f222feb2..bc7760c982aab 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionCatalog.scala @@ -66,49 +66,52 @@ private[sql] class HiveSessionCatalog( name: String, clazz: Class[_], input: Seq[Expression]): Expression = { - - Try(super.makeFunctionExpression(name, clazz, input)).getOrElse { - var udfExpr: Option[Expression] = None - try { - // When we instantiate hive UDF wrapper class, we may throw exception if the input - // expressions don't satisfy the hive UDF, such as type mismatch, input number - // mismatch, etc. Here we catch the exception and throw AnalysisException instead. - if (classOf[UDF].isAssignableFrom(clazz)) { - udfExpr = Some(HiveSimpleUDF(name, new HiveFunctionWrapper(clazz.getName), input)) - udfExpr.get.dataType // Force it to check input data types. - } else if (classOf[GenericUDF].isAssignableFrom(clazz)) { - udfExpr = Some(HiveGenericUDF(name, new HiveFunctionWrapper(clazz.getName), input)) - udfExpr.get.dataType // Force it to check input data types. - } else if (classOf[AbstractGenericUDAFResolver].isAssignableFrom(clazz)) { - udfExpr = Some(HiveUDAFFunction(name, new HiveFunctionWrapper(clazz.getName), input)) - udfExpr.get.dataType // Force it to check input data types. - } else if (classOf[UDAF].isAssignableFrom(clazz)) { - udfExpr = Some(HiveUDAFFunction( - name, - new HiveFunctionWrapper(clazz.getName), - input, - isUDAFBridgeRequired = true)) - udfExpr.get.dataType // Force it to check input data types. 
- } else if (classOf[GenericUDTF].isAssignableFrom(clazz)) { - udfExpr = Some(HiveGenericUDTF(name, new HiveFunctionWrapper(clazz.getName), input)) - udfExpr.get.asInstanceOf[HiveGenericUDTF].elementSchema // Force it to check data types. + // Current thread context classloader may not be the one loaded the class. Need to switch + // context classloader to initialize instance properly. + Utils.withContextClassLoader(clazz.getClassLoader) { + Try(super.makeFunctionExpression(name, clazz, input)).getOrElse { + var udfExpr: Option[Expression] = None + try { + // When we instantiate hive UDF wrapper class, we may throw exception if the input + // expressions don't satisfy the hive UDF, such as type mismatch, input number + // mismatch, etc. Here we catch the exception and throw AnalysisException instead. + if (classOf[UDF].isAssignableFrom(clazz)) { + udfExpr = Some(HiveSimpleUDF(name, new HiveFunctionWrapper(clazz.getName), input)) + udfExpr.get.dataType // Force it to check input data types. + } else if (classOf[GenericUDF].isAssignableFrom(clazz)) { + udfExpr = Some(HiveGenericUDF(name, new HiveFunctionWrapper(clazz.getName), input)) + udfExpr.get.dataType // Force it to check input data types. + } else if (classOf[AbstractGenericUDAFResolver].isAssignableFrom(clazz)) { + udfExpr = Some(HiveUDAFFunction(name, new HiveFunctionWrapper(clazz.getName), input)) + udfExpr.get.dataType // Force it to check input data types. + } else if (classOf[UDAF].isAssignableFrom(clazz)) { + udfExpr = Some(HiveUDAFFunction( + name, + new HiveFunctionWrapper(clazz.getName), + input, + isUDAFBridgeRequired = true)) + udfExpr.get.dataType // Force it to check input data types. + } else if (classOf[GenericUDTF].isAssignableFrom(clazz)) { + udfExpr = Some(HiveGenericUDTF(name, new HiveFunctionWrapper(clazz.getName), input)) + udfExpr.get.asInstanceOf[HiveGenericUDTF].elementSchema // Force it to check data types. 
+ } + } catch { + case NonFatal(e) => + val noHandlerMsg = s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}': $e" + val errorMsg = + if (classOf[GenericUDTF].isAssignableFrom(clazz)) { + s"$noHandlerMsg\nPlease make sure your function overrides " + + "`public StructObjectInspector initialize(ObjectInspector[] args)`." + } else { + noHandlerMsg + } + val analysisException = new AnalysisException(errorMsg) + analysisException.setStackTrace(e.getStackTrace) + throw analysisException + } + udfExpr.getOrElse { + throw new AnalysisException(s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}'") } - } catch { - case NonFatal(e) => - val noHandlerMsg = s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}': $e" - val errorMsg = - if (classOf[GenericUDTF].isAssignableFrom(clazz)) { - s"$noHandlerMsg\nPlease make sure your function overrides " + - "`public StructObjectInspector initialize(ObjectInspector[] args)`." - } else { - noHandlerMsg - } - val analysisException = new AnalysisException(errorMsg) - analysisException.setStackTrace(e.getStackTrace) - throw analysisException - } - udfExpr.getOrElse { - throw new AnalysisException(s"No handler for UDF/UDAF/UDTF '${clazz.getCanonicalName}'") } } } @@ -117,7 +120,7 @@ private[sql] class HiveSessionCatalog( try { lookupFunction0(name, children) } catch { - case NonFatal(_) => + case NonFatal(_) if children.exists(_.dataType.isInstanceOf[DecimalType]) => // SPARK-16228 ExternalCatalog may recognize `double`-type only. 
val newChildren = children.map { child => if (child.dataType.isInstanceOf[DecimalType]) Cast(child, DoubleType) else child diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index 188aedc3640b8..b117c582a3e6e 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -19,15 +19,17 @@ package org.apache.spark.sql.hive import org.apache.spark.annotation.Unstable import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.analysis.Analyzer +import org.apache.spark.sql.catalyst.analysis.{Analyzer, ResolveSessionCatalog} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener +import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.SparkPlanner +import org.apache.spark.sql.execution.{SparkOptimizer, SparkPlanner} import org.apache.spark.sql.execution.analysis.DetectAmbiguousSelfJoin import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.datasources.v2.TableCapabilityCheck import org.apache.spark.sql.hive.client.HiveClient +import org.apache.spark.sql.hive.execution.PruneHiveTablePartitions import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SessionResourceLoader, SessionState} /** @@ -67,13 +69,13 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session /** * A logical query plan `Analyzer` with rules specific to Hive. 
*/ - override protected def analyzer: Analyzer = new Analyzer(catalog, conf) { + override protected def analyzer: Analyzer = new Analyzer(catalogManager, conf) { override val extendedResolutionRules: Seq[Rule[LogicalPlan]] = new ResolveHiveSerdeTable(session) +: new FindDataSourceTable(session) +: new ResolveSQLOnFile(session) +: new FallBackFileSourceV2(session) +: - DataSourceResolution(conf, this.catalogManager) +: + new ResolveSessionCatalog(catalogManager, conf, catalog.isView) +: customResolutionRules override val postHocResolutionRules: Seq[Rule[LogicalPlan]] = @@ -93,11 +95,25 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session customCheckRules } + /** + * Logical query plan optimizer that takes into account Hive. + */ + override protected def optimizer: Optimizer = { + new SparkOptimizer(catalogManager, catalog, experimentalMethods) { + override def postHocOptimizationBatches: Seq[Batch] = Seq( + Batch("Prune Hive Table Partitions", Once, new PruneHiveTablePartitions(session)) + ) + + override def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = + super.extendedOperatorOptimizationRules ++ customOperatorOptimizationRules + } + } + /** * Planner that takes into account Hive-specific strategies. 
*/ override protected def planner: SparkPlanner = { - new SparkPlanner(session.sparkContext, conf, experimentalMethods) with HiveStrategies { + new SparkPlanner(session, conf, experimentalMethods) with HiveStrategies { override val sparkSession: SparkSession = session override def extraPlanningStrategies: Seq[Strategy] = diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index be4a0c175b6dc..3beef6b1df457 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -51,7 +51,7 @@ private[hive] object HiveShim { /* * This function in hive-0.13 become private, but we have to do this to work around hive bug */ - private def appendReadColumnNames(conf: Configuration, cols: Seq[String]) { + private def appendReadColumnNames(conf: Configuration, cols: Seq[String]): Unit = { val old: String = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "") val result: StringBuilder = new StringBuilder(old) var first: Boolean = old.isEmpty @@ -70,7 +70,7 @@ private[hive] object HiveShim { /* * Cannot use ColumnProjectionUtils.appendReadColumns directly, if ids is null */ - def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]) { + def appendReadColumns(conf: Configuration, ids: Seq[Integer], names: Seq[String]): Unit = { if (ids != null) { ColumnProjectionUtils.appendReadColumns(conf, ids.asJava) } @@ -201,7 +201,7 @@ private[hive] object HiveShim { } } - def writeExternal(out: java.io.ObjectOutput) { + def writeExternal(out: java.io.ObjectOutput): Unit = { // output the function name out.writeUTF(functionClassName) @@ -220,7 +220,7 @@ private[hive] object HiveShim { } } - def readExternal(in: java.io.ObjectInput) { + def readExternal(in: java.io.ObjectInput): Unit = { // read the function name functionClassName = in.readUTF() @@ -279,25 +279,25 @@ private[hive] 
object HiveShim { var compressType: String = _ var destTableId: Int = _ - def setCompressed(compressed: Boolean) { + def setCompressed(compressed: Boolean): Unit = { this.compressed = compressed } def getDirName(): String = dir - def setDestTableId(destTableId: Int) { + def setDestTableId(destTableId: Int): Unit = { this.destTableId = destTableId } - def setTableInfo(tableInfo: TableDesc) { + def setTableInfo(tableInfo: TableDesc): Unit = { this.tableInfo = tableInfo } - def setCompressCodec(intermediateCompressorCodec: String) { + def setCompressCodec(intermediateCompressorCodec: String): Unit = { compressCodec = intermediateCompressorCodec } - def setCompressType(intermediateCompressType: String) { + def setCompressType(intermediateCompressType: String): Unit = { compressType = intermediateCompressType } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 18feb98519fbe..b9c98f4ea15e9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -26,8 +26,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning._ -import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTable, LogicalPlan, - ScriptTransformation, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoStatement, LogicalPlan, ScriptTransformation, Statistics} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils} @@ -143,9 +142,9 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => 
hiveTableWithStats(relation) - // handles InsertIntoTable specially as the table in InsertIntoTable is not added in its + // handles InsertIntoStatement specially as the table in InsertIntoStatement is not added in its // children, hence not matched directly by previous HiveTableRelation case. - case i @ InsertIntoTable(relation: HiveTableRelation, _, _, _, _) + case i @ InsertIntoStatement(relation: HiveTableRelation, _, _, _, _) if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => i.copy(table = hiveTableWithStats(relation)) } @@ -159,7 +158,7 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { */ object HiveAnalysis extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case InsertIntoTable(r: HiveTableRelation, partSpec, query, overwrite, ifPartitionNotExists) + case InsertIntoStatement(r: HiveTableRelation, partSpec, query, overwrite, ifPartitionNotExists) if DDLUtils.isHiveTable(r.tableMeta) => InsertIntoHiveTable(r.tableMeta, partSpec, query, overwrite, ifPartitionNotExists, query.output.map(_.name)) @@ -207,11 +206,12 @@ case class RelationConversions( override def apply(plan: LogicalPlan): LogicalPlan = { plan resolveOperators { // Write path - case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) + case InsertIntoStatement( + r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && (!r.isPartitioned || SQLConf.get.getConf(HiveUtils.CONVERT_INSERTING_PARTITIONED_TABLE)) && isConvertible(r) => - InsertIntoTable(metastoreCatalog.convert(r), partition, + InsertIntoStatement(metastoreCatalog.convert(r), partition, query, overwrite, ifPartitionNotExists) // Read path @@ -252,7 +252,7 @@ private[hive] trait HiveStrategies { */ object HiveTableScans extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { - case 
PhysicalOperation(projectList, predicates, relation: HiveTableRelation) => + case ScanOperation(projectList, predicates, relation: HiveTableRelation) => // Filter out all predicates that only deal with partition keys, these are given to the // hive table scan operator to be used for partition pruning. val partitionKeyIds = AttributeSet(relation.partitionCols) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index d5f3697ce3bf7..9c4b8a5819a33 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -476,6 +476,7 @@ private[spark] object HiveUtils extends Logging { // Configuration. But it happens before SparkContext initialized, we need to take them from // system properties in the form of regular hadoop configurations. SparkHadoopUtil.get.appendSparkHadoopConfigs(sys.props.toMap, propMap) + SparkHadoopUtil.get.appendSparkHiveConfigs(sys.props.toMap, propMap) propMap.toMap } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala index 3f9925e73705e..4d18eb6289418 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala @@ -83,7 +83,7 @@ class HadoopTableReader( sparkSession.sparkContext.defaultMinPartitions) } - SparkHadoopUtil.get.appendS3AndSparkHadoopConfigurations( + SparkHadoopUtil.get.appendS3AndSparkHadoopHiveConfigurations( sparkSession.sparkContext.conf, hadoopConf) private val _broadcastedHadoopConf = @@ -132,7 +132,9 @@ class HadoopTableReader( val deserializedHadoopRDD = hadoopRDD.mapPartitions { iter => val hconf = broadcastedHadoopConf.value.value val deserializer = deserializerClass.getConstructor().newInstance() - deserializer.initialize(hconf, 
localTableDesc.getProperties) + DeserializerLock.synchronized { + deserializer.initialize(hconf, localTableDesc.getProperties) + } HadoopTableReader.fillObject(iter, deserializer, attrsWithIndex, mutableRow, deserializer) } @@ -170,7 +172,7 @@ class HadoopTableReader( val pathPatternSet = collection.mutable.Set[String]() partitionToDeserializer.filter { case (partition, partDeserializer) => - def updateExistPathSetByPathPattern(pathPatternStr: String) { + def updateExistPathSetByPathPattern(pathPatternStr: String): Unit = { val pathPattern = new Path(pathPatternStr) val fs = pathPattern.getFileSystem(hadoopConf) val matches = fs.globStatus(pathPattern) @@ -252,10 +254,14 @@ class HadoopTableReader( partProps.asScala.foreach { case (key, value) => props.setProperty(key, value) } - deserializer.initialize(hconf, props) + DeserializerLock.synchronized { + deserializer.initialize(hconf, props) + } // get the table deserializer val tableSerDe = localTableDesc.getDeserializerClass.getConstructor().newInstance() - tableSerDe.initialize(hconf, localTableDesc.getProperties) + DeserializerLock.synchronized { + tableSerDe.initialize(hconf, tableProperties) + } // fill the non partition key attributes HadoopTableReader.fillObject(iter, deserializer, nonPartitionKeyAttrs, @@ -352,7 +358,7 @@ private[hive] object HiveTableUtil { // that calls Hive.get() which tries to access metastore, but it's not valid in runtime // it would be fixed in next version of hive but till then, we should use this instead def configureJobPropertiesForStorageHandler( - tableDesc: TableDesc, conf: Configuration, input: Boolean) { + tableDesc: TableDesc, conf: Configuration, input: Boolean): Unit = { val property = tableDesc.getProperties.getProperty(META_TABLE_STORAGE) val storageHandler = org.apache.hadoop.hive.ql.metadata.HiveUtils.getStorageHandler(conf, property) @@ -370,12 +376,23 @@ private[hive] object HiveTableUtil { } } +/** + * Object to synchronize on when calling 
org.apache.hadoop.hive.serde2.Deserializer#initialize. + * + * [SPARK-17398] org.apache.hive.hcatalog.data.JsonSerDe#initialize calls the non-thread-safe + * HCatRecordObjectInspectorFactory.getHCatRecordObjectInspector, the results of which are + * returned by JsonSerDe#getObjectInspector. + * To protect against this bug in Hive (HIVE-15773/HIVE-21752), we synchronize on this object + * when calling initialize on Deserializer instances that could be JsonSerDe instances. + */ +private[hive] object DeserializerLock + private[hive] object HadoopTableReader extends HiveInspectors with Logging { /** * Curried. After given an argument for 'path', the resulting JobConf => Unit closure is used to * instantiate a HadoopRDD. */ - def initializeLocalJobConfFunc(path: String, tableDesc: TableDesc)(jobConf: JobConf) { + def initializeLocalJobConfFunc(path: String, tableDesc: TableDesc)(jobConf: JobConf): Unit = { FileInputFormat.setInputPaths(jobConf, Seq[Path](new Path(path)): _*) if (tableDesc != null) { HiveTableUtil.configureJobPropertiesForStorageHandler(tableDesc, jobConf, true) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala index cb015d7301c19..e31dffa4795c5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClient.scala @@ -111,8 +111,8 @@ private[hive] trait HiveClient { * TODO(cloud-fan): it's a little hacky to introduce the schema table properties here in * `HiveClient`, but we don't have a cleaner solution now. */ - def alterTableDataSchema( - dbName: String, tableName: String, newDataSchema: StructType, schemaProps: Map[String, String]) + def alterTableDataSchema(dbName: String, tableName: String, newDataSchema: StructType, + schemaProps: Map[String, String]): Unit /** Creates a new database with the given name. 
*/ def createDatabase(database: CatalogDatabase, ignoreIfExists: Boolean): Unit @@ -292,4 +292,6 @@ private[hive] trait HiveClient { /** Used for testing only. Removes all metadata from this instance of Hive. */ def reset(): Unit + /** Returns the user name which is used as owner for Hive table. */ + def userName: String } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 5b2eeb2cf34c0..b5c5f0e9381bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -32,8 +32,7 @@ import org.apache.hadoop.hive.common.StatsSetupConst import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.metastore.{IMetaStoreClient, TableType => HiveTableType} -import org.apache.hadoop.hive.metastore.api.{Database => HiveDatabase, Table => MetaStoreApiTable} -import org.apache.hadoop.hive.metastore.api.{FieldSchema, Order, SerDeInfo, StorageDescriptor} +import org.apache.hadoop.hive.metastore.api.{Database => HiveDatabase, Table => MetaStoreApiTable, _} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.metadata.{Hive, HiveException, Partition => HivePartition, Table => HiveTable} import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.HIVE_COLUMN_ORDER_ASC @@ -42,6 +41,7 @@ import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe +import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging @@ -53,11 +53,13 @@ import org.apache.spark.sql.catalyst.catalog._ import 
org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} +import org.apache.spark.sql.connector.catalog.SupportsNamespaces._ import org.apache.spark.sql.execution.QueryExecutionException -import org.apache.spark.sql.execution.command.DDLUtils +import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.HiveExternalCatalog.{DATASOURCE_SCHEMA, DATASOURCE_SCHEMA_NUMPARTS, DATASOURCE_SCHEMA_PART_PREFIX} import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.client.HiveClientImpl._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.{CircularBuffer, Utils} @@ -161,7 +163,7 @@ private[hive] class HiveClientImpl( // HiveConf is a Hadoop Configuration, which has a field of classLoader and // the initial value will be the current thread's context class loader // (i.e. initClassLoader at here). - // We call initialConf.setClassLoader(initClassLoader) at here to make + // We call hiveConf.setClassLoader(initClassLoader) at here to make // this action explicit. hiveConf.setClassLoader(initClassLoader) @@ -175,14 +177,15 @@ private[hive] class HiveClientImpl( // has hive-site.xml. So, HiveConf will use that to override its default values. // 2: we set all spark confs to this hiveConf. // 3: we set all entries in config to this hiveConf. 
- (hadoopConf.iterator().asScala.map(kv => kv.getKey -> kv.getValue) - ++ sparkConf.getAll.toMap ++ extraConfig).foreach { case (k, v) => + val confMap = (hadoopConf.iterator().asScala.map(kv => kv.getKey -> kv.getValue) ++ + sparkConf.getAll.toMap ++ extraConfig).toMap + confMap.foreach { case (k, v) => hiveConf.set(k, v) } + SQLConf.get.redactOptions(confMap).foreach { case (k, v) => logDebug( s""" |Applying Hadoop/Hive/Spark and extra properties to Hive Conf: - |$k=${if (k.toLowerCase(Locale.ROOT).contains("password")) "xxx" else v} + |$k=$v """.stripMargin) - hiveConf.set(k, v) } // Disable CBO because we removed the Calcite dependency. hiveConf.setBoolean("hive.cbo.enable", false) @@ -190,6 +193,13 @@ private[hive] class HiveClientImpl( if (clientLoader.cachedHive != null) { Hive.set(clientLoader.cachedHive.asInstanceOf[Hive]) } + // Hive 2.3 will set UDFClassLoader to hiveConf when initializing SessionState + // since HIVE-11878, and ADDJarCommand will add jars to clientLoader.classLoader. + // For this reason we cannot load the jars added by ADDJarCommand because of class loader + // got changed. We reset it to clientLoader.ClassLoader here. 
+ if (HiveUtils.isHive23) { + state.getConf.setClassLoader(clientLoader.classLoader) + } SessionState.start(state) state.out = new PrintStream(outputBuffer, true, UTF_8.name()) state.err = new PrintStream(outputBuffer, true, UTF_8.name()) @@ -221,7 +231,7 @@ private[hive] class HiveClientImpl( hiveConf } - private val userName = conf.getUser + override val userName = UserGroupInformation.getCurrentUser.getShortUserName override def getConf(key: String, defaultValue: String): String = { conf.get(key, defaultValue) @@ -345,13 +355,8 @@ private[hive] class HiveClientImpl( override def createDatabase( database: CatalogDatabase, ignoreIfExists: Boolean): Unit = withHiveState { - client.createDatabase( - new HiveDatabase( - database.name, - database.description, - CatalogUtils.URIToString(database.locationUri), - Option(database.properties).map(_.asJava).orNull), - ignoreIfExists) + val hiveDb = toHiveDatabase(database, Some(userName)) + client.createDatabase(hiveDb, ignoreIfExists) } override def dropDatabase( @@ -362,22 +367,41 @@ private[hive] class HiveClientImpl( } override def alterDatabase(database: CatalogDatabase): Unit = withHiveState { - client.alterDatabase( + if (!getDatabase(database.name).locationUri.equals(database.locationUri)) { + // SPARK-29260: Enable supported versions once it support altering database location. 
+ if (!(version.equals(hive.v3_0) || version.equals(hive.v3_1))) { + throw new AnalysisException( + s"Hive ${version.fullVersion} does not support altering database location") + } + } + val hiveDb = toHiveDatabase(database) + client.alterDatabase(database.name, hiveDb) + } + + private def toHiveDatabase( + database: CatalogDatabase, userName: Option[String] = None): HiveDatabase = { + val props = database.properties + val hiveDb = new HiveDatabase( database.name, - new HiveDatabase( - database.name, - database.description, - CatalogUtils.URIToString(database.locationUri), - Option(database.properties).map(_.asJava).orNull)) + database.description, + CatalogUtils.URIToString(database.locationUri), + (props -- Seq(PROP_OWNER)).asJava) + props.get(PROP_OWNER).orElse(userName).foreach { ownerName => + shim.setDatabaseOwnerName(hiveDb, ownerName) + } + hiveDb } override def getDatabase(dbName: String): CatalogDatabase = withHiveState { Option(client.getDatabase(dbName)).map { d => + val paras = Option(d.getParameters).map(_.asScala.toMap).getOrElse(Map()) ++ + Map(PROP_OWNER -> shim.getDatabaseOwnerName(d)) + CatalogDatabase( name = d.getName, description = Option(d.getDescription).getOrElse(""), locationUri = CatalogUtils.stringToURI(d.getLocationUri), - properties = Option(d.getParameters).map(_.asScala.toMap).orNull) + properties = paras) }.getOrElse(throw new NoSuchDatabaseException(dbName)) } @@ -423,8 +447,13 @@ private[hive] class HiveClientImpl( private def convertHiveTableToCatalogTable(h: HiveTable): CatalogTable = { // Note: Hive separates partition columns and the schema, but for us the // partition columns are part of the schema - val cols = h.getCols.asScala.map(fromHiveColumn) - val partCols = h.getPartCols.asScala.map(fromHiveColumn) + val (cols, partCols) = try { + (h.getCols.asScala.map(fromHiveColumn), h.getPartCols.asScala.map(fromHiveColumn)) + } catch { + case ex: SparkException => + throw new SparkException( + s"${ex.getMessage}, db: 
${h.getDbName}, table: ${h.getTableName}", ex) + } val schema = StructType(cols ++ partCols) val bucketSpec = if (h.getNumBuckets > 0) { @@ -965,7 +994,8 @@ private[hive] object HiveClientImpl { CatalystSqlParser.parseDataType(hc.getType) } catch { case e: ParseException => - throw new SparkException("Cannot recognize hive type string: " + hc.getType, e) + throw new SparkException( + s"Cannot recognize hive type string: ${hc.getType}, column: ${hc.getName}", e) } } @@ -1021,7 +1051,7 @@ private[hive] object HiveClientImpl { } hiveTable.setFields(schema.asJava) hiveTable.setPartCols(partCols.asJava) - userName.foreach(hiveTable.setOwner) + Option(table.owner).filter(_.nonEmpty).orElse(userName).foreach(hiveTable.setOwner) hiveTable.setCreateTime(MILLISECONDS.toSeconds(table.createTime).toInt) hiveTable.setLastAccessTime(MILLISECONDS.toSeconds(table.lastAccessTime).toInt) table.storage.locationUri.map(CatalogUtils.URIToString).foreach { loc => @@ -1042,7 +1072,7 @@ private[hive] object HiveClientImpl { } table.bucketSpec match { - case Some(bucketSpec) if DDLUtils.isHiveTable(table) => + case Some(bucketSpec) if !HiveExternalCatalog.isDatasourceTable(table) => hiveTable.setNumBuckets(bucketSpec.numBuckets) hiveTable.setBucketCols(bucketSpec.bucketColumnNames.toList.asJava) @@ -1155,9 +1185,10 @@ private[hive] object HiveClientImpl { * Note that this statistics could be overridden by Spark's statistics if that's available. 
*/ private def readHiveStats(properties: Map[String, String]): Option[CatalogStatistics] = { - val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_)) - val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_)) - val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)) + val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).filter(_.nonEmpty).map(BigInt(_)) + val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).filter(_.nonEmpty) + .map(BigInt(_)) + val rowCount = properties.get(StatsSetupConst.ROW_COUNT).filter(_.nonEmpty).map(BigInt(_)) // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be // relatively cheap if parameters for the table are populated into the metastore. // Currently, only totalSize, rawDataSize, and rowCount are used to build the field `stats` diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 586fbbefade46..50ce536a160c8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -29,8 +29,7 @@ import scala.util.control.NonFatal import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.IMetaStoreClient -import org.apache.hadoop.hive.metastore.api.{EnvironmentContext, Function => HiveFunction, FunctionType} -import org.apache.hadoop.hive.metastore.api.{MetaException, PrincipalType, ResourceType, ResourceUri} +import org.apache.hadoop.hive.metastore.api.{Database, EnvironmentContext, Function => HiveFunction, FunctionType, MetaException, PrincipalType, ResourceType, ResourceUri} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.io.AcidUtils import org.apache.hadoop.hive.ql.metadata.{Hive, HiveException, Partition, Table} @@ -154,6 +153,10 @@ 
private[client] sealed abstract class Shim { deleteData: Boolean, purge: Boolean): Unit + def getDatabaseOwnerName(db: Database): String + + def setDatabaseOwnerName(db: Database, owner: String): Unit + protected def findStaticMethod(klass: Class[_], name: String, args: Class[_]*): Method = { val method = findMethod(klass, name, args: _*) require(Modifier.isStatic(method.getModifiers()), @@ -456,6 +459,10 @@ private[client] class Shim_v0_12 extends Shim with Logging { def listFunctions(hive: Hive, db: String, pattern: String): Seq[String] = { Seq.empty[String] } + + override def getDatabaseOwnerName(db: Database): String = "" + + override def setDatabaseOwnerName(db: Database, owner: String): Unit = {} } private[client] class Shim_v0_13 extends Shim_v0_12 { @@ -493,6 +500,17 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { "getResults", classOf[JList[Object]]) + private lazy val getDatabaseOwnerNameMethod = + findMethod( + classOf[Database], + "getOwnerName") + + private lazy val setDatabaseOwnerNameMethod = + findMethod( + classOf[Database], + "setOwnerName", + classOf[String]) + override def setCurrentSessionState(state: SessionState): Unit = setCurrentSessionStateMethod.invoke(null, state) @@ -666,7 +684,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } } - object NonVarcharAttribute { + object SupportedAttribute { // hive varchar is treated as catalyst string, but hive varchar can't be pushed down. 
private val varcharKeys = table.getPartitionKeys.asScala .filter(col => col.getType.startsWith(serdeConstants.VARCHAR_TYPE_NAME) || @@ -676,8 +694,10 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { def unapply(attr: Attribute): Option[String] = { if (varcharKeys.contains(attr.name)) { None - } else { + } else if (attr.dataType.isInstanceOf[IntegralType] || attr.dataType == StringType) { Some(attr.name) + } else { + None } } } @@ -700,20 +720,20 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } def convert(expr: Expression): Option[String] = expr match { - case In(ExtractAttribute(NonVarcharAttribute(name)), ExtractableLiterals(values)) + case In(ExtractAttribute(SupportedAttribute(name)), ExtractableLiterals(values)) if useAdvanced => Some(convertInToOr(name, values)) - case InSet(ExtractAttribute(NonVarcharAttribute(name)), ExtractableValues(values)) + case InSet(ExtractAttribute(SupportedAttribute(name)), ExtractableValues(values)) if useAdvanced => Some(convertInToOr(name, values)) case op @ SpecialBinaryComparison( - ExtractAttribute(NonVarcharAttribute(name)), ExtractableLiteral(value)) => + ExtractAttribute(SupportedAttribute(name)), ExtractableLiteral(value)) => Some(s"$name ${op.symbol} $value") case op @ SpecialBinaryComparison( - ExtractableLiteral(value), ExtractAttribute(NonVarcharAttribute(name))) => + ExtractableLiteral(value), ExtractAttribute(SupportedAttribute(name))) => Some(s"$value ${op.symbol} $name") case And(expr1, expr2) if useAdvanced => @@ -809,6 +829,13 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } } + override def getDatabaseOwnerName(db: Database): String = { + Option(getDatabaseOwnerNameMethod.invoke(db)).map(_.asInstanceOf[String]).getOrElse("") + } + + override def setDatabaseOwnerName(db: Database, owner: String): Unit = { + setDatabaseOwnerNameMethod.invoke(db, owner) + } } private[client] class Shim_v0_14 extends Shim_v0_13 { diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 6f60bb7c9c74d..5da7b70cfc7aa 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -36,6 +36,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.internal.NonClosableMutableURLClassLoader +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.{MutableURLClassLoader, Utils} /** Factory for `IsolatedClientLoader` with specific versions of hive. */ @@ -60,9 +61,10 @@ private[hive] object IsolatedClientLoader extends Logging { val files = if (resolvedVersions.contains((resolvedVersion, hadoopVersion))) { resolvedVersions((resolvedVersion, hadoopVersion)) } else { + val remoteRepos = sparkConf.get(SQLConf.ADDITIONAL_REMOTE_REPOSITORIES) val (downloadedFiles, actualHadoopVersion) = try { - (downloadVersion(resolvedVersion, hadoopVersion, ivyPath), hadoopVersion) + (downloadVersion(resolvedVersion, hadoopVersion, ivyPath, remoteRepos), hadoopVersion) } catch { case e: RuntimeException if e.getMessage.contains("hadoop") => // If the error message contains hadoop, it is probably because the hadoop @@ -74,7 +76,8 @@ private[hive] object IsolatedClientLoader extends Logging { "It is recommended to set jars used by Hive metastore client through " + "spark.sql.hive.metastore.jars in the production environment.") _sharesHadoopClasses = false - (downloadVersion(resolvedVersion, fallbackVersion, ivyPath), fallbackVersion) + (downloadVersion( + resolvedVersion, fallbackVersion, ivyPath, remoteRepos), fallbackVersion) } resolvedVersions.put((resolvedVersion, actualHadoopVersion), downloadedFiles) resolvedVersions((resolvedVersion, 
actualHadoopVersion)) @@ -112,7 +115,8 @@ private[hive] object IsolatedClientLoader extends Logging { private def downloadVersion( version: HiveVersion, hadoopVersion: String, - ivyPath: Option[String]): Seq[URL] = { + ivyPath: Option[String], + remoteRepos: String): Seq[URL] = { val hiveArtifacts = version.extraDeps ++ Seq("hive-metastore", "hive-exec", "hive-common", "hive-serde") .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++ @@ -123,7 +127,7 @@ private[hive] object IsolatedClientLoader extends Logging { SparkSubmitUtils.resolveMavenCoordinates( hiveArtifacts.mkString(","), SparkSubmitUtils.buildIvySettings( - Some("http://www.datanucleus.org/downloads/maven2"), + Some(remoteRepos), ivyPath), exclusions = version.exclusions) } @@ -158,7 +162,7 @@ private[hive] object IsolatedClientLoader extends Logging { * @param execJars A collection of jar files that must include hive and hadoop. * @param config A set of options that will be added to the HiveConf of the constructed client. * @param isolationOn When true, custom versions of barrier classes will be constructed. Must be - * true unless loading the version of hive that is on Sparks classloader. + * true unless loading the version of hive that is on Spark's classloader. * @param sharesHadoopClasses When true, we will share Hadoop classes between Spark and * @param baseClassLoader The spark classloader that is used to load shared classes. 
*/ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index 27071075b4165..c51c521cacba0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -130,10 +130,15 @@ class HiveOutputWriter( new Path(path), Reporter.NULL) + /** + * Since SPARK-30201 ObjectInspectorCopyOption.JAVA change to ObjectInspectorCopyOption.DEFAULT. + * The reason is DEFAULT option can convert `UTF8String` to `Text` with bytes and + * we can compatible with non UTF-8 code bytes during write. + */ private val standardOI = ObjectInspectorUtils .getStandardObjectInspector( tableDesc.getDeserializer(jobConf).getObjectInspector, - ObjectInspectorCopyOption.JAVA) + ObjectInspectorCopyOption.DEFAULT) .asInstanceOf[StructObjectInspector] private val fieldOIs = diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala index 5b00e2ebafa43..4dccacef337e9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala @@ -146,7 +146,7 @@ case class HiveTableScanExec( * @param partitions All partitions of the relation. * @return Partitions that are involved in the query plan. 
*/ - private[hive] def prunePartitions(partitions: Seq[HivePartition]) = { + private[hive] def prunePartitions(partitions: Seq[HivePartition]): Seq[HivePartition] = { boundPruningPred match { case None => partitions case Some(shouldKeep) => partitions.filter { part => @@ -162,18 +162,36 @@ case class HiveTableScanExec( } } + @transient lazy val prunedPartitions: Seq[HivePartition] = { + if (relation.prunedPartitions.nonEmpty) { + val hivePartitions = + relation.prunedPartitions.get.map(HiveClientImpl.toHivePartition(_, hiveQlTable)) + if (partitionPruningPred.forall(!ExecSubqueryExpression.hasSubquery(_))) { + hivePartitions + } else { + prunePartitions(hivePartitions) + } + } else { + if (sparkSession.sessionState.conf.metastorePartitionPruning && + partitionPruningPred.nonEmpty) { + rawPartitions + } else { + prunePartitions(rawPartitions) + } + } + } + // exposed for tests - @transient lazy val rawPartitions = { + @transient lazy val rawPartitions: Seq[HivePartition] = { val prunedPartitions = if (sparkSession.sessionState.conf.metastorePartitionPruning && - partitionPruningPred.size > 0) { + partitionPruningPred.nonEmpty) { // Retrieve the original attributes based on expression ID so that capitalization matches. 
val normalizedFilters = partitionPruningPred.map(_.transform { case a: AttributeReference => originalAttributes(a) }) - sparkSession.sessionState.catalog.listPartitionsByFilter( - relation.tableMeta.identifier, - normalizedFilters) + sparkSession.sessionState.catalog + .listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters) } else { sparkSession.sessionState.catalog.listPartitions(relation.tableMeta.identifier) } @@ -189,7 +207,7 @@ case class HiveTableScanExec( } } else { Utils.withDummyCallSite(sqlContext.sparkContext) { - hadoopReader.makeRDDForPartitionedTable(prunePartitions(rawPartitions)) + hadoopReader.makeRDDForPartitionedTable(prunedPartitions) } } val numOutputRows = longMetric("numOutputRows") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index ee1734b1f232c..801be64702519 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, Row, SparkSession} -import org.apache.spark.sql.catalyst.catalog.{CatalogTable, ExternalCatalog} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalog, ExternalCatalogUtils} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan @@ -199,7 +199,7 @@ case class InsertIntoHiveTable( attr.withName(name.toLowerCase(Locale.ROOT)) } - saveAsHiveFile( + val writtenParts = saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, @@ -209,6 +209,42 @@ case class InsertIntoHiveTable( if (partition.nonEmpty) { if 
(numDynamicPartitions > 0) { + if (overwrite && table.tableType == CatalogTableType.EXTERNAL) { + // SPARK-29295: When insert overwrite to a Hive external table partition, if the + // partition does not exist, Hive will not check if the external partition directory + // exists or not before copying files. So if users drop the partition, and then do + // insert overwrite to the same partition, the partition will have both old and new + // data. We construct partition path. If the path exists, we delete it manually. + writtenParts.foreach { partPath => + val dpMap = partPath.split("/").map { part => + val splitPart = part.split("=") + assert(splitPart.size == 2, s"Invalid written partition path: $part") + ExternalCatalogUtils.unescapePathName(splitPart(0)) -> + ExternalCatalogUtils.unescapePathName(splitPart(1)) + }.toMap + + val updatedPartitionSpec = partition.map { + case (key, Some(value)) => key -> value + case (key, None) if dpMap.contains(key) => key -> dpMap(key) + case (key, _) => + throw new SparkException(s"Dynamic partition key $key is not among " + + "written partition paths.") + } + val partitionColumnNames = table.partitionColumnNames + val tablePath = new Path(table.location) + val partitionPath = ExternalCatalogUtils.generatePartitionPath(updatedPartitionSpec, + partitionColumnNames, tablePath) + + val fs = partitionPath.getFileSystem(hadoopConf) + if (fs.exists(partitionPath)) { + if (!fs.delete(partitionPath, true)) { + throw new RuntimeException( + "Cannot remove partition directory '" + partitionPath.toString) + } + } + } + } + externalCatalog.loadDynamicPartitions( db = table.database, table = table.identifier.table, @@ -230,18 +266,32 @@ case class InsertIntoHiveTable( var doHiveOverwrite = overwrite if (oldPart.isEmpty || !ifPartitionNotExists) { + // SPARK-29295: When insert overwrite to a Hive external table partition, if the + // partition does not exist, Hive will not check if the external partition directory + // exists or not before 
copying files. So if users drop the partition, and then do + // insert overwrite to the same partition, the partition will have both old and new + // data. We construct partition path. If the path exists, we delete it manually. + val partitionPath = if (oldPart.isEmpty && overwrite + && table.tableType == CatalogTableType.EXTERNAL) { + val partitionColumnNames = table.partitionColumnNames + val tablePath = new Path(table.location) + Some(ExternalCatalogUtils.generatePartitionPath(partitionSpec, + partitionColumnNames, tablePath)) + } else { + oldPart.flatMap(_.storage.locationUri.map(uri => new Path(uri))) + } + // SPARK-18107: Insert overwrite runs much slower than hive-client. // Newer Hive largely improves insert overwrite performance. As Spark uses older Hive // version and we may not want to catch up new Hive version every time. We delete the // Hive partition first and then load data file into the Hive partition. - if (oldPart.nonEmpty && overwrite) { - oldPart.get.storage.locationUri.foreach { uri => - val partitionPath = new Path(uri) - val fs = partitionPath.getFileSystem(hadoopConf) - if (fs.exists(partitionPath)) { - if (!fs.delete(partitionPath, true)) { + if (partitionPath.nonEmpty && overwrite) { + partitionPath.foreach { path => + val fs = path.getFileSystem(hadoopConf) + if (fs.exists(path)) { + if (!fs.delete(path, true)) { throw new RuntimeException( - "Cannot remove partition directory '" + partitionPath.toString) + "Cannot remove partition directory '" + path.toString) } // Don't let Hive do overwrite operation since it is slower. 
doHiveOverwrite = false diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala new file mode 100644 index 0000000000000..da6e4c52cf3a7 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitions.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution + +import org.apache.hadoop.hive.common.StatsSetupConst + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.CastSupport +import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, CatalogTablePartition, ExternalCatalogUtils, HiveTableRelation} +import org.apache.spark.sql.catalyst.expressions.{And, AttributeSet, Expression, ExpressionSet, SubqueryExpression} +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.datasources.DataSourceStrategy +import org.apache.spark.sql.internal.SQLConf + +/** + * Prune hive table partitions using partition filters on [[HiveTableRelation]]. The pruned + * partitions will be kept in [[HiveTableRelation.prunedPartitions]], and the statistics of + * the hive table relation will be updated based on pruned partitions. + * + * This rule is executed in optimization phase, so the statistics can be updated before physical + * planning, which is useful for some spark strategy, eg. + * [[org.apache.spark.sql.execution.SparkStrategies.JoinSelection]]. + * + * TODO: merge this with PruneFileSourcePartitions after we completely make hive as a data source. + */ +private[sql] class PruneHiveTablePartitions(session: SparkSession) + extends Rule[LogicalPlan] with CastSupport { + + override val conf: SQLConf = session.sessionState.conf + + /** + * Extract the partition filters from the filters on the table. 
+ */ + private def getPartitionKeyFilters( + filters: Seq[Expression], + relation: HiveTableRelation): ExpressionSet = { + val normalizedFilters = DataSourceStrategy.normalizeExprs( + filters.filter(f => f.deterministic && !SubqueryExpression.hasSubquery(f)), relation.output) + val partitionColumnSet = AttributeSet(relation.partitionCols) + ExpressionSet(normalizedFilters.filter { f => + !f.references.isEmpty && f.references.subsetOf(partitionColumnSet) + }) + } + + /** + * Prune the hive table using filters on the partitions of the table. + */ + private def prunePartitions( + relation: HiveTableRelation, + partitionFilters: ExpressionSet): Seq[CatalogTablePartition] = { + if (conf.metastorePartitionPruning) { + session.sessionState.catalog.listPartitionsByFilter( + relation.tableMeta.identifier, partitionFilters.toSeq) + } else { + ExternalCatalogUtils.prunePartitionsByFilter(relation.tableMeta, + session.sessionState.catalog.listPartitions(relation.tableMeta.identifier), + partitionFilters.toSeq, conf.sessionLocalTimeZone) + } + } + + /** + * Update the statistics of the table. 
+ */ + private def updateTableMeta( + tableMeta: CatalogTable, + prunedPartitions: Seq[CatalogTablePartition]): CatalogTable = { + val sizeOfPartitions = prunedPartitions.map { partition => + val rawDataSize = partition.parameters.get(StatsSetupConst.RAW_DATA_SIZE).map(_.toLong) + val totalSize = partition.parameters.get(StatsSetupConst.TOTAL_SIZE).map(_.toLong) + if (rawDataSize.isDefined && rawDataSize.get > 0) { + rawDataSize.get + } else if (totalSize.isDefined && totalSize.get > 0L) { + totalSize.get + } else { + 0L + } + } + if (sizeOfPartitions.forall(_ > 0)) { + val sizeInBytes = sizeOfPartitions.sum + tableMeta.copy(stats = Some(CatalogStatistics(sizeInBytes = BigInt(sizeInBytes)))) + } else { + tableMeta + } + } + + override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + case op @ PhysicalOperation(projections, filters, relation: HiveTableRelation) + if filters.nonEmpty && relation.isPartitioned && relation.prunedPartitions.isEmpty => + val partitionKeyFilters = getPartitionKeyFilters(filters, relation) + if (partitionKeyFilters.nonEmpty) { + val newPartitions = prunePartitions(relation, partitionKeyFilters) + val newTableMeta = updateTableMeta(relation.tableMeta, newPartitions) + val newRelation = relation.copy( + tableMeta = newTableMeta, prunedPartitions = Some(newPartitions)) + // Keep partition filters so that they are visible in physical planning + Project(projections, Filter(filters.reduceLeft(And), newRelation)) + } else { + op + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala index e12f663304e7a..40f7b4e8db7c5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala @@ -94,9 +94,8 @@ case class ScriptTransformationExec( // This 
new thread will consume the ScriptTransformation's input rows and write them to the // external process. That process's output will be read by this current thread. val writerThread = new ScriptTransformationWriterThread( - inputIterator, + inputIterator.map(outputProjection), input.map(_.dataType), - outputProjection, inputSerde, inputSoi, ioschema, @@ -249,16 +248,15 @@ case class ScriptTransformationExec( private class ScriptTransformationWriterThread( iter: Iterator[InternalRow], inputSchema: Seq[DataType], - outputProjection: Projection, @Nullable inputSerde: AbstractSerDe, - @Nullable inputSoi: ObjectInspector, + @Nullable inputSoi: StructObjectInspector, ioschema: HiveScriptIOSchema, outputStream: OutputStream, proc: Process, stderrBuffer: CircularBuffer, taskContext: TaskContext, conf: Configuration - ) extends Thread("Thread-ScriptTransformation-Feed") with Logging { + ) extends Thread("Thread-ScriptTransformation-Feed") with HiveInspectors with Logging { setDaemon(true) @@ -278,8 +276,8 @@ private class ScriptTransformationWriterThread( var threwException: Boolean = true val len = inputSchema.length try { - iter.map(outputProjection).foreach { row => - if (inputSerde == null) { + if (inputSerde == null) { + iter.foreach { row => val data = if (len == 0) { ioschema.inputRowFormatMap("TOK_TABLEROWFORMATLINES") } else { @@ -295,10 +293,21 @@ private class ScriptTransformationWriterThread( sb.toString() } outputStream.write(data.getBytes(StandardCharsets.UTF_8)) - } else { - val writable = inputSerde.serialize( - row.asInstanceOf[GenericInternalRow].values, inputSoi) + } + } else { + // Convert Spark InternalRows to hive data via `HiveInspectors.wrapperFor`. 
+ val hiveData = new Array[Any](inputSchema.length) + val fieldOIs = inputSoi.getAllStructFieldRefs.asScala.map(_.getFieldObjectInspector).toArray + val wrappers = fieldOIs.zip(inputSchema).map { case (f, dt) => wrapperFor(f, dt) } + + iter.foreach { row => + var i = 0 + while (i < fieldOIs.length) { + hiveData(i) = if (row.isNullAt(i)) null else wrappers(i)(row.get(i, inputSchema(i))) + i += 1 + } + val writable = inputSerde.serialize(hiveData, inputSoi) if (scriptInputWriter != null) { scriptInputWriter.write(writable) } else { @@ -374,14 +383,13 @@ case class HiveScriptIOSchema ( val outputRowFormatMap = outputRowFormat.toMap.withDefault((k) => defaultFormat(k)) - def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, ObjectInspector)] = { + def initInputSerDe(input: Seq[Expression]): Option[(AbstractSerDe, StructObjectInspector)] = { inputSerdeClass.map { serdeClass => val (columns, columnTypes) = parseAttrs(input) val serde = initSerDe(serdeClass, columns, columnTypes, inputSerdeProps) val fieldObjectInspectors = columnTypes.map(toInspector) val objectInspector = ObjectInspectorFactory .getStandardStructObjectInspector(columns.asJava, fieldObjectInspectors.asJava) - .asInstanceOf[ObjectInspector] (serde, objectInspector) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index d78fc9da9f8a5..05d608a2016a5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -247,7 +247,7 @@ private[hive] case class HiveGenericUDTF( protected class UDTFCollector extends Collector { var collected = new ArrayBuffer[InternalRow] - override def collect(input: java.lang.Object) { + override def collect(input: java.lang.Object): Unit = { // We need to clone the input here because implementations of // GenericUDTF reuse the same object. 
Luckily they are always an array, so // it is easy to clone. diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala index db074361ef03c..14276c9b583f2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/package.scala @@ -23,8 +23,5 @@ package org.apache.spark.sql * - Using HiveQL to express queries. * - Reading metadata from the Hive Metastore using HiveSerDes. * - Hive UDFs, UDAs, UDTs - * - * Users that would like access to this functionality should create a - * [[hive.HiveContext HiveContext]] instead of a [[SQLContext]]. */ package object hive diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java index 636ce10da3734..2b532389bafb6 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaDataFrameSuite.java @@ -22,7 +22,6 @@ import java.util.List; import org.junit.After; -import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -39,10 +38,7 @@ public class JavaDataFrameSuite { Dataset df; private static void checkAnswer(Dataset actual, List expected) { - String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); - if (errorMessage != null) { - Assert.fail(errorMessage); - } + QueryTest$.MODULE$.checkAnswer(actual, expected); } @Before diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index 25bd4d0017bd8..d433386a6c19a 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -27,7 +27,6 @@ import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.After; -import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -38,9 +37,6 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.hive.test.TestHive$; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.catalyst.TableIdentifier; import org.apache.spark.util.Utils; @@ -54,13 +50,6 @@ public class JavaMetastoreDataSourcesSuite { FileSystem fs; Dataset df; - private static void checkAnswer(Dataset actual, List expected) { - String errorMessage = QueryTest$.MODULE$.checkAnswer(actual, expected); - if (errorMessage != null) { - Assert.fail(errorMessage); - } - } - @Before public void setUp() throws IOException { sqlContext = TestHive$.MODULE$; @@ -94,57 +83,6 @@ public void tearDown() throws IOException { } } - @Test - public void saveExternalTableAndQueryIt() { - Map options = new HashMap<>(); - options.put("path", path.toString()); - df.write() - .format("org.apache.spark.sql.json") - .mode(SaveMode.Append) - .options(options) - .saveAsTable("javaSavedTable"); - - checkAnswer( - sqlContext.sql("SELECT * FROM javaSavedTable"), - df.collectAsList()); - - Dataset loadedDF = - sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", options); - - checkAnswer(loadedDF, df.collectAsList()); - checkAnswer( - sqlContext.sql("SELECT * FROM externalTable"), - df.collectAsList()); - } - - @Test - public void saveExternalTableWithSchemaAndQueryIt() { - Map options = new HashMap<>(); - options.put("path", path.toString()); - df.write() - .format("org.apache.spark.sql.json") - .mode(SaveMode.Append) - .options(options) - .saveAsTable("javaSavedTable"); - - checkAnswer( - sqlContext.sql("SELECT * FROM javaSavedTable"), - df.collectAsList()); - - List fields = 
new ArrayList<>(); - fields.add(DataTypes.createStructField("b", DataTypes.StringType, true)); - StructType schema = DataTypes.createStructType(fields); - Dataset loadedDF = - sqlContext.createExternalTable("externalTable", "org.apache.spark.sql.json", schema, options); - - checkAnswer( - loadedDF, - sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList()); - checkAnswer( - sqlContext.sql("SELECT * FROM externalTable"), - sqlContext.sql("SELECT b FROM javaSavedTable").collectAsList()); - } - @Test public void saveTableAndQueryIt() { Map options = new HashMap<>(); @@ -154,7 +92,7 @@ public void saveTableAndQueryIt() { .options(options) .saveAsTable("javaSavedTable"); - checkAnswer( + QueryTest$.MODULE$.checkAnswer( sqlContext.sql("SELECT * FROM javaSavedTable"), df.collectAsList()); } diff --git a/sql/hive/src/test/noclasspath/README b/sql/hive/src/test/noclasspath/README new file mode 100644 index 0000000000000..8ce1b0bd09668 --- /dev/null +++ b/sql/hive/src/test/noclasspath/README @@ -0,0 +1 @@ +Place files which are being used as resources of tests but shouldn't be added to classpath. 
\ No newline at end of file diff --git a/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar b/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar new file mode 100644 index 0000000000000..b73b17d5c7880 Binary files /dev/null and b/sql/hive/src/test/noclasspath/TestUDTF-spark-26560.jar differ diff --git a/sql/hive/src/test/resources/golden/Partition pruning - with filter containing non-deterministic condition - query test-0-56a1c59bd13c2a83a91eb0ec658fcecc b/sql/hive/src/test/resources/golden/Partition pruning - with filter containing non-deterministic condition - query test-0-56a1c59bd13c2a83a91eb0ec658fcecc new file mode 100644 index 0000000000000..0fe6b905e7781 --- /dev/null +++ b/sql/hive/src/test/resources/golden/Partition pruning - with filter containing non-deterministic condition - query test-0-56a1c59bd13c2a83a91eb0ec658fcecc @@ -0,0 +1,500 @@ +val_238 11 +val_86 11 +val_311 11 +val_27 11 +val_165 11 +val_409 11 +val_255 11 +val_278 11 +val_98 11 +val_484 11 +val_265 11 +val_193 11 +val_401 11 +val_150 11 +val_273 11 +val_224 11 +val_369 11 +val_66 11 +val_128 11 +val_213 11 +val_146 11 +val_406 11 +val_429 11 +val_374 11 +val_152 11 +val_469 11 +val_145 11 +val_495 11 +val_37 11 +val_327 11 +val_281 11 +val_277 11 +val_209 11 +val_15 11 +val_82 11 +val_403 11 +val_166 11 +val_417 11 +val_430 11 +val_252 11 +val_292 11 +val_219 11 +val_287 11 +val_153 11 +val_193 11 +val_338 11 +val_446 11 +val_459 11 +val_394 11 +val_237 11 +val_482 11 +val_174 11 +val_413 11 +val_494 11 +val_207 11 +val_199 11 +val_466 11 +val_208 11 +val_174 11 +val_399 11 +val_396 11 +val_247 11 +val_417 11 +val_489 11 +val_162 11 +val_377 11 +val_397 11 +val_309 11 +val_365 11 +val_266 11 +val_439 11 +val_342 11 +val_367 11 +val_325 11 +val_167 11 +val_195 11 +val_475 11 +val_17 11 +val_113 11 +val_155 11 +val_203 11 +val_339 11 +val_0 11 +val_455 11 +val_128 11 +val_311 11 +val_316 11 +val_57 11 +val_302 11 +val_205 11 +val_149 11 +val_438 11 +val_345 11 +val_129 11 +val_170 11 
+val_20 11 +val_489 11 +val_157 11 +val_378 11 +val_221 11 +val_92 11 +val_111 11 +val_47 11 +val_72 11 +val_4 11 +val_280 11 +val_35 11 +val_427 11 +val_277 11 +val_208 11 +val_356 11 +val_399 11 +val_169 11 +val_382 11 +val_498 11 +val_125 11 +val_386 11 +val_437 11 +val_469 11 +val_192 11 +val_286 11 +val_187 11 +val_176 11 +val_54 11 +val_459 11 +val_51 11 +val_138 11 +val_103 11 +val_239 11 +val_213 11 +val_216 11 +val_430 11 +val_278 11 +val_176 11 +val_289 11 +val_221 11 +val_65 11 +val_318 11 +val_332 11 +val_311 11 +val_275 11 +val_137 11 +val_241 11 +val_83 11 +val_333 11 +val_180 11 +val_284 11 +val_12 11 +val_230 11 +val_181 11 +val_67 11 +val_260 11 +val_404 11 +val_384 11 +val_489 11 +val_353 11 +val_373 11 +val_272 11 +val_138 11 +val_217 11 +val_84 11 +val_348 11 +val_466 11 +val_58 11 +val_8 11 +val_411 11 +val_230 11 +val_208 11 +val_348 11 +val_24 11 +val_463 11 +val_431 11 +val_179 11 +val_172 11 +val_42 11 +val_129 11 +val_158 11 +val_119 11 +val_496 11 +val_0 11 +val_322 11 +val_197 11 +val_468 11 +val_393 11 +val_454 11 +val_100 11 +val_298 11 +val_199 11 +val_191 11 +val_418 11 +val_96 11 +val_26 11 +val_165 11 +val_327 11 +val_230 11 +val_205 11 +val_120 11 +val_131 11 +val_51 11 +val_404 11 +val_43 11 +val_436 11 +val_156 11 +val_469 11 +val_468 11 +val_308 11 +val_95 11 +val_196 11 +val_288 11 +val_481 11 +val_457 11 +val_98 11 +val_282 11 +val_197 11 +val_187 11 +val_318 11 +val_318 11 +val_409 11 +val_470 11 +val_137 11 +val_369 11 +val_316 11 +val_169 11 +val_413 11 +val_85 11 +val_77 11 +val_0 11 +val_490 11 +val_87 11 +val_364 11 +val_179 11 +val_118 11 +val_134 11 +val_395 11 +val_282 11 +val_138 11 +val_238 11 +val_419 11 +val_15 11 +val_118 11 +val_72 11 +val_90 11 +val_307 11 +val_19 11 +val_435 11 +val_10 11 +val_277 11 +val_273 11 +val_306 11 +val_224 11 +val_309 11 +val_389 11 +val_327 11 +val_242 11 +val_369 11 +val_392 11 +val_272 11 +val_331 11 +val_401 11 +val_242 11 +val_452 11 +val_177 11 +val_226 11 +val_5 11 +val_497 
11 +val_402 11 +val_396 11 +val_317 11 +val_395 11 +val_58 11 +val_35 11 +val_336 11 +val_95 11 +val_11 11 +val_168 11 +val_34 11 +val_229 11 +val_233 11 +val_143 11 +val_472 11 +val_322 11 +val_498 11 +val_160 11 +val_195 11 +val_42 11 +val_321 11 +val_430 11 +val_119 11 +val_489 11 +val_458 11 +val_78 11 +val_76 11 +val_41 11 +val_223 11 +val_492 11 +val_149 11 +val_449 11 +val_218 11 +val_228 11 +val_138 11 +val_453 11 +val_30 11 +val_209 11 +val_64 11 +val_468 11 +val_76 11 +val_74 11 +val_342 11 +val_69 11 +val_230 11 +val_33 11 +val_368 11 +val_103 11 +val_296 11 +val_113 11 +val_216 11 +val_367 11 +val_344 11 +val_167 11 +val_274 11 +val_219 11 +val_239 11 +val_485 11 +val_116 11 +val_223 11 +val_256 11 +val_263 11 +val_70 11 +val_487 11 +val_480 11 +val_401 11 +val_288 11 +val_191 11 +val_5 11 +val_244 11 +val_438 11 +val_128 11 +val_467 11 +val_432 11 +val_202 11 +val_316 11 +val_229 11 +val_469 11 +val_463 11 +val_280 11 +val_2 11 +val_35 11 +val_283 11 +val_331 11 +val_235 11 +val_80 11 +val_44 11 +val_193 11 +val_321 11 +val_335 11 +val_104 11 +val_466 11 +val_366 11 +val_175 11 +val_403 11 +val_483 11 +val_53 11 +val_105 11 +val_257 11 +val_406 11 +val_409 11 +val_190 11 +val_406 11 +val_401 11 +val_114 11 +val_258 11 +val_90 11 +val_203 11 +val_262 11 +val_348 11 +val_424 11 +val_12 11 +val_396 11 +val_201 11 +val_217 11 +val_164 11 +val_431 11 +val_454 11 +val_478 11 +val_298 11 +val_125 11 +val_431 11 +val_164 11 +val_424 11 +val_187 11 +val_382 11 +val_5 11 +val_70 11 +val_397 11 +val_480 11 +val_291 11 +val_24 11 +val_351 11 +val_255 11 +val_104 11 +val_70 11 +val_163 11 +val_438 11 +val_119 11 +val_414 11 +val_200 11 +val_491 11 +val_237 11 +val_439 11 +val_360 11 +val_248 11 +val_479 11 +val_305 11 +val_417 11 +val_199 11 +val_444 11 +val_120 11 +val_429 11 +val_169 11 +val_443 11 +val_323 11 +val_325 11 +val_277 11 +val_230 11 +val_478 11 +val_178 11 +val_468 11 +val_310 11 +val_317 11 +val_333 11 +val_493 11 +val_460 11 +val_207 11 +val_249 11 
+val_265 11 +val_480 11 +val_83 11 +val_136 11 +val_353 11 +val_172 11 +val_214 11 +val_462 11 +val_233 11 +val_406 11 +val_133 11 +val_175 11 +val_189 11 +val_454 11 +val_375 11 +val_401 11 +val_421 11 +val_407 11 +val_384 11 +val_256 11 +val_26 11 +val_134 11 +val_67 11 +val_384 11 +val_379 11 +val_18 11 +val_462 11 +val_492 11 +val_100 11 +val_298 11 +val_9 11 +val_341 11 +val_498 11 +val_146 11 +val_458 11 +val_362 11 +val_186 11 +val_285 11 +val_348 11 +val_167 11 +val_18 11 +val_273 11 +val_183 11 +val_281 11 +val_344 11 +val_97 11 +val_469 11 +val_315 11 +val_84 11 +val_28 11 +val_37 11 +val_448 11 +val_152 11 +val_348 11 +val_307 11 +val_194 11 +val_414 11 +val_477 11 +val_222 11 +val_126 11 +val_90 11 +val_169 11 +val_403 11 +val_400 11 +val_200 11 +val_97 11 diff --git a/sql/hive/src/test/resources/golden/decimal_1_1-3-ac24b36077314acab595ada14e598e b/sql/hive/src/test/resources/golden/decimal_1_1-3-ac24b36077314acab595ada14e598e index 6944273be927c..f360c8c73ad1d 100644 --- a/sql/hive/src/test/resources/golden/decimal_1_1-3-ac24b36077314acab595ada14e598e +++ b/sql/hive/src/test/resources/golden/decimal_1_1-3-ac24b36077314acab595ada14e598e @@ -3,18 +3,18 @@ -0.3 -0.9 -0.9 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 0.1 0.2 0.3 diff --git a/sql/hive/src/test/resources/golden/decimal_1_1-4-128804f8dfe7dbb23be0498b91647ba3 b/sql/hive/src/test/resources/golden/decimal_1_1-4-128804f8dfe7dbb23be0498b91647ba3 index f4bf1446459a9..0fa3b15120f86 100644 --- a/sql/hive/src/test/resources/golden/decimal_1_1-4-128804f8dfe7dbb23be0498b91647ba3 +++ b/sql/hive/src/test/resources/golden/decimal_1_1-4-128804f8dfe7dbb23be0498b91647ba3 @@ -3,18 +3,18 @@ 0.3 0.2 0.1 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 -0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 +0.0 -0.1 -0.2 -0.3 diff --git a/sql/hive/src/test/resources/golden/decimal_4-6-693c2e345731f9b2b547c3b75218458e 
b/sql/hive/src/test/resources/golden/decimal_4-6-693c2e345731f9b2b547c3b75218458e index f59549a6e4a46..a298a84cb2c5a 100644 --- a/sql/hive/src/test/resources/golden/decimal_4-6-693c2e345731f9b2b547c3b75218458e +++ b/sql/hive/src/test/resources/golden/decimal_4-6-693c2e345731f9b2b547c3b75218458e @@ -1,38 +1,38 @@ NULL 0 --1234567890.123456789 -1234567890 --4400 4400 --1255.49 -1255 --1.122 -11 --1.12 -1 --1.12 -1 --0.333 0 --0.33 0 --0.3 0 -0 0 -0 0 -0 0 -0.01 0 -0.02 0 -0.1 0 -0.2 0 -0.3 0 -0.33 0 -0.333 0 +-1234567890.1234567890000000000000000 -1234567890 +-4400.0000000000000000000000000 4400 +-1255.4900000000000000000000000 -1255 +-1.1220000000000000000000000 -11 +-1.1200000000000000000000000 -1 +-1.1200000000000000000000000 -1 +-0.3330000000000000000000000 0 +-0.3300000000000000000000000 0 +-0.3000000000000000000000000 0 +0.0000000000000000000000000 0 +0.0000000000000000000000000 0 +0.0000000000000000000000000 0 +0.0100000000000000000000000 0 +0.0200000000000000000000000 0 +0.1000000000000000000000000 0 +0.2000000000000000000000000 0 +0.3000000000000000000000000 0 +0.3300000000000000000000000 0 +0.3330000000000000000000000 0 0.9999999999999999999999999 1 -1 1 -1 1 -1.12 1 -1.122 1 -2 2 -2 2 -3.14 3 -3.14 3 -3.14 3 -3.14 4 -10 10 -20 20 -100 100 -124 124 -125.2 125 -200 200 -1234567890.12345678 1234567890 +1.0000000000000000000000000 1 +1.0000000000000000000000000 1 +1.1200000000000000000000000 1 +1.1220000000000000000000000 1 +2.0000000000000000000000000 2 +2.0000000000000000000000000 2 +3.1400000000000000000000000 3 +3.1400000000000000000000000 3 +3.1400000000000000000000000 3 +3.1400000000000000000000000 4 +10.0000000000000000000000000 10 +20.0000000000000000000000000 20 +100.0000000000000000000000000 100 +124.0000000000000000000000000 124 +125.2000000000000000000000000 125 +200.0000000000000000000000000 200 +1234567890.1234567800000000000000000 1234567890 diff --git a/sql/hive/src/test/resources/golden/decimal_4-7-f1eb45492510cb76cf6b452121af8531 
b/sql/hive/src/test/resources/golden/decimal_4-7-f1eb45492510cb76cf6b452121af8531 index 6bada475c6d3d..60df68a2e3ab5 100644 --- a/sql/hive/src/test/resources/golden/decimal_4-7-f1eb45492510cb76cf6b452121af8531 +++ b/sql/hive/src/test/resources/golden/decimal_4-7-f1eb45492510cb76cf6b452121af8531 @@ -1,38 +1,38 @@ NULL NULL --1234567890.123456789 -3703703670.370370367 --4400 -13200 --1255.49 -3766.47 --1.122 -3.366 --1.12 -3.36 --1.12 -3.36 --0.333 -0.999 --0.33 -0.99 --0.3 -0.9 -0 0 -0 0 -0 0 -0.01 0.03 -0.02 0.06 -0.1 0.3 -0.2 0.6 -0.3 0.9 -0.33 0.99 -0.333 0.999 +-1234567890.1234567890000000000000000 -3703703670.3703703670000000000000000 +-4400.0000000000000000000000000 -13200.0000000000000000000000000 +-1255.4900000000000000000000000 -3766.4700000000000000000000000 +-1.1220000000000000000000000 -3.3660000000000000000000000 +-1.1200000000000000000000000 -3.3600000000000000000000000 +-1.1200000000000000000000000 -3.3600000000000000000000000 +-0.3330000000000000000000000 -0.9990000000000000000000000 +-0.3300000000000000000000000 -0.9900000000000000000000000 +-0.3000000000000000000000000 -0.9000000000000000000000000 +0.0000000000000000000000000 0.0000000000000000000000000 +0.0000000000000000000000000 0.0000000000000000000000000 +0.0000000000000000000000000 0.0000000000000000000000000 +0.0100000000000000000000000 0.0300000000000000000000000 +0.0200000000000000000000000 0.0600000000000000000000000 +0.1000000000000000000000000 0.3000000000000000000000000 +0.2000000000000000000000000 0.6000000000000000000000000 +0.3000000000000000000000000 0.9000000000000000000000000 +0.3300000000000000000000000 0.9900000000000000000000000 +0.3330000000000000000000000 0.9990000000000000000000000 0.9999999999999999999999999 2.9999999999999999999999997 -1 3 -1 3 -1.12 3.36 -1.122 3.366 -2 6 -2 6 -3.14 9.42 -3.14 9.42 -3.14 9.42 -3.14 9.42 -10 30 -20 60 -100 300 -124 372 -125.2 375.6 -200 600 -1234567890.12345678 3703703670.37037034 +1.0000000000000000000000000 3.0000000000000000000000000 
+1.0000000000000000000000000 3.0000000000000000000000000 +1.1200000000000000000000000 3.3600000000000000000000000 +1.1220000000000000000000000 3.3660000000000000000000000 +2.0000000000000000000000000 6.0000000000000000000000000 +2.0000000000000000000000000 6.0000000000000000000000000 +3.1400000000000000000000000 9.4200000000000000000000000 +3.1400000000000000000000000 9.4200000000000000000000000 +3.1400000000000000000000000 9.4200000000000000000000000 +3.1400000000000000000000000 9.4200000000000000000000000 +10.0000000000000000000000000 30.0000000000000000000000000 +20.0000000000000000000000000 60.0000000000000000000000000 +100.0000000000000000000000000 300.0000000000000000000000000 +124.0000000000000000000000000 372.0000000000000000000000000 +125.2000000000000000000000000 375.6000000000000000000000000 +200.0000000000000000000000000 600.0000000000000000000000000 +1234567890.1234567800000000000000000 3703703670.3703703400000000000000000 diff --git a/sql/hive/src/test/resources/golden/serde_regex-10-c5b3ec90419a40660e5f83736241c429 b/sql/hive/src/test/resources/golden/serde_regex-10-c5b3ec90419a40660e5f83736241c429 index 93cdc5c85645c..a26c8b7d12886 100644 --- a/sql/hive/src/test/resources/golden/serde_regex-10-c5b3ec90419a40660e5f83736241c429 +++ b/sql/hive/src/test/resources/golden/serde_regex-10-c5b3ec90419a40660e5f83736241c429 @@ -1,38 +1,38 @@ NULL 0 --1234567890.123456789 -1234567890 --4400 4400 --1255.49 -1255 --1.122 -11 --1.12 -1 --1.12 -1 --0.333 0 --0.33 0 --0.3 0 -0 0 -0 0 -0 0 -0.01 0 -0.02 0 -0.1 0 -0.2 0 -0.3 0 -0.33 0 -0.333 0 -1 1 -1 1 -1 1 -1.12 1 -1.122 1 -2 2 -2 2 -3.14 3 -3.14 3 -3.14 3 -3.14 4 -10 10 -20 20 -100 100 -124 124 -125.2 125 -200 200 -1234567890.12345678 1234567890 +-1234567890.123456789000000000 -1234567890 +-4400.000000000000000000 4400 +-1255.490000000000000000 -1255 +-1.122000000000000000 -11 +-1.120000000000000000 -1 +-1.120000000000000000 -1 +-0.333000000000000000 0 +-0.330000000000000000 0 +-0.300000000000000000 0 
+0.000000000000000000 0 +0.000000000000000000 0 +0.000000000000000000 0 +0.010000000000000000 0 +0.020000000000000000 0 +0.100000000000000000 0 +0.200000000000000000 0 +0.300000000000000000 0 +0.330000000000000000 0 +0.333000000000000000 0 +1.000000000000000000 1 +1.000000000000000000 1 +1.000000000000000000 1 +1.120000000000000000 1 +1.122000000000000000 1 +2.000000000000000000 2 +2.000000000000000000 2 +3.140000000000000000 3 +3.140000000000000000 3 +3.140000000000000000 3 +3.140000000000000000 4 +10.000000000000000000 10 +20.000000000000000000 20 +100.000000000000000000 100 +124.000000000000000000 124 +125.200000000000000000 125 +200.000000000000000000 200 +1234567890.123456780000000000 1234567890 diff --git a/sql/hive/src/test/resources/golden/windowing_navfn.q (deterministic)-2-1e88e0ba414a00195f7ebf6b8600ac04 b/sql/hive/src/test/resources/golden/windowing_navfn.q (deterministic)-2-1e88e0ba414a00195f7ebf6b8600ac04 index 62d71abc6fc7d..33ea4edf780a6 100644 --- a/sql/hive/src/test/resources/golden/windowing_navfn.q (deterministic)-2-1e88e0ba414a00195f7ebf6b8600ac04 +++ b/sql/hive/src/test/resources/golden/windowing_navfn.q (deterministic)-2-1e88e0ba414a00195f7ebf6b8600ac04 @@ -3,7 +3,7 @@ 65536 32.68 65536 33.45 65536 58.86 -65536 75.7 +65536 75.70 65536 83.48 65537 NULL 65537 4.49 @@ -57,9 +57,9 @@ 65548 75.39 65548 77.24 65549 NULL -65549 13.3 +65549 13.30 65549 28.93 -65549 50.6 +65549 50.60 65549 55.04 65549 64.91 65549 76.06 @@ -70,7 +70,7 @@ 65550 33.01 65550 57.63 65550 91.38 -65550 96.9 +65550 96.90 65551 NULL 65551 39.43 65551 73.93 @@ -99,7 +99,7 @@ 65559 29.55 65559 56.06 65559 73.94 -65559 83.5 +65559 83.50 65560 NULL 65560 16.86 65560 21.81 @@ -128,7 +128,7 @@ 65565 NULL 65565 81.72 65566 NULL -65566 7.8 +65566 7.80 65567 NULL 65568 NULL 65568 21.79 @@ -136,14 +136,14 @@ 65569 NULL 65570 NULL 65570 17.09 -65570 18.2 +65570 18.20 65570 25.57 65570 45.23 -65570 76.8 +65570 76.80 65571 NULL 65571 26.64 65571 40.68 -65571 82.5 +65571 82.50 65572 NULL 
65572 22.64 65572 43.49 @@ -156,9 +156,9 @@ 65574 31.28 65574 38.54 65575 NULL -65575 17 +65575 17.00 65575 32.85 -65575 83.4 +65575 83.40 65576 NULL 65576 2.04 65576 4.88 @@ -166,7 +166,7 @@ 65577 NULL 65578 NULL 65578 16.01 -65578 41.1 +65578 41.10 65578 51.36 65578 54.35 65578 58.78 @@ -188,7 +188,7 @@ 65582 NULL 65582 1.23 65582 9.35 -65582 96.6 +65582 96.60 65583 NULL 65583 28.07 65583 50.57 @@ -218,7 +218,7 @@ 65588 98.33 65589 NULL 65589 49.49 -65589 72.3 +65589 72.30 65589 74.83 65589 94.73 65590 NULL @@ -240,8 +240,8 @@ 65595 NULL 65595 8.76 65595 67.56 -65595 72.7 -65595 89.6 +65595 72.70 +65595 89.60 65595 90.24 65596 NULL 65596 12.72 @@ -252,7 +252,7 @@ 65597 37.41 65597 69.05 65598 NULL -65598 63.3 +65598 63.30 65599 NULL 65599 0.56 65599 4.93 @@ -283,7 +283,7 @@ 65605 NULL 65606 NULL 65606 7.51 -65606 24.8 +65606 24.80 65606 57.69 65606 67.94 65606 87.16 @@ -294,9 +294,9 @@ 65607 75.86 65607 91.52 65608 NULL -65608 48.9 +65608 48.90 65608 69.42 -65608 87.9 +65608 87.90 65609 NULL 65610 NULL 65610 7.59 @@ -309,7 +309,7 @@ 65611 64.89 65612 NULL 65612 16.05 -65612 25.1 +65612 25.10 65612 52.64 65613 NULL 65614 NULL @@ -317,17 +317,17 @@ 65614 94.47 65615 NULL 65615 10.79 -65615 39.4 +65615 39.40 65615 99.88 65616 NULL -65616 75.2 +65616 75.20 65617 NULL 65617 18.51 65617 47.45 -65617 64.9 +65617 64.90 65618 NULL 65618 10.06 -65618 16.6 +65618 16.60 65618 81.99 65618 88.38 65619 NULL @@ -348,20 +348,20 @@ 65622 28.37 65622 50.08 65622 74.31 -65622 88.6 -65622 93.7 +65622 88.60 +65622 93.70 65623 NULL 65623 30.83 65623 31.22 65623 39.74 65623 48.51 65623 95.58 -65623 97.2 +65623 97.20 65624 NULL 65624 58.02 65624 65.31 65624 70.08 -65624 93.3 +65624 93.30 65625 NULL 65625 20.61 65625 42.86 @@ -377,13 +377,13 @@ 65628 NULL 65628 14.83 65628 30.43 -65628 37.8 +65628 37.80 65628 74.31 65628 83.26 65629 NULL 65629 19.33 65629 58.81 -65629 72.9 +65629 72.90 65630 NULL 65630 72.13 65631 NULL @@ -412,7 +412,7 @@ 65637 48.88 65637 93.41 65638 NULL -65638 11.2 
+65638 11.20 65638 19.13 65639 NULL 65640 NULL @@ -477,20 +477,20 @@ 65654 26.73 65654 29.85 65654 37.74 -65654 37.8 +65654 37.80 65654 53.55 65654 88.23 65655 NULL 65655 77.41 65656 NULL -65656 14 +65656 14.00 65656 14.96 65656 53.27 65656 64.44 65656 82.67 65657 NULL 65657 11.93 -65657 26.4 +65657 26.40 65657 64.39 65657 65.01 65658 NULL @@ -506,8 +506,8 @@ 65659 NULL 65659 8.95 65659 46.57 -65659 53.8 -65659 94.3 +65659 53.80 +65659 94.30 65659 94.69 65659 95.71 65659 99.87 @@ -517,7 +517,7 @@ 65661 NULL 65661 5.24 65661 8.06 -65661 26.8 +65661 26.80 65661 68.98 65662 NULL 65662 59.92 @@ -531,10 +531,10 @@ 65663 94.16 65664 NULL 65664 11.46 -65664 27.6 +65664 27.60 65664 34.71 65664 38.42 -65664 45.4 +65664 45.40 65664 55.82 65664 97.64 65665 NULL @@ -543,13 +543,13 @@ 65666 83.95 65667 NULL 65667 13.96 -65667 63.9 +65667 63.90 65667 97.87 65668 NULL 65669 NULL 65669 1.76 65669 16.95 -65669 38.6 +65669 38.60 65669 54.25 65669 93.79 65670 NULL @@ -561,12 +561,12 @@ 65671 8.65 65671 52.05 65672 NULL -65672 52.6 -65672 58.1 +65672 52.60 +65672 58.10 65672 64.09 65672 75.27 65673 NULL -65673 0.9 +65673 0.90 65673 33.27 65673 43.81 65673 87.78 @@ -576,7 +576,7 @@ 65675 24.19 65675 35.33 65675 35.78 -65675 79.9 +65675 79.90 65675 83.09 65675 87.36 65676 NULL @@ -591,19 +591,19 @@ 65677 87.67 65678 NULL 65678 8.72 -65678 33.9 +65678 33.90 65679 NULL 65679 64.15 65680 NULL 65680 1.01 65680 34.08 65680 54.11 -65680 55.3 +65680 55.30 65680 65.88 65681 NULL 65681 35.45 65681 41.57 -65681 61.3 +65681 61.30 65681 71.17 65681 75.85 65682 NULL @@ -641,7 +641,7 @@ 65691 28.47 65691 56.02 65691 58.01 -65691 69.8 +65691 69.80 65691 76.98 65692 NULL 65692 54.76 @@ -655,19 +655,19 @@ 65694 NULL 65694 58.23 65694 82.24 -65694 88.5 +65694 88.50 65695 NULL 65695 57.33 65695 59.96 65695 77.09 65696 NULL 65696 17.35 -65696 40.3 +65696 40.30 65696 54.02 65697 NULL 65697 3.18 65697 50.01 -65697 67.9 +65697 67.90 65697 86.79 65697 90.16 65698 NULL @@ -685,9 +685,9 @@ 65701 1.81 65701 6.35 
65702 NULL -65702 37.6 +65702 37.60 65702 55.68 -65702 79.5 +65702 79.50 65703 NULL 65703 37.18 65703 40.81 @@ -708,23 +708,23 @@ 65706 55.94 65706 72.87 65707 NULL -65707 76.2 +65707 76.20 65708 NULL 65708 1.29 65709 NULL 65709 5.64 65709 49.79 65710 NULL -65710 86.7 +65710 86.70 65711 NULL 65711 8.66 65711 50.26 65711 71.89 65711 78.69 -65711 96.1 +65711 96.10 65712 NULL 65712 30.27 -65712 34.7 +65712 34.70 65712 49.69 65712 53.65 65713 NULL @@ -739,11 +739,11 @@ 65715 39.62 65715 54.79 65715 81.28 -65715 89.4 +65715 89.40 65716 NULL -65716 9 +65716 9.00 65716 10.07 -65716 33.4 +65716 33.40 65716 71.53 65716 85.93 65717 NULL @@ -758,10 +758,10 @@ 65719 NULL 65719 51.13 65719 66.85 -65719 82.1 +65719 82.10 65720 NULL 65720 2.72 -65720 18.8 +65720 18.80 65720 22.34 65720 62.04 65721 NULL @@ -775,7 +775,7 @@ 65722 1.76 65722 38.82 65723 NULL -65723 39.9 +65723 39.90 65724 NULL 65724 10.52 65724 36.05 @@ -784,7 +784,7 @@ 65724 85.52 65725 NULL 65726 NULL -65726 6 +65726 6.00 65726 60.46 65727 NULL 65727 19.81 @@ -796,7 +796,7 @@ 65729 NULL 65730 NULL 65730 1.35 -65730 30.6 +65730 30.60 65730 81.44 65731 NULL 65731 24.48 @@ -810,14 +810,14 @@ 65733 20.72 65733 88.46 65733 93.45 -65733 99.8 +65733 99.80 65734 NULL 65734 31.71 65735 NULL 65735 12.67 65735 61.16 65736 NULL -65736 28.9 +65736 28.90 65736 48.54 65736 86.51 65737 NULL @@ -828,10 +828,10 @@ 65738 NULL 65738 30.94 65738 82.32 -65738 95.1 +65738 95.10 65739 NULL 65739 74.77 -65739 92.4 +65739 92.40 65740 NULL 65740 7.49 65740 58.65 @@ -840,9 +840,9 @@ 65742 6.61 65742 43.84 65743 NULL -65743 26.6 +65743 26.60 65743 52.65 -65743 62 +65743 62.00 65744 NULL 65744 46.98 65745 NULL @@ -853,11 +853,11 @@ 65746 36.74 65746 93.21 65746 97.52 -65746 98.1 +65746 98.10 65747 NULL 65747 11.16 65747 15.07 -65747 21.8 +65747 21.80 65747 39.77 65747 52.77 65747 71.87 @@ -865,7 +865,7 @@ 65748 29.49 65749 NULL 65749 15.14 -65749 45 +65749 45.00 65749 65.49 65749 73.24 65750 NULL @@ -888,12 +888,12 @@ 65755 NULL 65755 11.23 
65755 22.44 -65755 64 +65755 64.00 65755 67.54 65755 76.75 65755 81.44 65755 90.08 -65755 96.8 +65755 96.80 65756 NULL 65756 1.45 65756 11.81 @@ -907,10 +907,10 @@ 65758 25.62 65758 56.56 65758 60.88 -65758 94.9 +65758 94.90 65759 NULL 65759 10.63 -65759 14.1 +65759 14.10 65759 47.54 65759 92.81 65760 NULL @@ -920,17 +920,17 @@ 65761 NULL 65762 NULL 65762 5.49 -65762 45.7 +65762 45.70 65762 77.96 -65762 87.5 +65762 87.50 65763 NULL 65763 0.72 -65763 43.8 +65763 43.80 65763 86.43 65763 87.99 65764 NULL 65764 31.41 -65764 57.1 +65764 57.10 65765 NULL 65765 88.52 65765 88.56 @@ -938,7 +938,7 @@ 65766 37.06 65766 66.34 65766 86.53 -65766 98.9 +65766 98.90 65767 NULL 65767 90.88 65767 95.57 @@ -950,14 +950,14 @@ 65769 70.52 65769 91.49 65770 NULL -65770 51.9 +65770 51.90 65771 NULL 65771 6.15 -65771 7.5 +65771 7.50 65772 NULL 65773 NULL 65773 3.81 -65773 18.2 +65773 18.20 65773 30.49 65773 47.09 65773 53.09 @@ -966,7 +966,7 @@ 65774 NULL 65774 45.74 65774 45.97 -65774 48.8 +65774 48.80 65774 56.84 65774 94.77 65775 NULL @@ -975,7 +975,7 @@ 65775 66.68 65775 98.43 65776 NULL -65776 18.7 +65776 18.70 65776 28.47 65776 49.73 65776 98.87 @@ -993,7 +993,7 @@ 65778 95.69 65779 NULL 65779 11.87 -65779 28.2 +65779 28.20 65779 39.48 65779 45.61 65779 64.41 @@ -1008,15 +1008,15 @@ 65782 30.24 65782 34.31 65782 76.14 -65782 81.9 +65782 81.90 65783 NULL 65783 46.34 65783 51.08 65783 52.43 65783 62.58 -65783 77.4 +65783 77.40 65784 NULL -65784 15.7 +65784 15.70 65784 31.35 65784 68.18 65784 93.95 @@ -1032,7 +1032,7 @@ 65787 31.19 65787 64.88 65788 NULL -65788 16.1 +65788 16.10 65788 21.81 65788 25.77 65789 NULL @@ -1041,7 +1041,7 @@ 65789 52.49 65789 83.18 65789 92.74 -65789 96.9 +65789 96.90 65790 NULL 65790 46.91 65790 84.87 diff --git a/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 2-0-81bb7f49a55385878637c8aac4d08e5 b/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 2-0-81bb7f49a55385878637c8aac4d08e5 index 9091a9156134c..207dababa0a50 
100644 --- a/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 2-0-81bb7f49a55385878637c8aac4d08e5 +++ b/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 2-0-81bb7f49a55385878637c8aac4d08e5 @@ -18,12 +18,12 @@ 2013-03-01 09:11:58.703073 10.07 1 2013-03-01 09:11:58.703073 10.07 1 2013-03-01 09:11:58.703073 10.07 1 -2013-03-01 09:11:58.703074 37.8 1 -2013-03-01 09:11:58.703074 37.8 1 -2013-03-01 09:11:58.703074 37.8 1 -2013-03-01 09:11:58.703074 37.8 1 -2013-03-01 09:11:58.703074 37.8 1 -2013-03-01 09:11:58.703074 37.8 1 +2013-03-01 09:11:58.703074 37.80 1 +2013-03-01 09:11:58.703074 37.80 1 +2013-03-01 09:11:58.703074 37.80 1 +2013-03-01 09:11:58.703074 37.80 1 +2013-03-01 09:11:58.703074 37.80 1 +2013-03-01 09:11:58.703074 37.80 1 2013-03-01 09:11:58.703075 5.64 1 2013-03-01 09:11:58.703075 5.64 1 2013-03-01 09:11:58.703075 5.64 1 @@ -59,11 +59,11 @@ 2013-03-01 09:11:58.70308 1.76 1 2013-03-01 09:11:58.70308 1.76 1 2013-03-01 09:11:58.70308 1.76 1 -2013-03-01 09:11:58.703081 67.9 1 -2013-03-01 09:11:58.703081 67.9 1 -2013-03-01 09:11:58.703081 67.9 1 -2013-03-01 09:11:58.703081 67.9 1 -2013-03-01 09:11:58.703081 67.9 1 +2013-03-01 09:11:58.703081 67.90 1 +2013-03-01 09:11:58.703081 67.90 1 +2013-03-01 09:11:58.703081 67.90 1 +2013-03-01 09:11:58.703081 67.90 1 +2013-03-01 09:11:58.703081 67.90 1 2013-03-01 09:11:58.703082 37.25 1 2013-03-01 09:11:58.703082 37.25 1 2013-03-01 09:11:58.703082 37.25 1 @@ -148,9 +148,9 @@ 2013-03-01 09:11:58.703096 11.64 1 2013-03-01 09:11:58.703096 11.64 1 2013-03-01 09:11:58.703096 11.64 1 -2013-03-01 09:11:58.703097 0.9 1 -2013-03-01 09:11:58.703097 0.9 1 -2013-03-01 09:11:58.703097 0.9 1 +2013-03-01 09:11:58.703097 0.90 1 +2013-03-01 09:11:58.703097 0.90 1 +2013-03-01 09:11:58.703097 0.90 1 2013-03-01 09:11:58.703098 1.35 1 2013-03-01 09:11:58.703098 1.35 1 2013-03-01 09:11:58.703098 1.35 1 @@ -210,27 +210,27 @@ 2013-03-01 09:11:58.70311 8.16 1 2013-03-01 09:11:58.70311 8.16 1 2013-03-01 
09:11:58.70311 8.16 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 -2013-03-01 09:11:58.703111 18.8 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 +2013-03-01 09:11:58.703111 18.80 1 2013-03-01 09:11:58.703112 13.29 1 2013-03-01 09:11:58.703112 13.29 1 2013-03-01 09:11:58.703112 13.29 1 2013-03-01 09:11:58.703112 13.29 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 -2013-03-01 09:11:58.703113 21.8 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 +2013-03-01 09:11:58.703113 21.80 1 2013-03-01 09:11:58.703114 73.94 1 2013-03-01 09:11:58.703114 73.94 1 2013-03-01 09:11:58.703114 73.94 1 @@ -256,14 +256,14 @@ 2013-03-01 09:11:58.703118 8.69 1 2013-03-01 09:11:58.703119 58.02 1 2013-03-01 09:11:58.703119 58.02 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.703121 96.9 1 -2013-03-01 09:11:58.703121 96.9 1 -2013-03-01 09:11:58.703121 96.9 1 -2013-03-01 09:11:58.703121 96.9 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 
09:11:58.70312 52.60 1 +2013-03-01 09:11:58.703121 96.90 1 +2013-03-01 09:11:58.703121 96.90 1 +2013-03-01 09:11:58.703121 96.90 1 +2013-03-01 09:11:58.703121 96.90 1 2013-03-01 09:11:58.703122 53.56 1 2013-03-01 09:11:58.703122 53.56 1 2013-03-01 09:11:58.703122 53.56 1 @@ -310,11 +310,11 @@ 2013-03-01 09:11:58.703133 27.34 1 2013-03-01 09:11:58.703133 27.34 1 2013-03-01 09:11:58.703133 27.34 1 -2013-03-01 09:11:58.703134 98.9 1 -2013-03-01 09:11:58.703134 98.9 1 -2013-03-01 09:11:58.703134 98.9 1 -2013-03-01 09:11:58.703134 98.9 1 -2013-03-01 09:11:58.703134 98.9 1 +2013-03-01 09:11:58.703134 98.90 1 +2013-03-01 09:11:58.703134 98.90 1 +2013-03-01 09:11:58.703134 98.90 1 +2013-03-01 09:11:58.703134 98.90 1 +2013-03-01 09:11:58.703134 98.90 1 2013-03-01 09:11:58.703135 29.14 1 2013-03-01 09:11:58.703135 29.14 1 2013-03-01 09:11:58.703135 29.14 1 @@ -467,12 +467,12 @@ 2013-03-01 09:11:58.703162 3.51 1 2013-03-01 09:11:58.703162 3.51 1 2013-03-01 09:11:58.703162 3.51 1 -2013-03-01 09:11:58.703163 15.7 1 -2013-03-01 09:11:58.703163 15.7 1 -2013-03-01 09:11:58.703163 15.7 1 -2013-03-01 09:11:58.703163 15.7 1 -2013-03-01 09:11:58.703163 15.7 1 -2013-03-01 09:11:58.703163 15.7 1 +2013-03-01 09:11:58.703163 15.70 1 +2013-03-01 09:11:58.703163 15.70 1 +2013-03-01 09:11:58.703163 15.70 1 +2013-03-01 09:11:58.703163 15.70 1 +2013-03-01 09:11:58.703163 15.70 1 +2013-03-01 09:11:58.703163 15.70 1 2013-03-01 09:11:58.703164 30.27 1 2013-03-01 09:11:58.703164 30.27 1 2013-03-01 09:11:58.703164 30.27 1 @@ -482,9 +482,9 @@ 2013-03-01 09:11:58.703165 8.38 1 2013-03-01 09:11:58.703165 8.38 1 2013-03-01 09:11:58.703165 8.38 1 -2013-03-01 09:11:58.703166 16.6 1 -2013-03-01 09:11:58.703166 16.6 1 -2013-03-01 09:11:58.703166 16.6 1 +2013-03-01 09:11:58.703166 16.60 1 +2013-03-01 09:11:58.703166 16.60 1 +2013-03-01 09:11:58.703166 16.60 1 2013-03-01 09:11:58.703167 17.66 1 2013-03-01 09:11:58.703167 17.66 1 2013-03-01 09:11:58.703167 17.66 1 @@ -537,11 +537,11 @@ 2013-03-01 
09:11:58.703175 33.37 1 2013-03-01 09:11:58.703175 33.37 1 2013-03-01 09:11:58.703175 33.37 1 -2013-03-01 09:11:58.703176 28.2 1 -2013-03-01 09:11:58.703176 28.2 1 -2013-03-01 09:11:58.703176 28.2 1 -2013-03-01 09:11:58.703176 28.2 1 -2013-03-01 09:11:58.703176 28.2 1 +2013-03-01 09:11:58.703176 28.20 1 +2013-03-01 09:11:58.703176 28.20 1 +2013-03-01 09:11:58.703176 28.20 1 +2013-03-01 09:11:58.703176 28.20 1 +2013-03-01 09:11:58.703176 28.20 1 2013-03-01 09:11:58.703177 11.43 1 2013-03-01 09:11:58.703177 11.43 1 2013-03-01 09:11:58.703177 11.43 1 @@ -567,13 +567,13 @@ 2013-03-01 09:11:58.70318 10.28 1 2013-03-01 09:11:58.70318 10.28 1 2013-03-01 09:11:58.70318 10.28 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 2013-03-01 09:11:58.703182 1.23 1 2013-03-01 09:11:58.703182 1.23 1 2013-03-01 09:11:58.703182 1.23 1 @@ -647,10 +647,10 @@ 2013-03-01 09:11:58.703197 16.01 1 2013-03-01 09:11:58.703197 16.01 1 2013-03-01 09:11:58.703197 16.01 1 -2013-03-01 09:11:58.703198 30.6 1 -2013-03-01 09:11:58.703198 30.6 1 -2013-03-01 09:11:58.703198 30.6 1 -2013-03-01 09:11:58.703198 30.6 1 +2013-03-01 09:11:58.703198 30.60 1 +2013-03-01 09:11:58.703198 30.60 1 +2013-03-01 09:11:58.703198 30.60 1 +2013-03-01 09:11:58.703198 30.60 1 2013-03-01 09:11:58.703199 45.69 1 2013-03-01 09:11:58.703199 45.69 1 2013-03-01 09:11:58.703199 45.69 1 @@ -669,11 +669,11 @@ 2013-03-01 09:11:58.703203 11.63 1 2013-03-01 09:11:58.703203 11.63 1 2013-03-01 09:11:58.703203 11.63 1 -2013-03-01 09:11:58.703205 35.8 1 -2013-03-01 09:11:58.703205 35.8 1 
-2013-03-01 09:11:58.703205 35.8 1 -2013-03-01 09:11:58.703205 35.8 1 -2013-03-01 09:11:58.703205 35.8 1 +2013-03-01 09:11:58.703205 35.80 1 +2013-03-01 09:11:58.703205 35.80 1 +2013-03-01 09:11:58.703205 35.80 1 +2013-03-01 09:11:58.703205 35.80 1 +2013-03-01 09:11:58.703205 35.80 1 2013-03-01 09:11:58.703206 6.61 1 2013-03-01 09:11:58.703206 6.61 1 2013-03-01 09:11:58.703206 6.61 1 @@ -824,9 +824,9 @@ 2013-03-01 09:11:58.703233 40.81 1 2013-03-01 09:11:58.703233 40.81 1 2013-03-01 09:11:58.703233 40.81 1 -2013-03-01 09:11:58.703234 44.1 1 -2013-03-01 09:11:58.703234 44.1 1 -2013-03-01 09:11:58.703234 44.1 1 +2013-03-01 09:11:58.703234 44.10 1 +2013-03-01 09:11:58.703234 44.10 1 +2013-03-01 09:11:58.703234 44.10 1 2013-03-01 09:11:58.703235 6.35 1 2013-03-01 09:11:58.703235 6.35 1 2013-03-01 09:11:58.703235 6.35 1 @@ -834,11 +834,11 @@ 2013-03-01 09:11:58.703235 6.35 1 2013-03-01 09:11:58.703235 6.35 1 2013-03-01 09:11:58.703235 6.35 1 -2013-03-01 09:11:58.703236 37.8 1 -2013-03-01 09:11:58.703236 37.8 1 -2013-03-01 09:11:58.703236 37.8 1 -2013-03-01 09:11:58.703236 37.8 1 -2013-03-01 09:11:58.703236 37.8 1 +2013-03-01 09:11:58.703236 37.80 1 +2013-03-01 09:11:58.703236 37.80 1 +2013-03-01 09:11:58.703236 37.80 1 +2013-03-01 09:11:58.703236 37.80 1 +2013-03-01 09:11:58.703236 37.80 1 2013-03-01 09:11:58.703237 0.24 1 2013-03-01 09:11:58.703237 0.24 1 2013-03-01 09:11:58.703237 0.24 1 @@ -847,17 +847,17 @@ 2013-03-01 09:11:58.703237 0.24 1 2013-03-01 09:11:58.703237 0.24 1 2013-03-01 09:11:58.703237 0.24 1 -2013-03-01 09:11:58.703238 6 1 -2013-03-01 09:11:58.703238 6 1 -2013-03-01 09:11:58.703238 6 1 -2013-03-01 09:11:58.703238 6 1 -2013-03-01 09:11:58.703239 24.8 1 -2013-03-01 09:11:58.703239 24.8 1 -2013-03-01 09:11:58.703239 24.8 1 -2013-03-01 09:11:58.703239 24.8 1 -2013-03-01 09:11:58.703239 24.8 1 -2013-03-01 09:11:58.70324 5.1 1 -2013-03-01 09:11:58.70324 5.1 1 +2013-03-01 09:11:58.703238 6.00 1 +2013-03-01 09:11:58.703238 6.00 1 +2013-03-01 09:11:58.703238 
6.00 1 +2013-03-01 09:11:58.703238 6.00 1 +2013-03-01 09:11:58.703239 24.80 1 +2013-03-01 09:11:58.703239 24.80 1 +2013-03-01 09:11:58.703239 24.80 1 +2013-03-01 09:11:58.703239 24.80 1 +2013-03-01 09:11:58.703239 24.80 1 +2013-03-01 09:11:58.70324 5.10 1 +2013-03-01 09:11:58.70324 5.10 1 2013-03-01 09:11:58.703241 19.33 1 2013-03-01 09:11:58.703241 19.33 1 2013-03-01 09:11:58.703241 19.33 1 @@ -973,16 +973,16 @@ 2013-03-01 09:11:58.703262 1.81 1 2013-03-01 09:11:58.703262 1.81 1 2013-03-01 09:11:58.703262 1.81 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 2013-03-01 09:11:58.703264 52.49 1 2013-03-01 09:11:58.703264 52.49 1 2013-03-01 09:11:58.703264 52.49 1 @@ -1068,12 +1068,12 @@ 2013-03-01 09:11:58.703281 19.95 1 2013-03-01 09:11:58.703281 19.95 1 2013-03-01 09:11:58.703281 19.95 1 -2013-03-01 09:11:58.703282 7.5 1 -2013-03-01 09:11:58.703282 7.5 1 -2013-03-01 09:11:58.703282 7.5 1 -2013-03-01 09:11:58.703282 7.5 1 -2013-03-01 09:11:58.703282 7.5 1 -2013-03-01 09:11:58.703282 7.5 1 +2013-03-01 09:11:58.703282 7.50 1 +2013-03-01 09:11:58.703282 7.50 1 +2013-03-01 09:11:58.703282 7.50 1 +2013-03-01 09:11:58.703282 7.50 1 +2013-03-01 09:11:58.703282 7.50 1 +2013-03-01 09:11:58.703282 7.50 1 2013-03-01 09:11:58.703283 17.62 1 2013-03-01 09:11:58.703283 17.62 1 2013-03-01 09:11:58.703283 17.62 1 @@ -1153,12 
+1153,12 @@ 2013-03-01 09:11:58.703297 25.67 1 2013-03-01 09:11:58.703297 25.67 1 2013-03-01 09:11:58.703297 25.67 1 -2013-03-01 09:11:58.703298 8.8 1 -2013-03-01 09:11:58.703298 8.8 1 -2013-03-01 09:11:58.703298 8.8 1 -2013-03-01 09:11:58.703298 8.8 1 -2013-03-01 09:11:58.703299 9 1 -2013-03-01 09:11:58.703299 9 1 +2013-03-01 09:11:58.703298 8.80 1 +2013-03-01 09:11:58.703298 8.80 1 +2013-03-01 09:11:58.703298 8.80 1 +2013-03-01 09:11:58.703298 8.80 1 +2013-03-01 09:11:58.703299 9.00 1 +2013-03-01 09:11:58.703299 9.00 1 2013-03-01 09:11:58.7033 7.51 1 2013-03-01 09:11:58.7033 7.51 1 2013-03-01 09:11:58.7033 7.51 1 @@ -1217,12 +1217,12 @@ 2013-03-01 09:11:58.703311 7.38 1 2013-03-01 09:11:58.703311 7.38 1 2013-03-01 09:11:58.703311 7.38 1 -2013-03-01 09:11:58.703312 18.2 1 -2013-03-01 09:11:58.703312 18.2 1 -2013-03-01 09:11:58.703312 18.2 1 -2013-03-01 09:11:58.703312 18.2 1 -2013-03-01 09:11:58.703312 18.2 1 -2013-03-01 09:11:58.703312 18.2 1 +2013-03-01 09:11:58.703312 18.20 1 +2013-03-01 09:11:58.703312 18.20 1 +2013-03-01 09:11:58.703312 18.20 1 +2013-03-01 09:11:58.703312 18.20 1 +2013-03-01 09:11:58.703312 18.20 1 +2013-03-01 09:11:58.703312 18.20 1 2013-03-01 09:11:58.703313 9.35 1 2013-03-01 09:11:58.703313 9.35 1 2013-03-01 09:11:58.703313 9.35 1 diff --git a/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 4-0-12cc78f3953c3e6b5411ddc729541bf0 b/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 4-0-12cc78f3953c3e6b5411ddc729541bf0 index d02ca48857b5f..a1628c7e1c0c5 100644 --- a/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 4-0-12cc78f3953c3e6b5411ddc729541bf0 +++ b/sql/hive/src/test/resources/golden/windowing_rank.q (deterministic) 4-0-12cc78f3953c3e6b5411ddc729541bf0 @@ -46,9 +46,9 @@ 2013-03-01 09:11:58.703092 54.02 1 2013-03-01 09:11:58.703092 54.02 1 2013-03-01 09:11:58.703096 87.84 1 -2013-03-01 09:11:58.703097 0.9 1 -2013-03-01 09:11:58.703097 0.9 1 -2013-03-01 09:11:58.703097 0.9 1 
+2013-03-01 09:11:58.703097 0.90 1 +2013-03-01 09:11:58.703097 0.90 1 +2013-03-01 09:11:58.703097 0.90 1 2013-03-01 09:11:58.703098 21.29 1 2013-03-01 09:11:58.703098 21.29 1 2013-03-01 09:11:58.703098 21.29 1 @@ -88,10 +88,10 @@ 2013-03-01 09:11:58.703113 58.65 1 2013-03-01 09:11:58.703118 8.69 1 2013-03-01 09:11:58.703118 8.69 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 -2013-03-01 09:11:58.70312 52.6 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 09:11:58.70312 52.60 1 +2013-03-01 09:11:58.70312 52.60 1 2013-03-01 09:11:58.703125 78.52 1 2013-03-01 09:11:58.703125 78.52 1 2013-03-01 09:11:58.703125 78.52 1 @@ -119,11 +119,11 @@ 2013-03-01 09:11:58.703136 27.89 1 2013-03-01 09:11:58.703136 27.89 1 2013-03-01 09:11:58.703136 27.89 1 -2013-03-01 09:11:58.703138 86.7 1 -2013-03-01 09:11:58.703138 86.7 1 -2013-03-01 09:11:58.703138 86.7 1 -2013-03-01 09:11:58.703138 86.7 1 -2013-03-01 09:11:58.703138 86.7 1 +2013-03-01 09:11:58.703138 86.70 1 +2013-03-01 09:11:58.703138 86.70 1 +2013-03-01 09:11:58.703138 86.70 1 +2013-03-01 09:11:58.703138 86.70 1 +2013-03-01 09:11:58.703138 86.70 1 2013-03-01 09:11:58.703139 43.53 1 2013-03-01 09:11:58.703139 43.53 1 2013-03-01 09:11:58.703139 43.53 1 @@ -167,13 +167,13 @@ 2013-03-01 09:11:58.703179 60.94 1 2013-03-01 09:11:58.703179 60.94 1 2013-03-01 09:11:58.703179 60.94 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 -2013-03-01 09:11:58.703181 26.6 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 +2013-03-01 09:11:58.703181 26.60 1 2013-03-01 09:11:58.703184 73.93 1 2013-03-01 09:11:58.703184 73.93 1 
2013-03-01 09:11:58.703184 73.93 1 @@ -202,12 +202,12 @@ 2013-03-01 09:11:58.703189 37.74 1 2013-03-01 09:11:58.703189 37.74 1 2013-03-01 09:11:58.703189 37.74 1 -2013-03-01 09:11:58.703195 82.5 1 -2013-03-01 09:11:58.703195 82.5 1 -2013-03-01 09:11:58.703195 82.5 1 -2013-03-01 09:11:58.703195 82.5 1 -2013-03-01 09:11:58.703195 82.5 1 -2013-03-01 09:11:58.703195 82.5 1 +2013-03-01 09:11:58.703195 82.50 1 +2013-03-01 09:11:58.703195 82.50 1 +2013-03-01 09:11:58.703195 82.50 1 +2013-03-01 09:11:58.703195 82.50 1 +2013-03-01 09:11:58.703195 82.50 1 +2013-03-01 09:11:58.703195 82.50 1 2013-03-01 09:11:58.703198 97.18 1 2013-03-01 09:11:58.703198 97.18 1 2013-03-01 09:11:58.703198 97.18 1 @@ -233,10 +233,10 @@ 2013-03-01 09:11:58.70321 37.12 1 2013-03-01 09:11:58.70321 37.12 1 2013-03-01 09:11:58.70321 37.12 1 -2013-03-01 09:11:58.703213 48.8 1 -2013-03-01 09:11:58.703213 48.8 1 -2013-03-01 09:11:58.703213 48.8 1 -2013-03-01 09:11:58.703213 48.8 1 +2013-03-01 09:11:58.703213 48.80 1 +2013-03-01 09:11:58.703213 48.80 1 +2013-03-01 09:11:58.703213 48.80 1 +2013-03-01 09:11:58.703213 48.80 1 2013-03-01 09:11:58.703219 32.73 1 2013-03-01 09:11:58.703219 32.73 1 2013-03-01 09:11:58.703219 32.73 1 @@ -253,30 +253,30 @@ 2013-03-01 09:11:58.703221 26.64 1 2013-03-01 09:11:58.703221 26.64 1 2013-03-01 09:11:58.703221 26.64 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 -2013-03-01 09:11:58.703223 57.1 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 +2013-03-01 09:11:58.703223 57.10 1 2013-03-01 09:11:58.703224 42.93 1 2013-03-01 09:11:58.703224 42.93 1 2013-03-01 09:11:58.703224 42.93 1 2013-03-01 09:11:58.703224 42.93 1 -2013-03-01 09:11:58.703226 
68.3 1 -2013-03-01 09:11:58.703226 68.3 1 -2013-03-01 09:11:58.703226 68.3 1 -2013-03-01 09:11:58.703226 68.3 1 -2013-03-01 09:11:58.703226 68.3 1 -2013-03-01 09:11:58.703226 68.3 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 -2013-03-01 09:11:58.703231 18.7 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703226 68.30 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 +2013-03-01 09:11:58.703231 18.70 1 2013-03-01 09:11:58.703233 40.81 1 2013-03-01 09:11:58.703233 40.81 1 2013-03-01 09:11:58.703233 40.81 1 @@ -295,24 +295,24 @@ 2013-03-01 09:11:58.703244 25.67 1 2013-03-01 09:11:58.703244 25.67 1 2013-03-01 09:11:58.703244 25.67 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 -2013-03-01 09:11:58.703245 32.3 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 +2013-03-01 09:11:58.703245 32.30 1 2013-03-01 09:11:58.703246 72.87 1 2013-03-01 09:11:58.703246 72.87 1 2013-03-01 09:11:58.703248 81.28 1 2013-03-01 09:11:58.703248 81.28 1 2013-03-01 09:11:58.703248 81.28 1 -2013-03-01 09:11:58.703249 93.3 1 -2013-03-01 
09:11:58.703249 93.3 1 -2013-03-01 09:11:58.703249 93.3 1 -2013-03-01 09:11:58.703249 93.3 1 +2013-03-01 09:11:58.703249 93.30 1 +2013-03-01 09:11:58.703249 93.30 1 +2013-03-01 09:11:58.703249 93.30 1 +2013-03-01 09:11:58.703249 93.30 1 2013-03-01 09:11:58.70325 93.79 1 2013-03-01 09:11:58.70325 93.79 1 2013-03-01 09:11:58.70325 93.79 1 @@ -324,12 +324,12 @@ 2013-03-01 09:11:58.703254 0.32 1 2013-03-01 09:11:58.703254 0.32 1 2013-03-01 09:11:58.703254 0.32 1 -2013-03-01 09:11:58.703256 43.8 1 -2013-03-01 09:11:58.703256 43.8 1 -2013-03-01 09:11:58.703256 43.8 1 -2013-03-01 09:11:58.703256 43.8 1 -2013-03-01 09:11:58.703256 43.8 1 -2013-03-01 09:11:58.703256 43.8 1 +2013-03-01 09:11:58.703256 43.80 1 +2013-03-01 09:11:58.703256 43.80 1 +2013-03-01 09:11:58.703256 43.80 1 +2013-03-01 09:11:58.703256 43.80 1 +2013-03-01 09:11:58.703256 43.80 1 +2013-03-01 09:11:58.703256 43.80 1 2013-03-01 09:11:58.703258 21.21 1 2013-03-01 09:11:58.703258 21.21 1 2013-03-01 09:11:58.703258 21.21 1 @@ -342,16 +342,16 @@ 2013-03-01 09:11:58.703262 78.56 1 2013-03-01 09:11:58.703262 78.56 1 2013-03-01 09:11:58.703262 78.56 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 -2013-03-01 09:11:58.703263 14.4 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 +2013-03-01 09:11:58.703263 14.40 1 2013-03-01 09:11:58.703264 52.49 1 2013-03-01 09:11:58.703264 52.49 1 2013-03-01 09:11:58.703264 52.49 1 @@ -438,10 +438,10 @@ 2013-03-01 09:11:58.703299 23.19 1 
2013-03-01 09:11:58.703299 23.19 1 2013-03-01 09:11:58.703299 23.19 1 -2013-03-01 09:11:58.703309 89.4 1 -2013-03-01 09:11:58.703309 89.4 1 -2013-03-01 09:11:58.703309 89.4 1 -2013-03-01 09:11:58.703309 89.4 1 +2013-03-01 09:11:58.703309 89.40 1 +2013-03-01 09:11:58.703309 89.40 1 +2013-03-01 09:11:58.703309 89.40 1 +2013-03-01 09:11:58.703309 89.40 1 2013-03-01 09:11:58.70331 69.26 1 2013-03-01 09:11:58.70331 69.26 1 2013-03-01 09:11:58.70331 69.26 1 diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/infer_bucket_sort_list_bucket.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/infer_bucket_sort_list_bucket.q index 7d3c0dc7d5859..c0a7a368a0768 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/infer_bucket_sort_list_bucket.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/infer_bucket_sort_list_bucket.q @@ -20,7 +20,7 @@ SELECT key, count(*) FROM src GROUP BY key; DESC FORMATTED list_bucketing_table PARTITION (part = '1'); --- create a table skewed on a key which doesnt exist in the data +-- create a table skewed on a key which doesn't exist in the data CREATE TABLE list_bucketing_table2 (key STRING, value STRING) PARTITIONED BY (part STRING) SKEWED BY (key) ON ("abc") diff --git a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/load_fs2.q b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/load_fs2.q index a75758a0728d5..f92cf24dede8b 100644 --- a/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/load_fs2.q +++ b/sql/hive/src/test/resources/ql/src/test/queries/clientpositive/load_fs2.q @@ -1,4 +1,4 @@ --- HIVE-3300 [jira] LOAD DATA INPATH fails if a hdfs file with same name is added to table +-- HIVE-3300 [jira] LOAD DATA INPATH fails if an hdfs file with same name is added to table -- 'loader' table is used only for uploading kv1.txt to HDFS (!hdfs -put is not working on minMRDriver) create table result (key string, value string); 
diff --git a/sql/hive/src/test/resources/test_script.py b/sql/hive/src/test/resources/test_script.py new file mode 100644 index 0000000000000..82ef7b38f0c1b --- /dev/null +++ b/sql/hive/src/test/resources/test_script.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import sys +for line in sys.stdin: + (a, b, c, d, e) = line.split('\t') + sys.stdout.write('\t'.join([a, b, c, d, e])) + sys.stdout.flush() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala index 3226e3a5f318a..3f806ad24ca10 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/execution/benchmark/ObjectHashAggregateExecBenchmark.scala @@ -21,11 +21,10 @@ import scala.concurrent.duration._ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFPercentileApprox -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Column, SparkSession} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile -import org.apache.spark.sql.catalyst.plans.SQLHelper import org.apache.spark.sql.hive.execution.TestingTypedCount import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf @@ -44,9 +43,10 @@ import org.apache.spark.sql.types.LongType * Results will be written to "benchmarks/ObjectHashAggregateExecBenchmark-results.txt". 
* }}} */ -object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { +object ObjectHashAggregateExecBenchmark extends SqlBasedBenchmark { + + override def getSparkSession: SparkSession = TestHive.sparkSession - private val spark: SparkSession = TestHive.sparkSession private val sql = spark.sql _ import spark.implicits._ @@ -70,13 +70,13 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("hive udaf w/o group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - sql("SELECT hive_percentile_approx(id, 0.5) FROM t").collect() + sql("SELECT hive_percentile_approx(id, 0.5) FROM t").noop() } } benchmark.addCase("spark af w/o group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { - sql("SELECT percentile_approx(id, 0.5) FROM t").collect() + sql("SELECT percentile_approx(id, 0.5) FROM t").noop() } } @@ -84,14 +84,14 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { sql( s"SELECT hive_percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)" - ).collect() + ).noop() } } benchmark.addCase("spark af w/ group by w/o fallback") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { sql(s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)") - .collect() + .noop() } } @@ -100,7 +100,7 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { SQLConf.USE_OBJECT_HASH_AGG.key -> "true", SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2") { sql(s"SELECT percentile_approx(id, 0.5) FROM t GROUP BY CAST(id / ${N / 4} AS BIGINT)") - .collect() + .noop() } } @@ -125,13 +125,13 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("sort agg w/ group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - df.groupBy($"id" < (N / 
2)).agg(typed_count($"id")).collect() + df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).noop() } } benchmark.addCase("object agg w/ group by w/o fallback") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { - df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).collect() + df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).noop() } } @@ -139,19 +139,19 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { withSQLConf( SQLConf.USE_OBJECT_HASH_AGG.key -> "true", SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key -> "2") { - df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).collect() + df.groupBy($"id" < (N / 2)).agg(typed_count($"id")).noop() } } benchmark.addCase("sort agg w/o group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - df.select(typed_count($"id")).collect() + df.select(typed_count($"id")).noop() } } benchmark.addCase("object agg w/o group by w/o fallback") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { - df.select(typed_count($"id")).collect() + df.select(typed_count($"id")).noop() } } @@ -173,13 +173,13 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("sort agg w/ group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect() + df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).noop() } } benchmark.addCase("object agg w/ group by w/o fallback") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { - df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect() + df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).noop() } } @@ -187,19 +187,19 @@ object ObjectHashAggregateExecBenchmark extends BenchmarkBase with SQLHelper { withSQLConf( SQLConf.USE_OBJECT_HASH_AGG.key -> "true", SQLConf.OBJECT_AGG_SORT_BASED_FALLBACK_THRESHOLD.key 
-> "2") { - df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).collect() + df.groupBy($"id" / (N / 4) cast LongType).agg(percentile_approx($"id", 0.5)).noop() } } benchmark.addCase("sort agg w/o group by") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "false") { - df.select(percentile_approx($"id", 0.5)).collect() + df.select(percentile_approx($"id", 0.5)).noop() } } benchmark.addCase("object agg w/o group by w/o fallback") { _ => withSQLConf(SQLConf.USE_OBJECT_HASH_AGG.key -> "true") { - df.select(percentile_approx($"id", 0.5)).collect() + df.select(percentile_approx($"id", 0.5)).noop() } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala index fcf0b4591ff84..7b3fb68174234 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/CachedTableSuite.scala @@ -216,7 +216,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // Drop the table and create it again. sql("DROP TABLE refreshTable") - sparkSession.catalog.createExternalTable("refreshTable", tempPath.toString, "parquet") + sparkSession.catalog.createTable("refreshTable", tempPath.toString, "parquet") // It is not cached. assert(!isCached("refreshTable"), "refreshTable should not be cached.") // Refresh the table. 
REFRESH TABLE command should not make a uncached @@ -237,7 +237,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto tempPath.delete() table("src").write.mode(SaveMode.Overwrite).parquet(tempPath.toString) sql("DROP TABLE IF EXISTS refreshTable") - sparkSession.catalog.createExternalTable("refreshTable", tempPath.toString, "parquet") + sparkSession.catalog.createTable("refreshTable", tempPath.toString, "parquet") checkAnswer( table("refreshTable"), table("src").collect()) @@ -256,7 +256,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto // Drop the table and create it again. sql("DROP TABLE refreshTable") - sparkSession.catalog.createExternalTable("refreshTable", tempPath.toString, "parquet") + sparkSession.catalog.createTable("refreshTable", tempPath.toString, "parquet") // It is not cached. assert(!isCached("refreshTable"), "refreshTable should not be cached.") // Refresh the table. REFRESH command should not make a uncached @@ -300,7 +300,7 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with TestHiveSingleto val e = intercept[ParseException] { sql(s"CACHE TABLE $db.cachedTable AS SELECT 1") }.getMessage - assert(e.contains("It is not allowed to add database prefix ") && + assert(e.contains("It is not allowed to add catalog/namespace prefix ") && e.contains("to the table name in CACHE TABLE AS SELECT")) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala deleted file mode 100644 index a80db765846e9..0000000000000 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveContextCompatibilitySuite.scala +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.hive - -import org.scalatest.BeforeAndAfterEach - -import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} - - -class HiveContextCompatibilitySuite extends SparkFunSuite with BeforeAndAfterEach { - - override protected val enableAutoThreadAudit = false - private var sc: SparkContext = null - private var hc: HiveContext = null - - override def beforeAll(): Unit = { - super.beforeAll() - sc = SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("test")) - HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true).foreach { case (k, v) => - sc.hadoopConfiguration.set(k, v) - } - hc = new HiveContext(sc) - } - - override def afterEach(): Unit = { - try { - hc.sharedState.cacheManager.clearCache() - hc.sessionState.catalog.reset() - } finally { - super.afterEach() - } - } - - override def afterAll(): Unit = { - try { - sc = null - hc = null - } finally { - super.afterAll() - } - } - - test("basic operations") { - val _hc = hc - import _hc.implicits._ - val df1 = (1 to 20).map { i => (i, i) }.toDF("a", "x") - val df2 = (1 to 100).map { i => (i, i % 10, i % 2 == 0) }.toDF("a", "b", "c") - .select($"a", $"b") - .filter($"a" > 10 && $"b" > 6 && $"c") - val df3 = df1.join(df2, "a") - val res = df3.collect() - val expected = 
Seq((18, 18, 8)).toDF("a", "x", "b").collect() - assert(res.toSeq == expected.toSeq) - df3.createOrReplaceTempView("mai_table") - val df4 = hc.table("mai_table") - val res2 = df4.collect() - assert(res2.toSeq == expected.toSeq) - } - - test("basic DDLs") { - val _hc = hc - import _hc.implicits._ - val databases = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) - assert(databases.toSeq == Seq("default")) - hc.sql("CREATE DATABASE mee_db") - hc.sql("USE mee_db") - val databases2 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) - assert(databases2.toSet == Set("default", "mee_db")) - val df = (1 to 10).map { i => ("bob" + i.toString, i) }.toDF("name", "age") - df.createOrReplaceTempView("mee_table") - hc.sql("CREATE TABLE moo_table (name string, age int)") - hc.sql("INSERT INTO moo_table SELECT * FROM mee_table") - assert( - hc.sql("SELECT * FROM moo_table order by name").collect().toSeq == - df.collect().toSeq.sortBy(_.getString(0))) - val tables = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) - assert(tables.toSet == Set("moo_table", "mee_table")) - hc.sql("DROP TABLE moo_table") - hc.sql("DROP TABLE mee_table") - val tables2 = hc.sql("SHOW TABLES IN mee_db").select("tableName").collect().map(_.getString(0)) - assert(tables2.isEmpty) - hc.sql("USE default") - hc.sql("DROP DATABASE mee_db CASCADE") - val databases3 = hc.sql("SHOW DATABASES").collect().map(_.getString(0)) - assert(databases3.toSeq == Seq("default")) - } - -} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 0a522b6a11c80..79e569b51ca1d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -108,9 +108,50 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { 
assert(bucketString.contains("10")) } + test("SPARK-30050: analyze/rename table should not erase the bucketing metadata at hive side") { + val catalog = newBasicCatalog() + externalCatalog.client.runSqlHive( + """ + |CREATE TABLE db1.t(a string, b string) + |CLUSTERED BY (a, b) SORTED BY (a, b) INTO 10 BUCKETS + |STORED AS PARQUET + """.stripMargin) + + val bucketString1 = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t") + .filter(_.contains("Num Buckets")).head + assert(bucketString1.contains("10")) + + catalog.alterTableStats("db1", "t", None) + + val bucketString2 = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t") + .filter(_.contains("Num Buckets")).head + assert(bucketString2.contains("10")) + + catalog.renameTable("db1", "t", "t2") + + val bucketString3 = externalCatalog.client.runSqlHive("DESC FORMATTED db1.t2") + .filter(_.contains("Num Buckets")).head + assert(bucketString3.contains("10")) + } + test("SPARK-23001: NullPointerException when running desc database") { val catalog = newBasicCatalog() catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) assert(catalog.getDatabase("dbWithNullDesc").description == "") } + + test("SPARK-29498 CatalogTable to HiveTable should not change the table's ownership") { + val catalog = newBasicCatalog() + val owner = "SPARK-29498" + val hiveTable = CatalogTable( + identifier = TableIdentifier("spark_29498", Some("db1")), + tableType = CatalogTableType.MANAGED, + storage = storageFormat, + owner = owner, + schema = new StructType().add("i", "int"), + provider = Some("hive")) + + catalog.createTable(hiveTable, ignoreIfExists = false) + assert(catalog.getTable("db1", "spark_29498").owner === owner) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 7d9030b8f87ed..3b5a1247bc09c 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -237,6 +237,7 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { Source.fromURL(s"${releaseMirror}/spark").mkString .split("\n") .filter(_.contains("""
  • """.r.findFirstMatchIn(_).get.group(1)) .filter(_ < org.apache.spark.SPARK_VERSION) } catch { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala index c300660458fdd..5912992694e84 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveInspectorSuite.scala @@ -28,7 +28,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn import org.apache.hadoop.io.LongWritable import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.Row +import org.apache.spark.sql.{Row, TestUserClassUDT} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, MapData} @@ -214,6 +214,12 @@ class HiveInspectorSuite extends SparkFunSuite with HiveInspectors { }) } + test("wrap / unwrap UDT Type") { + val dt = new TestUserClassUDT + checkValue(1, unwrap(wrap(1, toInspector(dt), dt), toInspector(dt))) + checkValue(null, unwrap(wrap(null, toInspector(dt), dt), toInspector(dt))) + } + test("wrap / unwrap Struct Type") { val dt = StructType(dataTypes.zipWithIndex.map { case (t, idx) => StructField(s"c_$idx", t) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala index e71aba72c31fe..94a55b911f092 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetadataCacheSuite.scala @@ -38,7 +38,7 @@ class HiveMetadataCacheSuite extends QueryTest with SQLTestUtils with TestHiveSi checkRefreshView(isTemp = false) } - private def checkRefreshView(isTemp: Boolean) { + private def checkRefreshView(isTemp: Boolean): Unit = { 
withView("view_refresh") { withTable("view_table") { // Create a Parquet directory diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 0e45e18c4b175..b8ef44b096eed 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import org.apache.spark.sql.{QueryTest, Row, SaveMode} +import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog.CatalogTableType import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -46,7 +46,7 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils { test("duplicated metastore relations") { val df = spark.sql("SELECT * FROM src") logInfo(df.queryExecution.toString) - df.as('a).join(df.as('b), $"a.key" === $"b.key") + df.as("a").join(df.as("b"), $"a.key" === $"b.key") } test("should not truncate struct type catalog string") { @@ -62,7 +62,7 @@ class HiveMetastoreCatalogSuite extends TestHiveSingleton with SQLTestUtils { spark.sql("create view vw1 as select 1 as id") val plan = spark.sql("select id from vw1").queryExecution.analyzed val aliases = plan.collect { - case x @ SubqueryAlias(AliasIdentifier("vw1", Some("default")), _) => x + case x @ SubqueryAlias(AliasIdentifier("vw1", Seq("default")), _) => x } assert(aliases.size == 1) } @@ -142,8 +142,8 @@ class DataSourceWithHiveMetastoreCatalogSuite import testImplicits._ private val testDF = range(1, 3).select( - ('id + 0.1) cast DecimalType(10, 3) as 'd1, - 'id cast StringType as 'd2 + ($"id" + 0.1) cast DecimalType(10, 3) as "d1", + $"id" cast StringType as "d2" ).coalesce(1) override def beforeAll(): Unit = { @@ -358,4 
+358,24 @@ class DataSourceWithHiveMetastoreCatalogSuite Seq(table("src").count().toString)) } } + + test("SPARK-29869: Fix convertToLogicalRelation throws unclear AssertionError") { + withTempPath(dir => { + val baseDir = s"${dir.getCanonicalFile.toURI.toString}/non_partition_table" + val partitionLikeDir = s"$baseDir/dt=20191113" + spark.range(3).selectExpr("id").write.parquet(partitionLikeDir) + withTable("non_partition_table") { + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> "true") { + spark.sql( + s""" + |CREATE TABLE non_partition_table (id bigint) + |STORED AS PARQUET LOCATION '$baseDir' + |""".stripMargin) + val e = intercept[AnalysisException]( + spark.table("non_partition_table")).getMessage + assert(e.contains("Converted table has 2 columns, but source Hive table has 1 columns.")) + } + } + }) + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala index 5f3705d07bcad..0bdaa0c23c537 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetMetastoreSuite.scala @@ -440,7 +440,7 @@ class HiveParquetMetastoreSuite extends ParquetPartitioningTest { def checkCached(tableIdentifier: TableIdentifier): Unit = { // Converted test_parquet should be cached. 
getCachedDataSourceTable(tableIdentifier) match { - case null => fail("Converted test_parquet should be cached in the cache.") + case null => fail(s"Converted ${tableIdentifier.table} should be cached in the cache.") case LogicalRelation(_: HadoopFsRelation, _, _, _) => // OK case other => fail( @@ -480,7 +480,7 @@ class HiveParquetMetastoreSuite extends ParquetPartitioningTest { |INSERT INTO TABLE test_insert_parquet |select a, b from jt """.stripMargin) - checkCached(tableIdentifier) + assert(getCachedDataSourceTable(tableIdentifier) === null) // Make sure we can read the data. checkAnswer( sql("select * from test_insert_parquet"), @@ -512,14 +512,16 @@ class HiveParquetMetastoreSuite extends ParquetPartitioningTest { |PARTITION (`date`='2015-04-01') |select a, b from jt """.stripMargin) - checkCached(tableIdentifier) + // Right now, insert into a partitioned data source Parquet table. We refreshed the table. + // So, we expect it is not cached. + assert(getCachedDataSourceTable(tableIdentifier) === null) sql( """ |INSERT INTO TABLE test_parquet_partitioned_cache_test |PARTITION (`date`='2015-04-02') |select a, b from jt """.stripMargin) - checkCached(tableIdentifier) + assert(getCachedDataSourceTable(tableIdentifier) === null) // Make sure we can cache the partitioned table. 
table("test_parquet_partitioned_cache_test") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala index de588768cfdee..b557fe73f1154 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveParquetSourceSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.hive import java.io.File +import java.io.IOException import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.sql.catalyst.catalog.HiveTableRelation @@ -212,7 +213,7 @@ class HiveParquetSourceSuite extends ParquetPartitioningTest { val filePath2 = new File(tempDir, "testParquet2").getCanonicalPath val df = Seq(1, 2, 3).map(i => (i, i.toString)).toDF("int", "str") - val df2 = df.as('x).join(df.as('y), $"x.str" === $"y.str").groupBy("y.str").max("y.int") + val df2 = df.as("x").join(df.as("y"), $"x.str" === $"y.str").groupBy("y.str").max("y.int") intercept[Throwable](df2.write.parquet(filePath)) val df3 = df2.toDF("str", "max_int") @@ -222,4 +223,158 @@ class HiveParquetSourceSuite extends ParquetPartitioningTest { assert(df4.columns === Array("str", "max_int")) } } + + test("SPARK-25993 CREATE EXTERNAL TABLE with subdirectories") { + Seq("true", "false").foreach { parquetConversion => + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> parquetConversion) { + withTempPath { path => + withTable("parq_tbl1", "parq_tbl2", "parq_tbl3", + "tbl1", "tbl2", "tbl3", "tbl4", "tbl5", "tbl6") { + val parquetTblStatement1 = + s""" + |CREATE EXTERNAL TABLE parq_tbl1( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}/l1/"}'""".stripMargin + sql(parquetTblStatement1) + + val parquetTblInsertL1 = + s"INSERT INTO TABLE parq_tbl1 VALUES (1, 1, 'parq1'), (2, 2, 'parq2')".stripMargin + sql(parquetTblInsertL1) + + val parquetTblStatement2 = + s""" + 
|CREATE EXTERNAL TABLE parq_tbl2( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}/l1/l2/"}'""".stripMargin + sql(parquetTblStatement2) + + val parquetTblInsertL2 = + s"INSERT INTO TABLE parq_tbl2 VALUES (3, 3, 'parq3'), (4, 4, 'parq4')".stripMargin + sql(parquetTblInsertL2) + + val parquetTblStatement3 = + s""" + |CREATE EXTERNAL TABLE parq_tbl3( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}/l1/l2/l3/"}'""".stripMargin + sql(parquetTblStatement3) + + val parquetTblInsertL3 = + s"INSERT INTO TABLE parq_tbl3 VALUES (5, 5, 'parq5'), (6, 6, 'parq6')".stripMargin + sql(parquetTblInsertL3) + + val topDirStatement = + s""" + |CREATE EXTERNAL TABLE tbl1( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}"}'""".stripMargin + sql(topDirStatement) + if (parquetConversion == "true") { + checkAnswer(sql("SELECT * FROM tbl1"), Nil) + } else { + val msg = intercept[IOException] { + sql("SELECT * FROM tbl1").show() + }.getMessage + assert(msg.contains("Not a file:")) + } + + val l1DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl2( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}/l1/"}'""".stripMargin + sql(l1DirStatement) + if (parquetConversion == "true") { + checkAnswer(sql("SELECT * FROM tbl2"), (1 to 2).map(i => Row(i, i, s"parq$i"))) + } else { + val msg = intercept[IOException] { + sql("SELECT * FROM tbl2").show() + }.getMessage + assert(msg.contains("Not a file:")) + } + + val l2DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl3( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${s"${path.getCanonicalPath}/l1/l2/"}'""".stripMargin + sql(l2DirStatement) + if (parquetConversion == "true") { + checkAnswer(sql("SELECT * FROM tbl3"), (3 to 4).map(i => Row(i, i, s"parq$i"))) + } else { + val msg = intercept[IOException] { + sql("SELECT * FROM 
tbl3").show() + }.getMessage + assert(msg.contains("Not a file:")) + } + + val wildcardTopDirStatement = + s""" + |CREATE EXTERNAL TABLE tbl4( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${new File(s"${path}/*").toURI}'""".stripMargin + sql(wildcardTopDirStatement) + if (parquetConversion == "true") { + checkAnswer(sql("SELECT * FROM tbl4"), (1 to 2).map(i => Row(i, i, s"parq$i"))) + } else { + val msg = intercept[IOException] { + sql("SELECT * FROM tbl4").show() + }.getMessage + assert(msg.contains("Not a file:")) + } + + val wildcardL1DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl5( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${new File(s"${path}/l1/*").toURI}'""".stripMargin + sql(wildcardL1DirStatement) + if (parquetConversion == "true") { + checkAnswer(sql("SELECT * FROM tbl5"), (1 to 4).map(i => Row(i, i, s"parq$i"))) + } else { + val msg = intercept[IOException] { + sql("SELECT * FROM tbl5").show() + }.getMessage + assert(msg.contains("Not a file:")) + } + + val wildcardL2DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl6( + | c1 int, + | c2 int, + | c3 string) + |STORED AS parquet + |LOCATION '${new File(s"${path}/l1/l2/*").toURI}'""".stripMargin + sql(wildcardL2DirStatement) + checkAnswer(sql("SELECT * FROM tbl6"), (3 to 6).map(i => Row(i, i, s"parq$i"))) + } + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala index 0386dc79804c6..e5d572c90af38 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveShowCreateTableSuite.scala @@ -18,10 +18,42 @@ package org.apache.spark.sql.hive import org.apache.spark.sql.{AnalysisException, ShowCreateTableSuite} +import org.apache.spark.sql.catalyst.TableIdentifier +import 
org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSingleton { + private var origCreateHiveTableConfig = false + + protected override def beforeAll(): Unit = { + super.beforeAll() + origCreateHiveTableConfig = + SQLConf.get.getConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED) + SQLConf.get.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, true) + } + + protected override def afterAll(): Unit = { + SQLConf.get.setConf(SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED, + origCreateHiveTableConfig) + super.afterAll() + } + + test("view") { + withView("v1") { + sql("CREATE VIEW v1 AS SELECT 1 AS a") + checkCreateHiveTableOrView("v1", "VIEW") + } + } + + test("view with output columns") { + withView("v1") { + sql("CREATE VIEW v1 (b) AS SELECT 1 AS a") + checkCreateHiveTableOrView("v1", "VIEW") + } + } + test("simple hive table") { withTable("t1") { sql( @@ -36,7 +68,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } @@ -56,7 +88,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } } @@ -76,7 +108,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } @@ -94,7 +126,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } @@ -109,7 +141,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + 
checkCreateHiveTableOrView("t1") } } @@ -131,7 +163,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } @@ -144,7 +176,7 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet |INTO 2 BUCKETS """.stripMargin ) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") } } @@ -172,22 +204,44 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet } assert(cause.getMessage.contains(" - partitioned view")) + + val causeForSpark = intercept[AnalysisException] { + sql("SHOW CREATE TABLE v1 AS SERDE") + } + + assert(causeForSpark.getMessage.contains(" - partitioned view")) } } } test("SPARK-24911: keep quotes for nested fields in hive") { withTable("t1") { - val createTable = "CREATE TABLE `t1`(`a` STRUCT<`b`: STRING>)" + val createTable = "CREATE TABLE `t1` (`a` STRUCT<`b`: STRING>) USING hive" sql(createTable) - val shownDDL = sql(s"SHOW CREATE TABLE t1") - .head() - .getString(0) - .split("\n") - .head - assert(shownDDL == createTable) + val shownDDL = getShowDDL("SHOW CREATE TABLE t1") + assert(shownDDL == createTable.dropRight(" USING hive".length)) - checkCreateTable("t1") + checkCreateHiveTableOrView("t1") + } + } + + /** + * This method compares the given table with the table created by the DDL generated by + * `SHOW CREATE TABLE AS SERDE`. 
+ */ + private def checkCreateHiveTableOrView(tableName: String, checkType: String = "TABLE"): Unit = { + val table = TableIdentifier(tableName, Some("default")) + val db = table.database.getOrElse("default") + val expected = spark.sharedState.externalCatalog.getTable(db, table.table) + val shownDDL = sql(s"SHOW CREATE TABLE ${table.quotedString} AS SERDE").head().getString(0) + sql(s"DROP $checkType ${table.quotedString}") + + try { + sql(shownDDL) + val actual = spark.sharedState.externalCatalog.getTable(db, table.table) + checkCatalogTables(expected, actual) + } finally { + sql(s"DROP $checkType IF EXISTS ${table.table}") } } @@ -195,4 +249,269 @@ class HiveShowCreateTableSuite extends ShowCreateTableSuite with TestHiveSinglet hiveContext.sharedState.externalCatalog.unwrapped.asInstanceOf[HiveExternalCatalog] .client.runSqlHive(ddl) } + + private def checkCreateSparkTableAsHive(tableName: String): Unit = { + val table = TableIdentifier(tableName, Some("default")) + val db = table.database.get + val hiveTable = spark.sharedState.externalCatalog.getTable(db, table.table) + val sparkDDL = sql(s"SHOW CREATE TABLE ${table.quotedString}").head().getString(0) + // Drops original Hive table. + sql(s"DROP TABLE ${table.quotedString}") + + try { + // Creates Spark datasource table using generated Spark DDL. + sql(sparkDDL) + val sparkTable = spark.sharedState.externalCatalog.getTable(db, table.table) + checkHiveCatalogTables(hiveTable, sparkTable) + } finally { + sql(s"DROP TABLE IF EXISTS ${table.table}") + } + } + + private def checkHiveCatalogTables(hiveTable: CatalogTable, sparkTable: CatalogTable): Unit = { + def normalize(table: CatalogTable): CatalogTable = { + val nondeterministicProps = Set( + "CreateTime", + "transient_lastDdlTime", + "grantTime", + "lastUpdateTime", + "last_modified_by", + "last_modified_time", + "Owner:", + // The following are hive specific schema parameters which we do not need to match exactly. 
+ "totalNumberFiles", + "maxFileSize", + "minFileSize" + ) + + table.copy( + createTime = 0L, + lastAccessTime = 0L, + properties = table.properties.filterKeys(!nondeterministicProps.contains(_)), + stats = None, + ignoredProperties = Map.empty, + storage = table.storage.copy(properties = Map.empty), + provider = None, + tracksPartitionsInCatalog = false + ) + } + + def fillSerdeFromProvider(table: CatalogTable): CatalogTable = { + table.provider.flatMap(HiveSerDe.sourceToSerDe(_)).map { hiveSerde => + val newStorage = table.storage.copy( + inputFormat = hiveSerde.inputFormat, + outputFormat = hiveSerde.outputFormat, + serde = hiveSerde.serde + ) + table.copy(storage = newStorage) + }.getOrElse(table) + } + + assert(normalize(fillSerdeFromProvider(sparkTable)) == normalize(hiveTable)) + } + + test("simple hive table in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 STRING COMMENT 'bla', + | c2 STRING + |) + |TBLPROPERTIES ( + | 'prop1' = 'value1', + | 'prop2' = 'value2' + |) + |STORED AS orc + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + + test("show create table as serde can't work on data source table") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 STRING COMMENT 'bla', + | c2 STRING + |) + |USING orc + """.stripMargin + ) + + val cause = intercept[AnalysisException] { + checkCreateHiveTableOrView("t1") + } + + assert(cause.getMessage.contains("Use `SHOW CREATE TABLE` without `AS SERDE` instead")) + } + } + + test("simple external hive table in Spark DDL") { + withTempDir { dir => + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 STRING COMMENT 'bla', + | c2 STRING + |) + |LOCATION '${dir.toURI}' + |TBLPROPERTIES ( + | 'prop1' = 'value1', + | 'prop2' = 'value2' + |) + |STORED AS orc + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + } + + test("hive table with STORED AS clause in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 INT 
COMMENT 'bla', + | c2 STRING + |) + |STORED AS PARQUET + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + + test("hive table with nested fields with STORED AS clause in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 INT COMMENT 'bla', + | c2 STRING, + | c3 STRUCT + |) + |STORED AS PARQUET + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + + test("hive table with unsupported fileformat in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 INT COMMENT 'bla', + | c2 STRING + |) + |STORED AS RCFILE + """.stripMargin + ) + + val cause = intercept[AnalysisException] { + checkCreateSparkTableAsHive("t1") + } + + assert(cause.getMessage.contains("unsupported serde configuration")) + } + } + + test("hive table with serde info in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 INT COMMENT 'bla', + | c2 STRING + |) + |ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' + |STORED AS + | INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' + | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + + test("hive view is not supported by show create table without as serde") { + withTable("t1") { + withView("v1") { + sql("CREATE TABLE t1 (c1 STRING, c2 STRING)") + + createRawHiveTable( + s""" + |CREATE VIEW v1 + |AS SELECT * from t1 + """.stripMargin + ) + + val cause = intercept[AnalysisException] { + sql("SHOW CREATE TABLE v1") + } + + assert(cause.getMessage.contains("view isn't supported")) + } + } + } + + test("partitioned, bucketed hive table in Spark DDL") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | emp_id INT COMMENT 'employee id', emp_name STRING, + | emp_dob STRING COMMENT 'employee date of birth', emp_sex STRING COMMENT 'M/F' + |) + |COMMENT 'employee table' + |PARTITIONED BY ( + | emp_country 
STRING COMMENT '2-char code', emp_state STRING COMMENT '2-char code' + |) + |CLUSTERED BY (emp_sex) SORTED BY (emp_id ASC) INTO 10 BUCKETS + |STORED AS ORC + """.stripMargin + ) + + checkCreateSparkTableAsHive("t1") + } + } + + test("show create table for transactional hive table") { + withTable("t1") { + sql( + s""" + |CREATE TABLE t1 ( + | c1 STRING COMMENT 'bla', + | c2 STRING + |) + |TBLPROPERTIES ( + | 'transactional' = 'true', + | 'prop1' = 'value1', + | 'prop2' = 'value2' + |) + |CLUSTERED BY (c1) INTO 10 BUCKETS + |STORED AS ORC + """.stripMargin + ) + + + val cause = intercept[AnalysisException] { + sql("SHOW CREATE TABLE t1") + } + + assert(cause.getMessage.contains( + "SHOW CREATE TABLE doesn't support transactional Hive table")) + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index d23e0f2e0d937..31ff62ed0a530 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -24,6 +24,7 @@ import scala.util.Properties import org.apache.commons.lang3.{JavaVersion, SystemUtils} import org.apache.hadoop.fs.Path import org.scalatest.{BeforeAndAfterEach, Matchers} +import org.scalatest.Assertions._ import org.apache.spark._ import org.apache.spark.internal.Logging @@ -33,7 +34,7 @@ import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.expressions.Window -import org.apache.spark.sql.hive.test.{HiveTestUtils, TestHiveContext} +import org.apache.spark.sql.hive.test.{HiveTestJars, TestHiveContext} import org.apache.spark.sql.internal.SQLConf.SHUFFLE_PARTITIONS import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.types.{DecimalType, 
StructType} @@ -52,7 +53,7 @@ class HiveSparkSubmitSuite override protected val enableAutoThreadAudit = false - override def beforeEach() { + override def beforeEach(): Unit = { super.beforeEach() } @@ -111,8 +112,8 @@ class HiveSparkSubmitSuite val unusedJar = TestUtils.createJarWithClasses(Seq.empty) val jar1 = TestUtils.createJarWithClasses(Seq("SparkSubmitClassA")) val jar2 = TestUtils.createJarWithClasses(Seq("SparkSubmitClassB")) - val jar3 = HiveTestUtils.getHiveContribJar.getCanonicalPath - val jar4 = HiveTestUtils.getHiveHcatalogCoreJar.getCanonicalPath + val jar3 = HiveTestJars.getHiveContribJar().getCanonicalPath + val jar4 = HiveTestJars.getHiveHcatalogCoreJar().getCanonicalPath val jarsString = Seq(jar1, jar2, jar3, jar4).map(j => j.toString).mkString(",") val args = Seq( "--class", SparkSubmitClassLoaderTest.getClass.getName.stripSuffix("$"), @@ -321,7 +322,7 @@ class HiveSparkSubmitSuite "--master", "local-cluster[2,1,1024]", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", - "--jars", HiveTestUtils.getHiveContribJar.getCanonicalPath, + "--jars", HiveTestJars.getHiveContribJar().getCanonicalPath, unusedJar.toString) runSparkSubmit(argsForCreateTable) @@ -454,7 +455,7 @@ object SetWarehouseLocationTest extends Logging { // and use this UDF. We need to run this test in separate JVM to make sure we // can load the jar defined with the function. object TemporaryHiveUDFTest extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() conf.set(UI_ENABLED, false) @@ -463,7 +464,7 @@ object TemporaryHiveUDFTest extends Logging { // Load a Hive UDF from the jar. 
logInfo("Registering a temporary Hive UDF provided in a jar.") - val jar = HiveTestUtils.getHiveContribJar.getCanonicalPath + val jar = HiveTestJars.getHiveContribJar().getCanonicalPath hiveContext.sql( s""" |CREATE TEMPORARY FUNCTION example_max @@ -492,7 +493,7 @@ object TemporaryHiveUDFTest extends Logging { // and use this UDF. We need to run this test in separate JVM to make sure we // can load the jar defined with the function. object PermanentHiveUDFTest1 extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() conf.set(UI_ENABLED, false) @@ -501,7 +502,7 @@ object PermanentHiveUDFTest1 extends Logging { // Load a Hive UDF from the jar. logInfo("Registering a permanent Hive UDF provided in a jar.") - val jar = HiveTestUtils.getHiveContribJar.getCanonicalPath + val jar = HiveTestJars.getHiveContribJar().getCanonicalPath hiveContext.sql( s""" |CREATE FUNCTION example_max @@ -530,7 +531,7 @@ object PermanentHiveUDFTest1 extends Logging { // resources can be used. We need to run this test in separate JVM to make sure we // can load the jar defined with the function. object PermanentHiveUDFTest2 extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() conf.set(UI_ENABLED, false) @@ -538,7 +539,7 @@ object PermanentHiveUDFTest2 extends Logging { val hiveContext = new TestHiveContext(sc) // Load a Hive UDF from the jar. 
logInfo("Write the metadata of a permanent Hive UDF into metastore.") - val jar = HiveTestUtils.getHiveContribJar.getCanonicalPath + val jar = HiveTestJars.getHiveContribJar().getCanonicalPath val function = CatalogFunction( FunctionIdentifier("example_max"), "org.apache.hadoop.hive.contrib.udaf.example.UDAFExampleMax", @@ -565,7 +566,7 @@ object PermanentHiveUDFTest2 extends Logging { // This object is used for testing SPARK-8368: https://issues.apache.org/jira/browse/SPARK-8368. // We test if we can load user jars in both driver and executors when HiveContext is used. object SparkSubmitClassLoaderTest extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") val conf = new SparkConf() val hiveWarehouseLocation = Utils.createTempDir() @@ -635,7 +636,7 @@ object SparkSubmitClassLoaderTest extends Logging { // This object is used for testing SPARK-8020: https://issues.apache.org/jira/browse/SPARK-8020. // We test if we can correctly set spark sql configurations when HiveContext is used. object SparkSQLConfTest extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") // We override the SparkConf to add spark.sql.hive.metastore.version and // spark.sql.hive.metastore.jars to the beginning of the conf entry array. 
@@ -699,7 +700,7 @@ object SPARK_9757 extends QueryTest { val df = hiveContext .range(10) - .select(('id + 0.1) cast DecimalType(10, 3) as 'dec) + .select(($"id" + 0.1) cast DecimalType(10, 3) as "dec") df.write.option("path", dir.getCanonicalPath).mode("overwrite").saveAsTable("t") checkAnswer(hiveContext.table("t"), df) } @@ -708,7 +709,7 @@ object SPARK_9757 extends QueryTest { val df = hiveContext .range(10) - .select(callUDF("struct", ('id + 0.2) cast DecimalType(10, 3)) as 'dec_struct) + .select(callUDF("struct", ($"id" + 0.2) cast DecimalType(10, 3)) as "dec_struct") df.write.option("path", dir.getCanonicalPath).mode("overwrite").saveAsTable("t") checkAnswer(hiveContext.table("t"), df) } @@ -770,8 +771,8 @@ object SPARK_14244 extends QueryTest { import hiveContext.implicits._ try { - val window = Window.orderBy('id) - val df = spark.range(2).select(cume_dist().over(window).as('cdist)).orderBy('cdist) + val window = Window.orderBy("id") + val df = spark.range(2).select(cume_dist().over(window).as("cdist")).orderBy("cdist") checkAnswer(df, Seq(Row(0.5D), Row(1.0D))) } finally { sparkContext.stop() @@ -806,14 +807,14 @@ object SPARK_18360 { // Hive will use the value of `hive.metastore.warehouse.dir` to generate default table // location for tables in default database. assert(rawTable.storage.locationUri.map( - CatalogUtils.URIToString(_)).get.contains(newWarehousePath)) + CatalogUtils.URIToString).get.contains(newWarehousePath)) hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = false, purge = false) spark.sharedState.externalCatalog.createTable(tableMeta, ignoreIfExists = false) val readBack = spark.sharedState.externalCatalog.getTable("default", "test_tbl") // Spark SQL will use the location of default database to generate default table // location for tables in default database. 
- assert(readBack.storage.locationUri.map(CatalogUtils.URIToString(_)) + assert(readBack.storage.locationUri.map(CatalogUtils.URIToString) .get.contains(defaultDbLocation)) } finally { hiveClient.dropTable("default", "test_tbl", ignoreIfNotExists = true, purge = false) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala new file mode 100644 index 0000000000000..ca1af73b038a7 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUserDefinedTypeSuite.scala @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive + +import scala.collection.JavaConverters._ +import scala.util.Random + +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF +import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, StandardListObjectInspector} +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT} +import org.apache.spark.sql.types.StructType + +class HiveUserDefinedTypeSuite extends QueryTest with TestHiveSingleton { + private val functionClass = classOf[org.apache.spark.sql.hive.TestUDF].getCanonicalName + + test("Support UDT in Hive UDF") { + val functionName = "get_point_x" + try { + val schema = new StructType().add("point", new ExamplePointUDT, nullable = false) + val input = Row.fromSeq(Seq(new ExamplePoint(3.141592d, -3.141592d))) + val df = spark.createDataFrame(Array(input).toList.asJava, schema) + df.createOrReplaceTempView("src") + spark.sql(s"CREATE FUNCTION $functionName AS '$functionClass'") + + checkAnswer( + spark.sql(s"SELECT $functionName(point) FROM src"), + Row(input.getAs[ExamplePoint](0).x)) + } finally { + // If the test failed part way, we don't want to mask the failure by failing to remove + // temp tables that never got created. + spark.sql(s"DROP FUNCTION IF EXISTS $functionName") + assert( + !spark.sessionState.catalog.functionExists(FunctionIdentifier(functionName)), + s"Function $functionName should have been dropped. 
But, it still exists.") + } + } +} + +class TestUDF extends GenericUDF { + private var data: StandardListObjectInspector = _ + + override def getDisplayString(children: Array[String]): String = "get_point_x" + + override def initialize(arguments: Array[ObjectInspector]): ObjectInspector = { + data = arguments(0).asInstanceOf[StandardListObjectInspector] + PrimitiveObjectInspectorFactory.javaDoubleObjectInspector + } + + override def evaluate(arguments: Array[GenericUDF.DeferredObject]): AnyRef = { + val point = data.getList(arguments(0).get()) + java.lang.Double.valueOf(point.get(0).asInstanceOf[Double]) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index daf06645abc24..4ad97eaa2b1c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -54,6 +54,15 @@ class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton } } + test("newTemporaryConfiguration respect spark.hive.foo=bar in SparkConf") { + sys.props.put("spark.hive.foo", "bar") + Seq(true, false) foreach { useInMemoryDerby => + val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) + assert(!hiveConf.contains("spark.hive.foo")) + assert(hiveConf("hive.foo") === "bar") + } + } + test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index 73f5bbd88624e..421dcb499bd6a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -19,13 +19,13 @@ package org.apache.spark.sql.hive import 
java.io.File +import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.scalatest.{BeforeAndAfter, PrivateMethodTester} import org.apache.spark.SparkException import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.sql.catalyst.parser.ParseException -import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable import org.apache.spark.sql.hive.execution.InsertIntoHiveTable import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf @@ -462,7 +462,7 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter // Columns `c + 1` and `d + 1` are resolved by position, and thus mapped to partition // columns `b` and `c` of the target table. val df = Seq((1, 2, 3, 4)).toDF("a", "b", "c", "d") - df.select('a + 1, 'b + 1, 'c + 1, 'd + 1).write.insertInto(tableName) + df.select($"a" + 1, $"b" + 1, $"c" + 1, $"d" + 1).write.insertInto(tableName) checkAnswer( sql(s"SELECT a, b, c, d FROM $tableName"), @@ -556,7 +556,7 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter val inputPath = new Path("/tmp/b/c") var stagingDir = "tmp/b" val saveHiveFile = InsertIntoHiveTable(null, Map.empty, null, false, false, null) - val getStagingDir = PrivateMethod[Path]('getStagingDir) + val getStagingDir = PrivateMethod[Path](Symbol("getStagingDir")) var path = saveHiveFile invokePrivate getStagingDir(inputPath, conf, stagingDir) assert(path.toString.indexOf("/tmp/b_hive_") != -1) @@ -824,4 +824,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter } } } + + test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") { + withTable("t1", "t2") { + withTempDir { dir => + val file = new File(dir, "test.hex") + val hex = "AABBCC" + val bs = org.apache.commons.codec.binary.Hex.decodeHex(hex.toCharArray) + Files.write(bs, file) + val path = file.getParent + sql(s"create table t1 (c string) STORED AS TEXTFILE 
location '$path'") + checkAnswer( + sql("select hex(c) from t1"), + Row(hex) + ) + + sql("create table t2 as select c from t1") + checkAnswer( + sql("select hex(c) from t2"), + Row(hex) + ) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index ba807fb58fe40..41a26344f7c21 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -516,13 +516,13 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv } withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "json") { - sparkSession.catalog.createExternalTable("createdJsonTable", tempPath.toString) + sparkSession.catalog.createTable("createdJsonTable", tempPath.toString) assert(table("createdJsonTable").schema === df.schema) checkAnswer(sql("SELECT * FROM createdJsonTable"), df) assert( intercept[AnalysisException] { - sparkSession.catalog.createExternalTable("createdJsonTable", jsonFilePath.toString) + sparkSession.catalog.createTable("createdJsonTable", jsonFilePath.toString) }.getMessage.contains("Table createdJsonTable already exists."), "We should complain that createdJsonTable already exists") } @@ -534,7 +534,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // Try to specify the schema. 
withSQLConf(SQLConf.DEFAULT_DATA_SOURCE_NAME.key -> "not a source name") { val schema = StructType(StructField("b", StringType, true) :: Nil) - sparkSession.catalog.createExternalTable( + sparkSession.catalog.createTable( "createdJsonTable", "org.apache.spark.sql.json", schema, @@ -553,7 +553,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv test("path required error") { assert( intercept[AnalysisException] { - sparkSession.catalog.createExternalTable( + sparkSession.catalog.createTable( "createdJsonTable", "org.apache.spark.sql.json", Map.empty[String, String]) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala index 9060ce2e0eb4b..2d3e462531245 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MultiDatabaseSuite.scala @@ -66,14 +66,14 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle } } - test(s"createExternalTable() to non-default database - with USE") { + test(s"createTable() to non-default database - with USE") { withTempDatabase { db => activateDatabase(db) { withTempPath { dir => val path = dir.getCanonicalPath df.write.format("parquet").mode(SaveMode.Overwrite).save(path) - spark.catalog.createExternalTable("t", path, "parquet") + spark.catalog.createTable("t", path, "parquet") assert(getTableNames(Option(db)).contains("t")) checkAnswer(spark.table("t"), df) @@ -92,12 +92,12 @@ class MultiDatabaseSuite extends QueryTest with SQLTestUtils with TestHiveSingle } } - test(s"createExternalTable() to non-default database - without USE") { + test(s"createTable() to non-default database - without USE") { withTempDatabase { db => withTempPath { dir => val path = dir.getCanonicalPath df.write.format("parquet").mode(SaveMode.Overwrite).save(path) - spark.catalog.createExternalTable(s"$db.t", 
path, "parquet") + spark.catalog.createTable(s"$db.t", path, "parquet") assert(getTableNames(Option(db)).contains("t")) checkAnswer(spark.table(s"$db.t"), df) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 40581066c62bb..488175a22bad7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -54,7 +54,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto Seq(dsTbl, hiveTbl).foreach { tbl => sql(s"ANALYZE TABLE $tbl COMPUTE STATISTICS") - val catalogStats = getCatalogStatistics(tbl) + val catalogStats = getTableStats(tbl) withSQLConf(SQLConf.CBO_ENABLED.key -> "false") { val relationStats = spark.table(tbl).queryExecution.optimizedPlan.stats assert(relationStats.sizeInBytes == catalogStats.sizeInBytes) @@ -158,7 +158,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Non-partitioned table val nonPartTable = "non_part_table" withTable(nonPartTable) { - sql(s"CREATE TABLE $nonPartTable (key STRING, value STRING)") + sql(s"CREATE TABLE $nonPartTable (key STRING, value STRING) USING hive") sql(s"INSERT INTO TABLE $nonPartTable SELECT * FROM src") sql(s"INSERT INTO TABLE $nonPartTable SELECT * FROM src") @@ -312,7 +312,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - assert(getCatalogStatistics(tableName).sizeInBytes === BigInt(17436)) + assert(getTableStats(tableName).sizeInBytes === BigInt(17436)) } } } @@ -353,11 +353,11 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto // Analyze original table - expect 3 partitions sql(s"ANALYZE TABLE $sourceTableName COMPUTE STATISTICS noscan") - assert(getCatalogStatistics(sourceTableName).sizeInBytes === 
BigInt(3 * 5812)) + assert(getTableStats(sourceTableName).sizeInBytes === BigInt(3 * 5812)) // Analyze partial-copy table - expect only 1 partition sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS noscan") - assert(getCatalogStatistics(tableName).sizeInBytes === BigInt(5812)) + assert(getTableStats(tableName).sizeInBytes === BigInt(5812)) } } } @@ -840,7 +840,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto test("alter table should not have the side effect to store statistics in Spark side") { val table = "alter_table_side_effect" withTable(table) { - sql(s"CREATE TABLE $table (i string, j string)") + sql(s"CREATE TABLE $table (i string, j string) USING hive") sql(s"INSERT INTO TABLE $table SELECT 'a', 'b'") val catalogTable1 = getCatalogTable(table) val hiveSize1 = BigInt(catalogTable1.ignoredProperties(StatsSetupConst.TOTAL_SIZE)) @@ -1204,7 +1204,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto assert(tsHistogramProps.size == 1) // Validate histogram after deserialization. 
- val cs = getCatalogStatistics(tableName).colStats + val cs = getTableStats(tableName).colStats val intHistogram = cs("cint").histogram.get val tsHistogram = cs("ctimestamp").histogram.get assert(intHistogram.bins.length == spark.sessionState.conf.histogramNumBins) @@ -1514,4 +1514,46 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } } + + test("SPARK-30269 failed to update partition stats if it's equal to table's old stats") { + val tbl = "SPARK_30269" + val ext_tbl = "SPARK_30269_external" + withTempDir { dir => + withTable(tbl, ext_tbl) { + sql(s"CREATE TABLE $tbl (key INT, value STRING, ds STRING) PARTITIONED BY (ds)") + sql( + s""" + | CREATE TABLE $ext_tbl (key INT, value STRING, ds STRING) + | PARTITIONED BY (ds) + | LOCATION '${dir.toURI}' + """.stripMargin) + + Seq(tbl, ext_tbl).foreach { tblName => + sql(s"INSERT INTO $tblName VALUES (1, 'a', '2019-12-13')") + + // analyze table + sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS NOSCAN") + var tableStats = getTableStats(tblName) + assert(tableStats.sizeInBytes == 601) + assert(tableStats.rowCount.isEmpty) + + sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS") + tableStats = getTableStats(tblName) + assert(tableStats.sizeInBytes == 601) + assert(tableStats.rowCount.get == 1) + + // analyze a single partition + sql(s"ANALYZE TABLE $tblName PARTITION (ds='2019-12-13') COMPUTE STATISTICS NOSCAN") + var partStats = getPartitionStats(tblName, Map("ds" -> "2019-12-13")) + assert(partStats.sizeInBytes == 601) + assert(partStats.rowCount.isEmpty) + + sql(s"ANALYZE TABLE $tblName PARTITION (ds='2019-12-13') COMPUTE STATISTICS") + partStats = getPartitionStats(tblName, Map("ds" -> "2019-12-13")) + assert(partStats.sizeInBytes == 601) + assert(partStats.rowCount.get == 1) + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala new 
file mode 100644 index 0000000000000..77956f4fe69da --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuite.scala @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.client + +import java.security.PrivilegedExceptionAction + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.security.UserGroupInformation +import org.scalatest.{BeforeAndAfterAll, PrivateMethodTester} + +import org.apache.spark.util.Utils + +class HiveClientUserNameSuite(version: String) extends HiveVersionSuite(version) { + + test("username of HiveClient - no UGI") { + // Assuming we're not faking System username + assert(getUserNameFromHiveClient === System.getProperty("user.name")) + } + + test("username of HiveClient - UGI") { + val ugi = UserGroupInformation.createUserForTesting( + "fakeprincipal@EXAMPLE.COM", Array.empty) + ugi.doAs(new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { + assert(getUserNameFromHiveClient === ugi.getShortUserName) + } + }) + } + + test("username of HiveClient - Proxy user") { + val ugi = UserGroupInformation.createUserForTesting( + "fakeprincipal@EXAMPLE.COM", Array.empty) + val 
proxyUgi = UserGroupInformation.createProxyUserForTesting( + "proxyprincipal@EXAMPLE.COM", ugi, Array.empty) + proxyUgi.doAs(new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { + assert(getUserNameFromHiveClient === proxyUgi.getShortUserName) + } + }) + } + + private def getUserNameFromHiveClient: String = { + val hadoopConf = new Configuration() + hadoopConf.set("hive.metastore.warehouse.dir", Utils.createTempDir().toURI().toString()) + val client = buildClient(hadoopConf) + client.userName + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DescribeTableStatement.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala similarity index 73% rename from sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DescribeTableStatement.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala index 02604b4ac5ac1..e076c01c08980 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/sql/DescribeTableStatement.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientUserNameSuites.scala @@ -15,11 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.catalyst.plans.logical.sql +package org.apache.spark.sql.hive.client -import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import scala.collection.immutable.IndexedSeq -case class DescribeTableStatement( - tableName: Seq[String], - partitionSpec: TablePartitionSpec, - isExtended: Boolean) extends ParsedStatement +import org.scalatest.Suite + +class HiveClientUserNameSuites extends Suite with HiveClientVersions { + override def nestedSuites: IndexedSeq[Suite] = { + versions.map(new HiveClientUserNameSuite(_)) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala similarity index 98% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala index bda711200acdb..2d615f6fdc261 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuite.scala @@ -31,8 +31,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.{BooleanType, IntegerType, LongType, StructType} import org.apache.spark.util.Utils -// TODO: Refactor this to `HivePartitionFilteringSuite` -class HiveClientSuite(version: String) +class HivePartitionFilteringSuite(version: String) extends HiveVersionSuite(version) with BeforeAndAfterAll { private val tryDirectSqlKey = HiveConf.ConfVars.METASTORE_TRY_DIRECT_SQL.varname @@ -95,7 +94,7 @@ class HiveClientSuite(version: String) } } - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() client = init(true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala similarity index 87% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala rename to sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala index de1be2115b2d8..a43e778b13b92 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HiveClientSuites.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HivePartitionFilteringSuites.scala @@ -21,9 +21,9 @@ import scala.collection.immutable.IndexedSeq import org.scalatest.Suite -class HiveClientSuites extends Suite with HiveClientVersions { +class HivePartitionFilteringSuites extends Suite with HiveClientVersions { override def nestedSuites: IndexedSeq[Suite] = { // Hive 0.12 does not provide the partition filtering API we call - versions.filterNot(_ == "0.12").map(new HiveClientSuite(_)) + versions.filterNot(_ == "0.12").map(new HivePartitionFilteringSuite(_)) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index da2acdc4aa378..4760af7aa46ff 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -22,10 +22,12 @@ import java.net.URI import org.apache.commons.lang3.{JavaVersion, SystemUtils} import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.common.StatsSetupConst import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred.TextInputFormat +import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging @@ -169,6 +171,34 @@ class VersionsSuite extends SparkFunSuite 
with Logging { client.createDatabase(tempDB, ignoreIfExists = true) } + test(s"$version: create/get/alter database should pick right user name as owner") { + if (version != "0.12") { + val currentUser = UserGroupInformation.getCurrentUser.getUserName + val ownerName = "SPARK_29425" + val db1 = "SPARK_29425_1" + val db2 = "SPARK_29425_2" + val ownerProps = Map("owner" -> ownerName) + + // create database with owner + val dbWithOwner = CatalogDatabase(db1, "desc", Utils.createTempDir().toURI, ownerProps) + client.createDatabase(dbWithOwner, ignoreIfExists = true) + val getDbWithOwner = client.getDatabase(db1) + assert(getDbWithOwner.properties("owner") === ownerName) + // alter database without owner + client.alterDatabase(getDbWithOwner.copy(properties = Map())) + assert(client.getDatabase(db1).properties("owner") === "") + + // create database without owner + val dbWithoutOwner = CatalogDatabase(db2, "desc", Utils.createTempDir().toURI, Map()) + client.createDatabase(dbWithoutOwner, ignoreIfExists = true) + val getDbWithoutOwner = client.getDatabase(db2) + assert(getDbWithoutOwner.properties("owner") === currentUser) + // alter database with owner + client.alterDatabase(getDbWithoutOwner.copy(properties = ownerProps)) + assert(client.getDatabase(db2).properties("owner") === ownerName) + } + } + test(s"$version: createDatabase with null description") { withTempDir { tmpDir => val dbWithNullDesc = @@ -201,6 +231,22 @@ class VersionsSuite extends SparkFunSuite with Logging { val database = client.getDatabase("temporary").copy(properties = Map("flag" -> "true")) client.alterDatabase(database) assert(client.getDatabase("temporary").properties.contains("flag")) + + // test alter database location + val tempDatabasePath2 = Utils.createTempDir().toURI + // Hive support altering database location since HIVE-8472. 
+ if (version == "3.0" || version == "3.1") { + client.alterDatabase(database.copy(locationUri = tempDatabasePath2)) + val uriInCatalog = client.getDatabase("temporary").locationUri + assert("file" === uriInCatalog.getScheme) + assert(new Path(tempDatabasePath2.getPath).toUri.getPath === uriInCatalog.getPath, + "Failed to alter database location") + } else { + val e = intercept[AnalysisException] { + client.alterDatabase(database.copy(locationUri = tempDatabasePath2)) + } + assert(e.getMessage.contains("does not support altering database location")) + } } test(s"$version: dropDatabase") { @@ -274,6 +320,19 @@ class VersionsSuite extends SparkFunSuite with Logging { assert(client.getTable("default", "src").properties.contains("changed")) } + test(s"$version: alterTable - should respect the original catalog table's owner name") { + val ownerName = "SPARK-29405" + val originalTable = client.getTable("default", "src") + // mocking the owner is what we declared + val newTable = originalTable.copy(owner = ownerName) + client.alterTable(newTable) + assert(client.getTable("default", "src").owner === ownerName) + // mocking the owner is empty + val newTable2 = originalTable.copy(owner = "") + client.alterTable(newTable2) + assert(client.getTable("default", "src").owner === client.userName) + } + test(s"$version: alterTable(dbName: String, tableName: String, table: CatalogTable)") { val newTable = client.getTable("default", "src").copy(properties = Map("changedAgain" -> "")) client.alterTable("default", "src", newTable) @@ -891,7 +950,7 @@ class VersionsSuite extends SparkFunSuite with Logging { """.stripMargin ) - val errorMsg = "data type mismatch: cannot cast decimal(2,1) to binary" + val errorMsg = "Cannot safely cast 'f0': DecimalType(2,1) to BinaryType" if (isPartitioned) { val insertStmt = s"INSERT OVERWRITE TABLE $tableName partition (ds='a') SELECT 1.3" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala index d06cc1c0a88ac..f84b854048e8a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/AggregationQuerySuite.scala @@ -24,7 +24,7 @@ import test.org.apache.spark.sql.MyDoubleAvg import test.org.apache.spark.sql.MyDoubleSum import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.{CodegenObjectFactoryMode, UnsafeRow} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -1018,6 +1018,31 @@ abstract class AggregationQuerySuite extends QueryTest with SQLTestUtils with Te val agg2 = agg1.groupBy($"text").agg(sum($"avg_res")) checkAnswer(agg2, Row("a", BigDecimal("11.9999999994857142860000"))) } + + test("SPARK-29122: hash-based aggregates for unfixed-length decimals in the interpreter mode") { + withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString) { + withTempView("t") { + spark.range(3).selectExpr("CAST(id AS decimal(38, 0)) a").createOrReplaceTempView("t") + checkAnswer(sql("SELECT SUM(a) FROM t"), Row(java.math.BigDecimal.valueOf(3))) + } + } + } + + test("SPARK-29140: HashAggregateExec aggregating binary type doesn't break codegen compilation") { + val schema = new StructType().add("id", IntegerType, nullable = false) + .add("c1", BinaryType, nullable = true) + + withSQLConf( + SQLConf.CODEGEN_SPLIT_AGGREGATE_FUNC.key -> "true", + SQLConf.CODEGEN_METHOD_SPLIT_THRESHOLD.key -> "1") { + val emptyRows = spark.sparkContext.parallelize(Seq.empty[Row], 1) + val aggDf = spark.createDataFrame(emptyRows, schema) + .groupBy($"id" % 10 as "group") + 
.agg(countDistinct($"c1")) + checkAnswer(aggDf, Seq.empty[Row]) + } + } } @@ -1038,7 +1063,7 @@ class HashAggregationQueryWithControlledFallbackSuite extends AggregationQuerySu // todo: remove it? val newActual = Dataset.ofRows(spark, actual.logicalPlan) - QueryTest.checkAnswer(newActual, expectedAnswer) match { + QueryTest.getErrorMessageInCheckAnswer(newActual, expectedAnswer) match { case Some(errorMessage) => val newErrorMessage = s""" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala index 9147a98c94457..dbbf2b29fe8b7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveCommandSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.command.LoadDataCommand import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StructType @@ -57,7 +58,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |STORED AS PARQUET |TBLPROPERTIES('prop1Key'="prop1Val", '`prop2Key`'="prop2Val") """.stripMargin) - sql("CREATE TABLE parquet_tab3(col1 int, `col 2` int)") + sql("CREATE TABLE parquet_tab3(col1 int, `col 2` int) USING hive") sql("CREATE TABLE parquet_tab4 (price int, qty int) partitioned by (year int, month int)") sql("INSERT INTO parquet_tab4 PARTITION(year = 2015, month = 1) SELECT 1, 1") sql("INSERT INTO parquet_tab4 PARTITION(year = 2015, month = 2) SELECT 2, 2") @@ -129,10 +130,10 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("show tblproperties for 
datasource table - errors") { - val message1 = intercept[NoSuchTableException] { + val message = intercept[AnalysisException] { sql("SHOW TBLPROPERTIES badtable") }.getMessage - assert(message1.contains("Table or view 'badtable' not found in database 'default'")) + assert(message.contains("Table not found: badtable")) // When key is not found, a row containing the error is returned. checkAnswer( @@ -146,7 +147,7 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto checkAnswer(sql("SHOW TBLPROPERTIES parquet_tab2('`prop2Key`')"), Row("prop2Val")) } - test("show tblproperties for spark temporary table - empty row") { + test("show tblproperties for spark temporary table - AnalysisException is thrown") { withTempView("parquet_temp") { sql( """ @@ -154,8 +155,10 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto |USING org.apache.spark.sql.parquet.DefaultSource """.stripMargin) - // An empty sequence of row is returned for session temporary table. 
- checkAnswer(sql("SHOW TBLPROPERTIES parquet_temp"), Nil) + val message = intercept[AnalysisException] { + sql("SHOW TBLPROPERTIES parquet_temp") + }.getMessage + assert(message.contains("parquet_temp is a temp view not table")) } } @@ -289,7 +292,29 @@ class HiveCommandSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } checkAnswer( sql("SELECT employeeID, employeeName FROM part_table WHERE c = '2' AND d = '1'"), - sql("SELECT * FROM non_part_table").collect()) + sql("SELECT * FROM non_part_table")) + } + } + + test("SPARK-28084 case insensitive names of static partitioning in INSERT commands") { + withTable("part_table") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql("CREATE TABLE part_table (price int, qty int) partitioned by (year int, month int)") + sql("INSERT INTO part_table PARTITION(YEar = 2015, month = 1) SELECT 1, 1") + checkAnswer(sql("SELECT * FROM part_table"), Row(1, 1, 2015, 1)) + } + } + } + + test("SPARK-28084 case insensitive names of dynamic partitioning in INSERT commands") { + withTable("part_table") { + withSQLConf( + SQLConf.CASE_SENSITIVE.key -> "false", + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + sql("CREATE TABLE part_table (price int) partitioned by (year int)") + sql("INSERT INTO part_table PARTITION(YEar) SELECT 1, 2019") + checkAnswer(sql("SELECT * FROM part_table"), Row(1, 2019)) + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index c0158f1947d99..8b1f4c92755b9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -30,7 +30,6 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.planning.PhysicalOperation import 
org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.plans.logical.sql.{DescribeColumnStatement, DescribeTableStatement} import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.HiveResult.hiveResultString import org.apache.spark.sql.execution.SQLExecution @@ -229,7 +228,7 @@ abstract class HiveComparisonTest sql: String, reset: Boolean = true, tryWithoutResettingFirst: Boolean = false, - skip: Boolean = false) { + skip: Boolean = false): Unit = { // testCaseName must not contain ':', which is not allowed to appear in a filename of Windows assert(!testCaseName.contains(":")) @@ -347,8 +346,7 @@ abstract class HiveComparisonTest val catalystResults = queryList.zip(hiveResults).map { case (queryString, hive) => val query = new TestHiveQueryExecution(queryString.replace("../../data", testDataPath)) def getResult(): Seq[String] = { - SQLExecution.withNewExecutionId( - query.sparkSession, query)(hiveResultString(query.executedPlan)) + SQLExecution.withNewExecutionId(query)(hiveResultString(query.executedPlan)) } try { (query, prepareAnswer(query, getResult())) } catch { case e: Throwable => @@ -374,10 +372,10 @@ abstract class HiveComparisonTest // We will ignore the ExplainCommand, ShowFunctions, DescribeFunction if ((!hiveQuery.logical.isInstanceOf[ExplainCommand]) && - (!hiveQuery.logical.isInstanceOf[ShowFunctionsCommand]) && - (!hiveQuery.logical.isInstanceOf[DescribeFunctionCommand]) && + (!hiveQuery.logical.isInstanceOf[ShowFunctionsStatement]) && + (!hiveQuery.logical.isInstanceOf[DescribeFunctionStatement]) && (!hiveQuery.logical.isInstanceOf[DescribeCommandBase]) && - (!hiveQuery.logical.isInstanceOf[DescribeTableStatement]) && + (!hiveQuery.logical.isInstanceOf[DescribeRelation]) && (!hiveQuery.logical.isInstanceOf[DescribeColumnStatement]) && preparedHive != catalyst) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index cd8e2eaa2b4dc..ba48cfd4142f6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -31,6 +31,8 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.connector.catalog.CatalogManager +import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils} import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.HiveExternalCatalog @@ -179,8 +181,8 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA test("SPARK-22431: illegal nested type") { val queries = Seq( - "CREATE TABLE t AS SELECT STRUCT('a' AS `$a`, 1 AS b) q", - "CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT)", + "CREATE TABLE t USING hive AS SELECT STRUCT('a' AS `$a`, 1 AS b) q", + "CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING hive", "CREATE VIEW t AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") queries.foreach(query => { @@ -251,7 +253,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA test("SPARK-22431: negative alter table tests with nested types") { withTable("t1") { - spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT)") + spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT) USING hive") val err = intercept[SparkException] { spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") }.getMessage @@ -360,6 +362,46 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA Row(2147483648L, "AAA", 3.14, false) :: Row(2147483649L, "BBB", 3.142, true) :: 
Nil) } } + + test("Create Table LIKE USING Hive built-in ORC in Hive catalog") { + val catalog = spark.sessionState.catalog + withTable("s", "t") { + sql("CREATE TABLE s(a INT, b INT) USING parquet") + val source = catalog.getTableMetadata(TableIdentifier("s")) + assert(source.provider == Some("parquet")) + sql("CREATE TABLE t LIKE s USING org.apache.spark.sql.hive.orc") + val table = catalog.getTableMetadata(TableIdentifier("t")) + assert(table.provider == Some("org.apache.spark.sql.hive.orc")) + } + } + + test("Database Ownership") { + val catalog = spark.sessionState.catalog + try { + val db = "spark_29425_1" + sql(s"CREATE DATABASE $db") + assert(sql(s"DESCRIBE DATABASE EXTENDED $db") + .where("database_description_item='Owner'") + .collect().head.getString(1) === Utils.getCurrentUserName()) + sql(s"ALTER DATABASE $db SET DBPROPERTIES('abc'='xyz')") + assert(sql(s"DESCRIBE DATABASE EXTENDED $db") + .where("database_description_item='Owner'") + .collect().head.getString(1) === Utils.getCurrentUserName()) + } finally { + catalog.reset() + } + } + + test("Table Ownership") { + val catalog = spark.sessionState.catalog + try { + sql(s"CREATE TABLE spark_30019(k int)") + assert(sql(s"DESCRIBE TABLE EXTENDED spark_30019").where("col_name='Owner'") + .collect().head.getString(1) === Utils.getCurrentUserName()) + } finally { + catalog.reset() + } + } } class HiveDDLSuite @@ -417,7 +459,7 @@ class HiveDDLSuite "create the table `default`.`tab1`")) e = intercept[AnalysisException] { - sql(s"CREATE TABLE tab2 location '${tempDir.getCanonicalPath}'") + sql(s"CREATE TABLE tab2 USING hive location '${tempDir.getCanonicalPath}'") }.getMessage assert(e.contains("Unable to infer the schema. 
The schema specification is required to " + "create the table `default`.`tab2`")) @@ -1100,7 +1142,8 @@ class HiveDDLSuite sql(s"CREATE DATABASE $dbName Location '${tmpDir.toURI.getPath.stripSuffix("/")}'") val db1 = catalog.getDatabaseMetadata(dbName) val dbPath = new URI(tmpDir.toURI.toString.stripSuffix("/")) - assert(db1 == CatalogDatabase(dbName, "", dbPath, Map.empty)) + assert(db1.copy(properties = db1.properties -- Seq(PROP_OWNER)) === + CatalogDatabase(dbName, "", dbPath, Map.empty)) sql("USE db1") sql(s"CREATE TABLE $tabName as SELECT 1") @@ -1138,7 +1181,8 @@ class HiveDDLSuite val expectedDBLocation = s"file:${dbPath.toUri.getPath.stripSuffix("/")}/$dbName.db" val expectedDBUri = CatalogUtils.stringToURI(expectedDBLocation) val db1 = catalog.getDatabaseMetadata(dbName) - assert(db1 == CatalogDatabase( + assert(db1.copy(properties = db1.properties -- Seq(PROP_OWNER)) == + CatalogDatabase( dbName, "", expectedDBUri, @@ -1223,57 +1267,64 @@ class HiveDDLSuite } test("CREATE TABLE LIKE a temporary view") { - // CREATE TABLE LIKE a temporary view. - withCreateTableLikeTempView(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE a temporary view. + withCreateTableLikeTempView(location = None, provider) - // CREATE TABLE LIKE a temporary view location ... - withTempDir { tmpDir => - withCreateTableLikeTempView(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE a temporary view location ... 
+ withTempDir { tmpDir => + withCreateTableLikeTempView(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeTempView(location : Option[String]): Unit = { + private def withCreateTableLikeTempView( + location : Option[String], provider: Option[String]): Unit = { val sourceViewName = "tab1" val targetTabName = "tab2" val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED withTempView(sourceViewName) { withTable(targetTabName) { - spark.range(10).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd) + spark.range(10).select($"id" as "a", $"id" as "b", $"id" as "c", $"id" as "d") .createTempView(sourceViewName) val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceViewName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceViewName $providerClause $locationClause") val sourceTable = spark.sessionState.catalog.getTempViewOrPermanentTableMetadata( TableIdentifier(sourceViewName)) val targetTable = spark.sessionState.catalog.getTableMetadata( TableIdentifier(targetTabName, Some("default"))) - - checkCreateTableLike(sourceTable, targetTable, tableType) + checkCreateTableLike(sourceTable, targetTable, tableType, provider) } } } test("CREATE TABLE LIKE a data source table") { - // CREATE TABLE LIKE a data source table. - withCreateTableLikeDSTable(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE a data source table. + withCreateTableLikeDSTable(location = None, provider) - // CREATE TABLE LIKE a data source table location ... - withTempDir { tmpDir => - withCreateTableLikeDSTable(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE a data source table location ... 
+ withTempDir { tmpDir => + withCreateTableLikeDSTable(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeDSTable(location : Option[String]): Unit = { + private def withCreateTableLikeDSTable( + location : Option[String], provider: Option[String]): Unit = { val sourceTabName = "tab1" val targetTabName = "tab2" val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED withTable(sourceTabName, targetTabName) { - spark.range(10).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd) + spark.range(10).select($"id" as "a", $"id" as "b", $"id" as "c", $"id" as "d") .write.format("json").saveAsTable(sourceTabName) val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $providerClause $locationClause") val sourceTable = spark.sessionState.catalog.getTableMetadata( @@ -1284,34 +1335,37 @@ class HiveDDLSuite // The table type of the source table should be a Hive-managed data source table assert(DDLUtils.isDatasourceTable(sourceTable)) assert(sourceTable.tableType == CatalogTableType.MANAGED) - - checkCreateTableLike(sourceTable, targetTable, tableType) + checkCreateTableLike(sourceTable, targetTable, tableType, provider) } } test("CREATE TABLE LIKE an external data source table") { - // CREATE TABLE LIKE an external data source table. - withCreateTableLikeExtDSTable(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE an external data source table. + withCreateTableLikeExtDSTable(location = None, provider) - // CREATE TABLE LIKE an external data source table location ... 
- withTempDir { tmpDir => - withCreateTableLikeExtDSTable(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE an external data source table location ... + withTempDir { tmpDir => + withCreateTableLikeExtDSTable(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeExtDSTable(location : Option[String]): Unit = { + private def withCreateTableLikeExtDSTable( + location : Option[String], provider: Option[String]): Unit = { val sourceTabName = "tab1" val targetTabName = "tab2" val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED withTable(sourceTabName, targetTabName) { withTempPath { dir => val path = dir.getCanonicalPath - spark.range(10).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd) + spark.range(10).select($"id" as "a", $"id" as "b", $"id" as "c", $"id" as "d") .write.format("parquet").save(path) sql(s"CREATE TABLE $sourceTabName USING parquet OPTIONS (PATH '${dir.toURI}')") val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $providerClause $locationClause") // The source table should be an external data source table val sourceTable = spark.sessionState.catalog.getTableMetadata( @@ -1321,23 +1375,25 @@ class HiveDDLSuite // The table type of the source table should be an external data source table assert(DDLUtils.isDatasourceTable(sourceTable)) assert(sourceTable.tableType == CatalogTableType.EXTERNAL) - - checkCreateTableLike(sourceTable, targetTable, tableType) + checkCreateTableLike(sourceTable, targetTable, tableType, provider) } } } test("CREATE TABLE LIKE a managed Hive serde table") { - // CREATE TABLE LIKE a managed Hive serde table. 
- withCreateTableLikeManagedHiveTable(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE a managed Hive serde table. + withCreateTableLikeManagedHiveTable(location = None, provider) - // CREATE TABLE LIKE a managed Hive serde table location ... - withTempDir { tmpDir => - withCreateTableLikeManagedHiveTable(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE a managed Hive serde table location ... + withTempDir { tmpDir => + withCreateTableLikeManagedHiveTable(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeManagedHiveTable(location : Option[String]): Unit = { + private def withCreateTableLikeManagedHiveTable( + location : Option[String], provider: Option[String]): Unit = { val sourceTabName = "tab1" val targetTabName = "tab2" val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED @@ -1346,7 +1402,8 @@ class HiveDDLSuite sql(s"CREATE TABLE $sourceTabName TBLPROPERTIES('prop1'='value1') AS SELECT 1 key, 'a'") val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $providerClause $locationClause") val sourceTable = catalog.getTableMetadata( TableIdentifier(sourceTabName, Some("default"))) @@ -1354,22 +1411,24 @@ class HiveDDLSuite assert(sourceTable.properties.get("prop1").nonEmpty) val targetTable = catalog.getTableMetadata( TableIdentifier(targetTabName, Some("default"))) - - checkCreateTableLike(sourceTable, targetTable, tableType) + checkCreateTableLike(sourceTable, targetTable, tableType, provider) } } test("CREATE TABLE LIKE an external Hive serde table") { - // CREATE TABLE LIKE an external Hive serde table. 
- withCreateTableLikeExtHiveTable(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE an external Hive serde table. + withCreateTableLikeExtHiveTable(location = None, provider) - // CREATE TABLE LIKE an external Hive serde table location ... - withTempDir { tmpDir => - withCreateTableLikeExtHiveTable(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE an external Hive serde table location ... + withTempDir { tmpDir => + withCreateTableLikeExtHiveTable(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeExtHiveTable(location : Option[String]): Unit = { + private def withCreateTableLikeExtHiveTable( + location : Option[String], provider: Option[String]): Unit = { val catalog = spark.sessionState.catalog val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED withTempDir { tmpDir => @@ -1395,7 +1454,8 @@ class HiveDDLSuite } val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceTabName $providerClause $locationClause") val sourceTable = catalog.getTableMetadata( TableIdentifier(sourceTabName, Some("default"))) @@ -1403,63 +1463,67 @@ class HiveDDLSuite assert(sourceTable.comment == Option("Apache Spark")) val targetTable = catalog.getTableMetadata( TableIdentifier(targetTabName, Some("default"))) - - checkCreateTableLike(sourceTable, targetTable, tableType) + checkCreateTableLike(sourceTable, targetTable, tableType, provider) } } } test("CREATE TABLE LIKE a view") { - // CREATE TABLE LIKE a view. - withCreateTableLikeView(location = None) + Seq(None, Some("parquet"), Some("orc"), Some("hive")) foreach { provider => + // CREATE TABLE LIKE a view. 
+ withCreateTableLikeView(location = None, provider) - // CREATE TABLE LIKE a view location ... - withTempDir { tmpDir => - withCreateTableLikeView(Some(tmpDir.toURI.toString)) + // CREATE TABLE LIKE a view location ... + withTempDir { tmpDir => + withCreateTableLikeView(Some(tmpDir.toURI.toString), provider) + } } } - private def withCreateTableLikeView(location : Option[String]): Unit = { + private def withCreateTableLikeView( + location : Option[String], provider: Option[String]): Unit = { val sourceTabName = "tab1" val sourceViewName = "view" val targetTabName = "tab2" val tableType = if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED withTable(sourceTabName, targetTabName) { withView(sourceViewName) { - spark.range(10).select('id as 'a, 'id as 'b, 'id as 'c, 'id as 'd) + spark.range(10).select($"id" as "a", $"id" as "b", $"id" as "c", $"id" as "d") .write.format("json").saveAsTable(sourceTabName) sql(s"CREATE VIEW $sourceViewName AS SELECT * FROM $sourceTabName") val locationClause = if (location.nonEmpty) s"LOCATION '${location.getOrElse("")}'" else "" - sql(s"CREATE TABLE $targetTabName LIKE $sourceViewName $locationClause") + val providerClause = if (provider.nonEmpty) s"USING ${provider.get}" else "" + sql(s"CREATE TABLE $targetTabName LIKE $sourceViewName $providerClause $locationClause") val sourceView = spark.sessionState.catalog.getTableMetadata( TableIdentifier(sourceViewName, Some("default"))) // The original source should be a VIEW with an empty path assert(sourceView.tableType == CatalogTableType.VIEW) assert(sourceView.viewText.nonEmpty) - assert(sourceView.viewDefaultDatabase == Some("default")) + assert(sourceView.viewCatalogAndNamespace == + Seq(CatalogManager.SESSION_CATALOG_NAME, "default")) assert(sourceView.viewQueryColumnNames == Seq("a", "b", "c", "d")) val targetTable = spark.sessionState.catalog.getTableMetadata( TableIdentifier(targetTabName, Some("default"))) - - checkCreateTableLike(sourceView, 
targetTable, tableType) + checkCreateTableLike(sourceView, targetTable, tableType, provider) } } } private def checkCreateTableLike( - sourceTable: CatalogTable, - targetTable: CatalogTable, - tableType: CatalogTableType): Unit = { + sourceTable: CatalogTable, + targetTable: CatalogTable, + tableType: CatalogTableType, + provider: Option[String]): Unit = { // The created table should be a MANAGED table or EXTERNAL table with empty view text // and original text. assert(targetTable.tableType == tableType, s"the created table must be a/an ${tableType.name} table") assert(targetTable.viewText.isEmpty, "the view text in the created table must be empty") - assert(targetTable.viewDefaultDatabase.isEmpty, - "the view default database in the created table must be empty") + assert(targetTable.viewCatalogAndNamespace.isEmpty, + "the view catalog and namespace in the created table must be empty") assert(targetTable.viewQueryColumnNames.isEmpty, "the view query output columns in the created table must be empty") assert(targetTable.comment.isEmpty, @@ -1482,21 +1546,29 @@ class HiveDDLSuite assert(targetTable.properties.filterKeys(!metastoreGeneratedProperties.contains(_)).isEmpty, "the table properties of source tables should not be copied in the created table") - if (DDLUtils.isDatasourceTable(sourceTable) || - sourceTable.tableType == CatalogTableType.VIEW) { - assert(DDLUtils.isDatasourceTable(targetTable), - "the target table should be a data source table") - } else { - assert(!DDLUtils.isDatasourceTable(targetTable), - "the target table should be a Hive serde table") - } - - if (sourceTable.tableType == CatalogTableType.VIEW) { - // Source table is a temporary/permanent view, which does not have a provider. 
The created - // target table uses the default data source format - assert(targetTable.provider == Option(spark.sessionState.conf.defaultDataSourceName)) - } else { - assert(targetTable.provider == sourceTable.provider) + provider match { + case Some(_) => + assert(targetTable.provider == provider) + if (DDLUtils.isHiveTable(provider)) { + assert(DDLUtils.isHiveTable(targetTable), + "the target table should be a hive table if provider is hive") + } + case None => + if (sourceTable.tableType == CatalogTableType.VIEW) { + // Source table is a temporary/permanent view, which does not have a provider. + // The created target table uses the default data source format + assert(targetTable.provider == Option(spark.sessionState.conf.defaultDataSourceName)) + } else { + assert(targetTable.provider == sourceTable.provider) + } + if (DDLUtils.isDatasourceTable(sourceTable) || + sourceTable.tableType == CatalogTableType.VIEW) { + assert(DDLUtils.isDatasourceTable(targetTable), + "the target table should be a data source table") + } else { + assert(!DDLUtils.isDatasourceTable(targetTable), + "the target table should be a Hive serde table") + } } assert(targetTable.storage.locationUri.nonEmpty, "target table path should not be empty") @@ -1551,7 +1623,7 @@ class HiveDDLSuite assert(spark.catalog.getTable("default", indexTabName).name === indexTabName) intercept[TableAlreadyExistsException] { - sql(s"CREATE TABLE $indexTabName(b int)") + sql(s"CREATE TABLE $indexTabName(b int) USING hive") } intercept[TableAlreadyExistsException] { sql(s"ALTER TABLE $tabName RENAME TO $indexTabName") @@ -1749,7 +1821,7 @@ class HiveDDLSuite test("create hive serde table with Catalog") { withTable("t") { withTempDir { dir => - val df = spark.catalog.createExternalTable( + val df = spark.catalog.createTable( "t", "hive", new StructType().add("i", "int"), @@ -1828,10 +1900,10 @@ class HiveDDLSuite .write.format("hive").mode("append").saveAsTable("t") checkAnswer(spark.table("t"), Row(1, "a") :: 
Row(2, "b") :: Row(3, "c") :: Nil) - Seq("c" -> 3).toDF("i", "j") + Seq(3.5 -> 3).toDF("i", "j") .write.format("hive").mode("append").saveAsTable("t") checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Row(3, "c") - :: Row(null, "3") :: Nil) + :: Row(3, "3") :: Nil) Seq(4 -> "d").toDF("i", "j").write.saveAsTable("t1") @@ -2366,10 +2438,11 @@ class HiveDDLSuite checkAnswer(spark.table("t"), Row(1)) val maybeFile = path.listFiles().find(_.getName.startsWith("part")) - val reader = getReader(maybeFile.head.getCanonicalPath) - assert(reader.getCompressionKind.name === "ZLIB") - assert(reader.getCompressionSize == 1001) - assert(reader.getRowIndexStride == 2002) + Utils.tryWithResource(getReader(maybeFile.head.getCanonicalPath)) { reader => + assert(reader.getCompressionKind.name === "ZLIB") + assert(reader.getCompressionSize == 1001) + assert(reader.getRowIndexStride == 2002) + } } } } @@ -2407,7 +2480,7 @@ class HiveDDLSuite test("load command for non local invalid path validation") { withTable("tbl") { - sql("CREATE TABLE tbl(i INT, j STRING)") + sql("CREATE TABLE tbl(i INT, j STRING) USING hive") val e = intercept[AnalysisException]( sql("load data inpath '/doesnotexist.csv' into table tbl")) assert(e.message.contains("LOAD DATA input path does not exist")) @@ -2416,12 +2489,12 @@ class HiveDDLSuite test("SPARK-22252: FileFormatWriter should respect the input query schema in HIVE") { withTable("t1", "t2", "t3", "t4") { - spark.range(1).select('id as 'col1, 'id as 'col2).write.saveAsTable("t1") + spark.range(1).select($"id" as "col1", $"id" as "col2").write.saveAsTable("t1") spark.sql("select COL1, COL2 from t1").write.format("hive").saveAsTable("t2") checkAnswer(spark.table("t2"), Row(0, 0)) // Test picking part of the columns when writing. 
- spark.range(1).select('id, 'id as 'col1, 'id as 'col2).write.saveAsTable("t3") + spark.range(1).select($"id", $"id" as "col1", $"id" as "col2").write.saveAsTable("t3") spark.sql("select COL1, COL2 from t3").write.format("hive").saveAsTable("t4") checkAnswer(spark.table("t4"), Row(0, 0)) } @@ -2433,9 +2506,9 @@ class HiveDDLSuite "CREATE TABLE IF NOT EXISTS t1 (c1_int INT, c2_string STRING, c3_float FLOAT)") val desc = sql("DESC FORMATTED t1").filter($"col_name".startsWith("Last Access")) .select("data_type") - // check if the last access time doesnt have the default date of year + // check if the last access time doesn't have the default date of year // 1970 as its a wrong access time - assert(!(desc.first.toString.contains("1970"))) + assert((desc.first.toString.contains("UNKNOWN"))) } } @@ -2524,4 +2597,131 @@ class HiveDDLSuite } } } + + test("Create Table LIKE STORED AS Hive Format") { + val catalog = spark.sessionState.catalog + withTable("s") { + sql("CREATE TABLE s(a INT, b INT) STORED AS ORC") + hiveFormats.foreach { tableType => + val expectedSerde = HiveSerDe.sourceToSerDe(tableType) + withTable("t") { + sql(s"CREATE TABLE t LIKE s STORED AS $tableType") + val table = catalog.getTableMetadata(TableIdentifier("t")) + assert(table.provider == Some("hive")) + assert(table.storage.serde == expectedSerde.get.serde) + assert(table.storage.inputFormat == expectedSerde.get.inputFormat) + assert(table.storage.outputFormat == expectedSerde.get.outputFormat) + } + } + } + } + + test("Create Table LIKE with specified TBLPROPERTIES") { + val catalog = spark.sessionState.catalog + withTable("s", "t") { + sql("CREATE TABLE s(a INT, b INT) USING hive TBLPROPERTIES('a'='apple')") + val source = catalog.getTableMetadata(TableIdentifier("s")) + assert(source.properties("a") == "apple") + sql("CREATE TABLE t LIKE s STORED AS parquet TBLPROPERTIES('f'='foo', 'b'='bar')") + val table = catalog.getTableMetadata(TableIdentifier("t")) + assert(table.properties.get("a") === 
None) + assert(table.properties("f") == "foo") + assert(table.properties("b") == "bar") + } + } + + test("Create Table LIKE with row format") { + val catalog = spark.sessionState.catalog + withTable("sourceHiveTable", "sourceDsTable", "targetHiveTable1", "targetHiveTable2") { + sql("CREATE TABLE sourceHiveTable(a INT, b INT) STORED AS PARQUET") + sql("CREATE TABLE sourceDsTable(a INT, b INT) USING PARQUET") + + // row format doesn't work in create targetDsTable + var e = intercept[AnalysisException] { + spark.sql( + """ + |CREATE TABLE targetDsTable LIKE sourceHiveTable USING PARQUET + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + """.stripMargin) + }.getMessage + assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + + // row format doesn't work with provider hive + e = intercept[AnalysisException] { + spark.sql( + """ + |CREATE TABLE targetHiveTable LIKE sourceHiveTable USING hive + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES ('test' = 'test') + """.stripMargin) + }.getMessage + assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + + // row format doesn't work without 'STORED AS' + e = intercept[AnalysisException] { + spark.sql( + """ + |CREATE TABLE targetDsTable LIKE sourceDsTable + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES ('test' = 'test') + """.stripMargin) + }.getMessage + assert(e.contains("'ROW FORMAT' must be used with 'STORED AS'")) + + // row format works with STORED AS hive format (from hive table) + spark.sql( + """ + |CREATE TABLE targetHiveTable1 LIKE sourceHiveTable STORED AS PARQUET + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES ('test' = 'test') + """.stripMargin) + var table = catalog.getTableMetadata(TableIdentifier("targetHiveTable1")) + assert(table.provider === Some("hive")) + assert(table.storage.inputFormat === + 
Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")) + assert(table.storage.serde === Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(table.storage.properties("test") == "test") + + // row format works with STORED AS hive format (from datasource table) + spark.sql( + """ + |CREATE TABLE targetHiveTable2 LIKE sourceDsTable STORED AS PARQUET + |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' + |WITH SERDEPROPERTIES ('test' = 'test') + """.stripMargin) + table = catalog.getTableMetadata(TableIdentifier("targetHiveTable2")) + assert(table.provider === Some("hive")) + assert(table.storage.inputFormat === + Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")) + assert(table.storage.serde === Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) + assert(table.storage.properties("test") == "test") + } + } + + test("SPARK-30098: create table without provider should " + + "use default data source under non-legacy mode") { + val catalog = spark.sessionState.catalog + withSQLConf( + SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "false") { + withTable("s") { + val defaultProvider = conf.defaultDataSourceName + sql("CREATE TABLE s(a INT, b INT)") + val table = catalog.getTableMetadata(TableIdentifier("s")) + assert(table.provider === Some(defaultProvider)) + } + } + } + + test("SPARK-30098: create table without provider should " + + "use hive under legacy mode") { + val catalog = spark.sessionState.catalog + withSQLConf( + SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { + withTable("s") { + sql("CREATE TABLE s(a INT, b INT)") + val table = catalog.getTableMetadata(TableIdentifier("s")) + assert(table.provider === Some("hive")) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index 68ccee5e6623a..f9a4e2cd210e3 100644 
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -97,13 +97,14 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("explain create table command") { - checkKeywordsExist(sql("explain create table temp__b as select * from src limit 2"), + checkKeywordsExist(sql("explain create table temp__b using hive as select * from src limit 2"), "== Physical Plan ==", "InsertIntoHiveTable", "Limit", "src") - checkKeywordsExist(sql("explain extended create table temp__b as select * from src limit 2"), + checkKeywordsExist( + sql("explain extended create table temp__b using hive as select * from src limit 2"), "== Parsed Logical Plan ==", "== Analyzed Logical Plan ==", "== Optimized Logical Plan ==", @@ -133,19 +134,21 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto } test("explain output of physical plan should contain proper codegen stage ID") { - checkKeywordsExist(sql( - """ - |EXPLAIN SELECT t1.id AS a, t2.id AS b FROM - |(SELECT * FROM range(3)) t1 JOIN - |(SELECT * FROM range(10)) t2 ON t1.id == t2.id % 3 - """.stripMargin), - "== Physical Plan ==", - "*(2) Project ", - "+- *(2) BroadcastHashJoin ", - " :- BroadcastExchange ", - " : +- *(1) Range ", - " +- *(2) Range " - ) + withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { + checkKeywordsExist(sql( + """ + |EXPLAIN SELECT t1.id AS a, t2.id AS b FROM + |(SELECT * FROM range(3)) t1 JOIN + |(SELECT * FROM range(10)) t2 ON t1.id == t2.id % 3 + """.stripMargin), + "== Physical Plan ==", + "*(2) Project ", + "+- *(2) BroadcastHashJoin ", + " :- BroadcastExchange ", + " : +- *(1) Range ", + " +- *(2) Range " + ) + } } test("EXPLAIN CODEGEN command") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index 53798e0ac2727..5a8365017a5ba 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.joins.BroadcastNestedLoopJoinExec import org.apache.spark.sql.hive._ -import org.apache.spark.sql.hive.test.{HiveTestUtils, TestHive} +import org.apache.spark.sql.hive.test.{HiveTestJars, TestHive} import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils @@ -56,7 +56,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd def spark: SparkSession = sparkSession - override def beforeAll() { + override def beforeAll(): Unit = { super.beforeAll() TestHive.setCacheTables(true) // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) @@ -67,7 +67,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) } - override def afterAll() { + override def afterAll(): Unit = { try { TestHive.setCacheTables(false) TimeZone.setDefault(originalTimeZone) @@ -711,7 +711,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd } def isExplanation(result: DataFrame): Boolean = { - val explanation = result.select('plan).collect().map { case Row(plan: String) => plan } + val explanation = result.select("plan").collect().map { case Row(plan: String) => plan } explanation.head.startsWith("== Physical Plan ==") } @@ -817,7 +817,7 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd test("ADD JAR command 2") { // this is a test case from 
mapjoin_addjar.q - val testJar = HiveTestUtils.getHiveHcatalogCoreJar.toURI + val testJar = HiveTestJars.getHiveHcatalogCoreJar().toURI val testData = TestHive.getHiveFile("data/files/sample.json").toURI sql(s"ADD JAR $testJar") sql( @@ -827,9 +827,9 @@ class HiveQuerySuite extends HiveComparisonTest with SQLTestUtils with BeforeAnd sql("select * from src join t1 on src.key = t1.a") sql("DROP TABLE t1") assert(sql("list jars"). - filter(_.getString(0).contains(HiveTestUtils.getHiveHcatalogCoreJar.getName)).count() > 0) + filter(_.getString(0).contains(HiveTestJars.getHiveHcatalogCoreJar().getName)).count() > 0) assert(sql("list jar"). - filter(_.getString(0).contains(HiveTestUtils.getHiveHcatalogCoreJar.getName)).count() > 0) + filter(_.getString(0).contains(HiveTestJars.getHiveHcatalogCoreJar().getName)).count() > 0) val testJar2 = TestHive.getHiveFile("TestUDTF.jar").getCanonicalPath sql(s"ADD JAR $testJar2") assert(sql(s"list jar $testJar").count() == 1) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala index 25ff3544185af..f8ba7bf2c1a62 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeReadWriteSuite.scala @@ -65,7 +65,7 @@ class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveS hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 TIMESTAMP) STORED AS $fileFormat") hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('2019-04-11 15:50:00')") checkAnswer(spark.table("hive_serde"), Row(Timestamp.valueOf("2019-04-11 15:50:00"))) - spark.sql("INSERT INTO TABLE hive_serde values('2019-04-12 15:50:00')") + spark.sql("INSERT INTO TABLE hive_serde values(TIMESTAMP('2019-04-12 15:50:00'))") checkAnswer( spark.table("hive_serde"), Seq(Row(Timestamp.valueOf("2019-04-11 
15:50:00")), @@ -77,7 +77,7 @@ class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveS hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 DATE) STORED AS $fileFormat") hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('2019-04-11')") checkAnswer(spark.table("hive_serde"), Row(Date.valueOf("2019-04-11"))) - spark.sql("INSERT INTO TABLE hive_serde values('2019-04-12')") + spark.sql("INSERT INTO TABLE hive_serde values(TIMESTAMP('2019-04-12'))") checkAnswer( spark.table("hive_serde"), Seq(Row(Date.valueOf("2019-04-11")), Row(Date.valueOf("2019-04-12")))) @@ -119,7 +119,7 @@ class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveS hiveClient.runSqlHive(s"CREATE TABLE hive_serde (c1 BINARY) STORED AS $fileFormat") hiveClient.runSqlHive("INSERT INTO TABLE hive_serde values('1')") checkAnswer(spark.table("hive_serde"), Row("1".getBytes)) - spark.sql("INSERT INTO TABLE hive_serde values('2')") + spark.sql("INSERT INTO TABLE hive_serde values(BINARY('2'))") checkAnswer(spark.table("hive_serde"), Seq(Row("1".getBytes), Row("2".getBytes))) } } @@ -168,6 +168,8 @@ class HiveSerDeReadWriteSuite extends QueryTest with SQLTestUtils with TestHiveS checkNumericTypes(fileFormat, "DECIMAL(38, 2)", 2.1D) // Date/Time Types + // SPARK-28885 String value is not allowed to be stored as date/timestamp type with + // ANSI store assignment policy. 
checkDateTimeTypes(fileFormat) // String Types diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala index ed4304b9aa57b..9a1190af02fac 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveSerDeSuite.scala @@ -83,15 +83,18 @@ class HiveSerDeSuite extends HiveComparisonTest with PlanTest with BeforeAndAfte } test("Test the default fileformat for Hive-serde tables") { - withSQLConf("hive.default.fileformat" -> "orc") { - val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") + withSQLConf("hive.default.fileformat" -> "orc", + SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { + val (desc, exists) = extractTableDesc( + "CREATE TABLE IF NOT EXISTS fileformat_test (id int)") assert(exists) assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) assert(desc.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) } - withSQLConf("hive.default.fileformat" -> "parquet") { + withSQLConf("hive.default.fileformat" -> "parquet", + SQLConf.LEGACY_CREATE_HIVE_TABLE_BY_DEFAULT_ENABLED.key -> "true") { val (desc, exists) = extractTableDesc("CREATE TABLE IF NOT EXISTS fileformat_test (id int)") assert(exists) val input = desc.storage.inputFormat diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala index 3f9bb8de42e09..67d7ed0841abb 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveTableScanSuite.scala @@ -85,8 
+85,8 @@ class HiveTableScanSuite extends HiveComparisonTest with SQLTestUtils with TestH sql("create table spark_4959 (col1 string)") sql("""insert into table spark_4959 select "hi" from src limit 1""") table("spark_4959").select( - 'col1.as("CaseSensitiveColName"), - 'col1.as("CaseSensitiveColName2")).createOrReplaceTempView("spark_4959_2") + $"col1".as("CaseSensitiveColName"), + $"col1".as("CaseSensitiveColName2")).createOrReplaceTempView("spark_4959_2") assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi")) assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi")) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala index b0d615c1acee9..9e33a8ee4cc5c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDAFSuite.scala @@ -29,12 +29,14 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo import test.org.apache.spark.sql.MyDoubleAvg import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils -class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { +class HiveUDAFSuite extends QueryTest + with TestHiveSingleton with SQLTestUtils with AdaptiveSparkPlanHelper { import testImplicits._ protected override def beforeAll(): Unit = { @@ -63,7 +65,7 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { test("built-in Hive UDAF") { val df = sql("SELECT key % 2, hive_max(key) FROM t GROUP BY key % 2") - val aggs = 
df.queryExecution.executedPlan.collect { + val aggs = collect(df.queryExecution.executedPlan) { case agg: ObjectHashAggregateExec => agg } @@ -80,7 +82,7 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { test("customized Hive UDAF") { val df = sql("SELECT key % 2, mock(value) FROM t GROUP BY key % 2") - val aggs = df.queryExecution.executedPlan.collect { + val aggs = collect(df.queryExecution.executedPlan) { case agg: ObjectHashAggregateExec => agg } @@ -99,7 +101,7 @@ class HiveUDAFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { spark.range(100).createTempView("v") val df = sql("SELECT id % 2, mock2(id) FROM v GROUP BY id % 2") - val aggs = df.queryExecution.executedPlan.collect { + val aggs = collect(df.queryExecution.executedPlan) { case agg: ObjectHashAggregateExec => agg } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 587eab4a24810..7bca2af379934 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -32,6 +32,7 @@ import org.apache.hadoop.io.{LongWritable, Writable} import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.plans.logical.Project +import org.apache.spark.sql.execution.command.FunctionsCommand import org.apache.spark.sql.functions.max import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.internal.SQLConf @@ -148,13 +149,6 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { sql("SELECT array(max(key), max(key)) FROM src").collect().toSeq) } - test("SPARK-16228 Percentile needs explicit cast to double") { - sql("select percentile(value, cast(0.5 as double)) from values 1,2,3 T(value)") - sql("select percentile_approx(value, cast(0.5 as 
double)) from values 1.0,2.0,3.0 T(value)") - sql("select percentile(value, 0.5) from values 1,2,3 T(value)") - sql("select percentile_approx(value, 0.5) from values 1.0,2.0,3.0 T(value)") - } - test("Generic UDAF aggregates") { checkAnswer(sql( @@ -563,7 +557,8 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { checkAnswer( sql("SELECT testUDFToListInt(s) FROM inputTable"), Seq(Row(Seq(1, 2, 3)))) - assert(sql("show functions").count() == numFunc + 1) + assert(sql("show functions").count() == + numFunc + FunctionsCommand.virtualOperators.size + 1) assert(spark.catalog.listFunctions().count() == numFunc + 1) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala index 2391106cfb253..327e4104d59a8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ObjectHashAggregateSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedFunction import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, Literal} import org.apache.spark.sql.catalyst.expressions.aggregate.ApproximatePercentile +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -38,7 +39,8 @@ class ObjectHashAggregateSuite extends QueryTest with SQLTestUtils with TestHiveSingleton - with ExpressionEvalHelper { + with ExpressionEvalHelper + with AdaptiveSparkPlanHelper { import testImplicits._ @@ -156,7 +158,7 @@ class ObjectHashAggregateSuite ) checkAnswer( - df.groupBy($"id" % 4 as 
'mod).agg(aggFunctions.head, aggFunctions.tail: _*), + df.groupBy($"id" % 4 as "mod").agg(aggFunctions.head, aggFunctions.tail: _*), data.groupBy(_.getInt(0) % 4).map { case (key, value) => key -> Row.fromSeq(value.map(_.toSeq).transpose.map(_.count(_ != null): Long)) }.toSeq.map { @@ -394,19 +396,19 @@ class ObjectHashAggregateSuite } private def containsSortAggregateExec(df: DataFrame): Boolean = { - df.queryExecution.executedPlan.collectFirst { + collectFirst(df.queryExecution.executedPlan) { case _: SortAggregateExec => () }.nonEmpty } private def containsObjectHashAggregateExec(df: DataFrame): Boolean = { - df.queryExecution.executedPlan.collectFirst { + collectFirst(df.queryExecution.executedPlan) { case _: ObjectHashAggregateExec => () }.nonEmpty } private def containsHashAggregateExec(df: DataFrame): Boolean = { - df.queryExecution.executedPlan.collectFirst { + collectFirst(df.queryExecution.executedPlan) { case _: HashAggregateExec => () }.nonEmpty } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala index 6b2d0c656b371..c9c36992906a8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala @@ -65,7 +65,8 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te options = Map.empty)(sparkSession = spark) val logicalRelation = LogicalRelation(relation, tableMeta) - val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze + val query = Project(Seq(Symbol("i"), Symbol("p")), + Filter(Symbol("p") === 1, logicalRelation)).analyze val optimized = Optimize.execute(query) assert(optimized.missingInput.isEmpty) diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitionsSuite.scala new file mode 100644 index 0000000000000..e41709841a736 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneHiveTablePartitionsSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils + +class PruneHiveTablePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("PruneHiveTablePartitions", Once, + EliminateSubqueryAliases, new PruneHiveTablePartitions(spark)) :: Nil + } + + test("SPARK-15616 statistics pruned after going throuhg PruneHiveTablePartitions") { + withTable("test", "temp") { + sql( + s""" + |CREATE TABLE test(i int) + |PARTITIONED BY (p int) + |STORED AS textfile""".stripMargin) + spark.range(0, 1000, 1).selectExpr("id as col") + .createOrReplaceTempView("temp") + + for (part <- Seq(1, 2, 3, 4)) { + sql( + s""" + |INSERT OVERWRITE TABLE test PARTITION (p='$part') + |select col from temp""".stripMargin) + } + val analyzed1 = sql("select i from test where p > 0").queryExecution.analyzed + val analyzed2 = sql("select i from test where p = 1").queryExecution.analyzed + assert(Optimize.execute(analyzed1).stats.sizeInBytes / 4 === + Optimize.execute(analyzed2).stats.sizeInBytes) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala index cc592cf6ca629..985281bce3036 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruningSuite.scala @@ -141,6 +141,13 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter { Seq("2008-04-08", "11"), Seq("2008-04-09", "11"))) + createPruningTest("Partition pruning - with filter 
containing non-deterministic condition", + "SELECT value, hr FROM srcpart1 WHERE ds = '2008-04-08' AND hr < 12 AND rand() < 1", + Seq("value", "hr"), + Seq("value", "hr"), + Seq( + Seq("2008-04-08", "11"))) + def createPruningTest( testCaseName: String, sql: String, @@ -154,7 +161,7 @@ class PruningSuite extends HiveComparisonTest with BeforeAndAfter { case p @ HiveTableScanExec(columns, relation, _) => val columnNames = columns.map(_.name) val partValues = if (relation.isPartitioned) { - p.prunePartitions(p.rawPartitions).map(_.getValues) + p.prunedPartitions.map(_.getValues) } else { Seq.empty } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala index 022cb7177339d..16668f93bd4e7 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLMetricsSuite.scala @@ -19,9 +19,23 @@ package org.apache.spark.sql.hive.execution import org.apache.spark.sql.execution.metric.SQLMetricsTestUtils import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf class SQLMetricsSuite extends SQLMetricsTestUtils with TestHiveSingleton { + var originalValue: String = _ + // With AQE on/off, the metric info is different. 
+ override def beforeAll(): Unit = { + super.beforeAll() + originalValue = spark.conf.get(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key) + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + } + + override def afterAll(): Unit = { + spark.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, originalValue) + super.afterAll() + } + test("writing data out metrics: hive") { testMetricsNonDynamicPartition("hive", "t1") } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 1638f6cd91808..539b464743461 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -33,16 +33,16 @@ import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, Functio import org.apache.spark.sql.catalyst.catalog.{CatalogTableType, CatalogUtils, HiveTableRelation} import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} -import org.apache.spark.sql.execution.command.LoadDataCommand +import org.apache.spark.sql.execution.command.{FunctionsCommand, LoadDataCommand} import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} -import org.apache.spark.sql.hive.test.{HiveTestUtils, TestHiveSingleton} +import org.apache.spark.sql.hive.test.{HiveTestJars, TestHiveSingleton} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.StaticSQLConf.GLOBAL_TEMP_DATABASE import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.util.Utils case class Nested1(f1: Nested2) case class Nested2(f2: Nested3) @@ -192,6 
+192,11 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { allBuiltinFunctions.foreach { f => assert(allFunctions.contains(f)) } + + FunctionsCommand.virtualOperators.foreach { f => + assert(allFunctions.contains(f)) + } + withTempDatabase { db => def createFunction(names: Seq[String]): Unit = { names.foreach { name => @@ -771,7 +776,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { sql("CREATE TABLE test2 (key INT, value STRING)") testData.write.mode(SaveMode.Append).insertInto("test2") testData.write.mode(SaveMode.Append).insertInto("test2") - sql("CREATE TABLE test AS SELECT COUNT(a.value) FROM test1 a JOIN test2 b ON a.key = b.key") + sql("CREATE TABLE test USING hive AS " + + "SELECT COUNT(a.value) FROM test1 a JOIN test2 b ON a.key = b.key") checkAnswer( table("test"), sql("SELECT COUNT(a.value) FROM test1 a JOIN test2 b ON a.key = b.key").collect().toSeq) @@ -932,7 +938,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { read.json(ds).createOrReplaceTempView("data") withSQLConf(SQLConf.CONVERT_CTAS.key -> "false") { - sql("CREATE TABLE explodeTest (key bigInt)") + sql("CREATE TABLE explodeTest (key bigInt) USING hive") table("explodeTest").queryExecution.analyzed match { case SubqueryAlias(_, r: HiveTableRelation) => // OK case _ => @@ -1103,10 +1109,10 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("Call add jar in a different thread (SPARK-8306)") { @volatile var error: Option[Throwable] = None val thread = new Thread { - override def run() { + override def run(): Unit = { // To make sure this test works, this jar should not be loaded in another place. 
sql( - s"ADD JAR ${HiveTestUtils.getHiveContribJar.getCanonicalPath}") + s"ADD JAR ${HiveTestJars.getHiveContribJar().getCanonicalPath}") try { sql( """ @@ -1178,51 +1184,6 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { checkAnswer(sql("SELECT a.`c.b`, `b.$q`[0].`a@!.q`, `q.w`.`w.i&`[0] FROM t"), Row(1, 1, 1)) } - test("Convert hive interval term into Literal of CalendarIntervalType") { - checkAnswer(sql("select interval '0 0:0:0.1' day to second"), - Row(CalendarInterval.fromString("interval 100 milliseconds"))) - checkAnswer(sql("select interval '10-9' year to month"), - Row(CalendarInterval.fromString("interval 10 years 9 months"))) - checkAnswer(sql("select interval '20 15:40:32.99899999' day to hour"), - Row(CalendarInterval.fromString("interval 2 weeks 6 days 15 hours"))) - checkAnswer(sql("select interval '20 15:40:32.99899999' day to minute"), - Row(CalendarInterval.fromString("interval 2 weeks 6 days 15 hours 40 minutes"))) - checkAnswer(sql("select interval '20 15:40:32.99899999' day to second"), - Row(CalendarInterval.fromString("interval 2 weeks 6 days 15 hours 40 minutes " + - "32 seconds 998 milliseconds 999 microseconds"))) - checkAnswer(sql("select interval '15:40:32.99899999' hour to minute"), - Row(CalendarInterval.fromString("interval 15 hours 40 minutes"))) - checkAnswer(sql("select interval '15:40.99899999' hour to second"), - Row(CalendarInterval.fromString("interval 15 minutes 40 seconds 998 milliseconds " + - "999 microseconds"))) - checkAnswer(sql("select interval '15:40' hour to second"), - Row(CalendarInterval.fromString("interval 15 hours 40 minutes"))) - checkAnswer(sql("select interval '15:40:32.99899999' hour to second"), - Row(CalendarInterval.fromString("interval 15 hours 40 minutes 32 seconds 998 milliseconds " + - "999 microseconds"))) - checkAnswer(sql("select interval '20 40:32.99899999' minute to second"), - Row(CalendarInterval.fromString("interval 2 weeks 6 days 40 minutes 32 seconds " + - 
"998 milliseconds 999 microseconds"))) - checkAnswer(sql("select interval '40:32.99899999' minute to second"), - Row(CalendarInterval.fromString("interval 40 minutes 32 seconds 998 milliseconds " + - "999 microseconds"))) - checkAnswer(sql("select interval '40:32' minute to second"), - Row(CalendarInterval.fromString("interval 40 minutes 32 seconds"))) - checkAnswer(sql("select interval '30' year"), - Row(CalendarInterval.fromString("interval 30 years"))) - checkAnswer(sql("select interval '25' month"), - Row(CalendarInterval.fromString("interval 25 months"))) - checkAnswer(sql("select interval '-100' day"), - Row(CalendarInterval.fromString("interval -14 weeks -2 days"))) - checkAnswer(sql("select interval '40' hour"), - Row(CalendarInterval.fromString("interval 1 days 16 hours"))) - checkAnswer(sql("select interval '80' minute"), - Row(CalendarInterval.fromString("interval 1 hour 20 minutes"))) - checkAnswer(sql("select interval '299.889987299' second"), - Row(CalendarInterval.fromString( - "interval 4 minutes 59 seconds 889 milliseconds 987 microseconds"))) - } - test("specifying database name for a temporary view is not allowed") { withTempPath { dir => withTempView("db.t") { @@ -1931,7 +1892,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } withTable("load_t") { - sql("CREATE TABLE load_t (a STRING)") + sql("CREATE TABLE load_t (a STRING) USING hive") sql(s"LOAD DATA LOCAL INPATH '$path/*part-r*' INTO TABLE load_t") checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"), Row("2"), Row("3"))) @@ -1951,7 +1912,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Files.write(s"$i", new File(dirPath, s"part-r-0000 $i"), StandardCharsets.UTF_8) } withTable("load_t") { - sql("CREATE TABLE load_t (a STRING)") + sql("CREATE TABLE load_t (a STRING) USING hive") sql(s"LOAD DATA LOCAL INPATH '$path/part-r-0000 1' INTO TABLE load_t") checkAnswer(sql("SELECT * FROM load_t"), Seq(Row("1"))) } @@ -1966,7 
+1927,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Files.write(s"$i", new File(dirPath, s"part-r-0000$i"), StandardCharsets.UTF_8) } withTable("load_t_folder_wildcard") { - sql("CREATE TABLE load_t (a STRING)") + sql("CREATE TABLE load_t (a STRING) USING hive") sql(s"LOAD DATA LOCAL INPATH '${ path.substring(0, path.length - 1) .concat("*") @@ -1990,7 +1951,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Files.write(s"$i", new File(dirPath, s"part-r-0000$i"), StandardCharsets.UTF_8) } withTable("load_t1") { - sql("CREATE TABLE load_t1 (a STRING)") + sql("CREATE TABLE load_t1 (a STRING) USING hive") sql(s"LOAD DATA LOCAL INPATH '$path/part-r-0000?' INTO TABLE load_t1") checkAnswer(sql("SELECT * FROM load_t1"), Seq(Row("1"), Row("2"), Row("3"))) } @@ -2005,13 +1966,33 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { Files.write(s"$i", new File(dirPath, s"part-r-0000$i"), StandardCharsets.UTF_8) } withTable("load_t2") { - sql("CREATE TABLE load_t2 (a STRING)") + sql("CREATE TABLE load_t2 (a STRING) USING hive") sql(s"LOAD DATA LOCAL INPATH '$path/?art-r-00001' INTO TABLE load_t2") checkAnswer(sql("SELECT * FROM load_t2"), Seq(Row("1"))) } } } + test("SPARK-28084 check for case insensitive property of partition column name in load command") { + withTempDir { dir => + val path = dir.toURI.toString.stripSuffix("/") + val dirPath = dir.getAbsoluteFile + Files.append("1", new File(dirPath, "part-r-000011"), StandardCharsets.UTF_8) + withTable("part_table") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + sql( + """ + |CREATE TABLE part_table (c STRING) + |PARTITIONED BY (d STRING) + """.stripMargin) + sql(s"LOAD DATA LOCAL INPATH '$path/part-r-000011' " + + "INTO TABLE part_table PARTITION(D ='1')") + checkAnswer(sql("SELECT * FROM part_table"), Seq(Row("1", "1"))) + } + } + } + } + test("SPARK-25738: defaultFs can have a port") { val defaultURI = new 
URI("hdfs://fizz.buzz.com:8020") val r = LoadDataCommand.makeQualified(defaultURI, new Path("/foo/bar"), new Path("/flim/flam")) @@ -2111,7 +2092,8 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { withTable("t") { df.createTempView("tempView") val e = intercept[AnalysisException] { - sql("CREATE TABLE t AS SELECT key, get_json_object(jstring, '$.f1') FROM tempView") + sql("CREATE TABLE t USING hive AS " + + "SELECT key, get_json_object(jstring, '$.f1') FROM tempView") }.getMessage assert(e.contains(expectedMsg)) } @@ -2396,7 +2378,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { }) spark .range(5) - .select(badUDF('id).as("a")) + .select(badUDF($"id").as("a")) .createOrReplaceTempView("test") val scriptFilePath = getTestResourcePath("data") val e = intercept[SparkException] { @@ -2412,4 +2394,149 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + test("SPARK-29295: insert overwrite external partition should not have old data") { + Seq("true", "false").foreach { convertParquet => + withTable("test") { + withTempDir { f => + sql("CREATE EXTERNAL TABLE test(id int) PARTITIONED BY (name string) STORED AS " + + s"PARQUET LOCATION '${f.getAbsolutePath}'") + + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> convertParquet) { + sql("INSERT OVERWRITE TABLE test PARTITION(name='n1') SELECT 1") + sql("ALTER TABLE test DROP PARTITION(name='n1')") + sql("INSERT OVERWRITE TABLE test PARTITION(name='n1') SELECT 2") + checkAnswer(sql("SELECT id FROM test WHERE name = 'n1' ORDER BY id"), + Array(Row(2))) + } + } + } + } + } + + test("SPARK-29295: dynamic insert overwrite external partition should not have old data") { + Seq("true", "false").foreach { convertParquet => + withTable("test") { + withTempDir { f => + sql("CREATE EXTERNAL TABLE test(id int) PARTITIONED BY (p1 string, p2 string) " + + s"STORED AS PARQUET LOCATION '${f.getAbsolutePath}'") + + 
withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> convertParquet, + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + sql( + """ + |INSERT OVERWRITE TABLE test PARTITION(p1='n1', p2) + |SELECT * FROM VALUES (1, 'n2'), (2, 'n3') AS t(id, p2) + """.stripMargin) + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n2' ORDER BY id"), + Array(Row(1))) + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n3' ORDER BY id"), + Array(Row(2))) + + sql("INSERT OVERWRITE TABLE test PARTITION(p1='n1', p2) SELECT 4, 'n4'") + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n4' ORDER BY id"), + Array(Row(4))) + + sql("ALTER TABLE test DROP PARTITION(p1='n1',p2='n2')") + sql("ALTER TABLE test DROP PARTITION(p1='n1',p2='n3')") + + sql( + """ + |INSERT OVERWRITE TABLE test PARTITION(p1='n1', p2) + |SELECT * FROM VALUES (5, 'n2'), (6, 'n3') AS t(id, p2) + """.stripMargin) + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n2' ORDER BY id"), + Array(Row(5))) + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n3' ORDER BY id"), + Array(Row(6))) + // Partition not overwritten should not be deleted. + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = 'n4' ORDER BY id"), + Array(Row(4))) + } + } + } + + withTable("test") { + withTempDir { f => + sql("CREATE EXTERNAL TABLE test(id int) PARTITIONED BY (p1 string, p2 string) " + + s"STORED AS PARQUET LOCATION '${f.getAbsolutePath}'") + + withSQLConf(HiveUtils.CONVERT_METASTORE_PARQUET.key -> convertParquet, + "hive.exec.dynamic.partition.mode" -> "nonstrict") { + // We should unescape partition value. 
+ sql("INSERT OVERWRITE TABLE test PARTITION(p1='n1', p2) SELECT 1, '/'") + sql("ALTER TABLE test DROP PARTITION(p1='n1',p2='/')") + sql("INSERT OVERWRITE TABLE test PARTITION(p1='n1', p2) SELECT 2, '/'") + checkAnswer(sql("SELECT id FROM test WHERE p1 = 'n1' and p2 = '/' ORDER BY id"), + Array(Row(2))) + } + } + } + } + } + + test("partition pruning should handle date correctly") { + withSQLConf(SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> "2") { + withTable("t") { + sql("CREATE TABLE t (i INT) PARTITIONED BY (j DATE)") + sql("INSERT INTO t PARTITION(j='1990-11-11') SELECT 1") + checkAnswer(sql("SELECT i, CAST(j AS STRING) FROM t"), Row(1, "1990-11-11")) + checkAnswer( + sql( + """ + |SELECT i, CAST(j AS STRING) + |FROM t + |WHERE j IN (DATE'1990-11-10', DATE'1990-11-11', DATE'1990-11-12') + |""".stripMargin), + Row(1, "1990-11-11")) + } + } + } + + test("SPARK-26560 Spark should be able to run Hive UDF using jar regardless of " + + "current thread context classloader") { + // force to use Spark classloader as other test (even in other test suites) may change the + // current thread's context classloader to jar classloader + Utils.withContextClassLoader(Utils.getSparkClassLoader) { + withUserDefinedFunction("udtf_count3" -> false) { + val sparkClassLoader = Thread.currentThread().getContextClassLoader + + // This jar file should not be placed to the classpath; GenericUDTFCount3 is slightly + // modified version of GenericUDTFCount2 in hive/contrib, which emits the count for + // three times. 
+ val jarPath = "src/test/noclasspath/TestUDTF-spark-26560.jar" + val jarURL = s"file://${System.getProperty("user.dir")}/$jarPath" + + sql( + s""" + |CREATE FUNCTION udtf_count3 + |AS 'org.apache.hadoop.hive.contrib.udtf.example.GenericUDTFCount3' + |USING JAR '$jarURL' + """.stripMargin) + + assert(Thread.currentThread().getContextClassLoader eq sparkClassLoader) + + // JAR will be loaded at first usage, and it will change the current thread's + // context classloader to jar classloader in sharedState. + // See SessionState.addJar for details. + checkAnswer( + sql("SELECT udtf_count3(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), + Row(3) :: Row(3) :: Row(3) :: Nil) + + assert(Thread.currentThread().getContextClassLoader ne sparkClassLoader) + assert(Thread.currentThread().getContextClassLoader eq + spark.sqlContext.sharedState.jarClassLoader) + + // Roll back to the original classloader and run query again. Without this line, the test + // would pass, as thread's context classloader is changed to jar classloader. But thread + // context classloader can be changed from others as well which would fail the query; one + // example is spark-shell, which thread context classloader rolls back automatically. This + // mimics the behavior of spark-shell. 
+ Thread.currentThread().setContextClassLoader(sparkClassLoader) + checkAnswer( + sql("SELECT udtf_count3(a) FROM (SELECT 1 AS a FROM src LIMIT 3) t"), + Row(3) :: Row(3) :: Row(3) :: Nil) + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala index ed3b376f6eda1..7d01fc53a4099 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/ScriptTransformationSuite.scala @@ -17,21 +17,27 @@ package org.apache.spark.sql.hive.execution +import java.sql.Timestamp + import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe +import org.scalatest.Assertions._ import org.scalatest.BeforeAndAfterEach import org.scalatest.exceptions.TestFailedException import org.apache.spark.{SparkException, TaskContext, TestUtils} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{SparkPlan, SparkPlanTest, UnaryExecNode} +import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types.StringType -class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton with - BeforeAndAfterEach { +class ScriptTransformationSuite extends SparkPlanTest with SQLTestUtils with TestHiveSingleton + with BeforeAndAfterEach { import spark.implicits._ private val noSerdeIOSchema = HiveScriptIOSchema( @@ -185,6 +191,43 @@ class ScriptTransformationSuite extends SparkPlanTest with TestHiveSingleton wit rowsDf.select("name").collect()) 
assert(uncaughtExceptionHandler.exception.isEmpty) } + + test("SPARK-25990: TRANSFORM should handle different data types correctly") { + assume(TestUtils.testCommandAvailable("python")) + val scriptFilePath = getTestResourcePath("test_script.py") + + withTempView("v") { + val df = Seq( + (1, "1", 1.0, BigDecimal(1.0), new Timestamp(1)), + (2, "2", 2.0, BigDecimal(2.0), new Timestamp(2)), + (3, "3", 3.0, BigDecimal(3.0), new Timestamp(3)) + ).toDF("a", "b", "c", "d", "e") // Note column d's data type is Decimal(38, 18) + df.createTempView("v") + + val query = sql( + s""" + |SELECT + |TRANSFORM(a, b, c, d, e) + |USING 'python $scriptFilePath' AS (a, b, c, d, e) + |FROM v + """.stripMargin) + + // In Hive1.2, it does not do well on Decimal conversion. For example, in this case, + // it converts a decimal value's type from Decimal(38, 18) to Decimal(1, 0). So we need + // do extra cast here for Hive1.2. But in Hive2.3, it still keeps the original Decimal type. + val decimalToString: Column => Column = if (HiveUtils.isHive23) { + c => c.cast("string") + } else { + c => c.cast("decimal(1, 0)").cast("string") + } + checkAnswer(query, identity, df.select( + 'a.cast("string"), + 'b.cast("string"), + 'c.cast("string"), + decimalToString('d), + 'e.cast("string")).collect()) + } + } } private case class ExceptionInjectingOperator(child: SparkPlan) extends UnaryExecNode { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala new file mode 100644 index 0000000000000..e6856a58b0ea9 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/UDAQuerySuite.scala @@ -0,0 +1,417 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.execution + +import java.lang.{Double => jlDouble, Integer => jlInt, Long => jlLong} + +import scala.collection.JavaConverters._ +import scala.util.Random + +import test.org.apache.spark.sql.MyDoubleAvg +import test.org.apache.spark.sql.MyDoubleSum + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.expressions.{Aggregator} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types._ + +class MyDoubleAvgAggBase extends Aggregator[jlDouble, (Double, Long), jlDouble] { + def zero: (Double, Long) = (0.0, 0L) + def reduce(b: (Double, Long), a: jlDouble): (Double, Long) = { + if (a != null) (b._1 + a, b._2 + 1L) else b + } + def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = + (b1._1 + b2._1, b1._2 + b2._2) + def finish(r: (Double, Long)): jlDouble = + if (r._2 > 0L) 100.0 + (r._1 / r._2.toDouble) else null + def bufferEncoder: Encoder[(Double, Long)] = + Encoders.tuple(Encoders.scalaDouble, Encoders.scalaLong) + def 
outputEncoder: Encoder[jlDouble] = Encoders.DOUBLE +} + +object MyDoubleAvgAgg extends MyDoubleAvgAggBase +object MyDoubleSumAgg extends MyDoubleAvgAggBase { + override def finish(r: (Double, Long)): jlDouble = if (r._2 > 0L) r._1 else null +} + +object LongProductSumAgg extends Aggregator[(jlLong, jlLong), Long, jlLong] { + def zero: Long = 0L + def reduce(b: Long, a: (jlLong, jlLong)): Long = { + if ((a._1 != null) && (a._2 != null)) b + (a._1 * a._2) else b + } + def merge(b1: Long, b2: Long): Long = b1 + b2 + def finish(r: Long): jlLong = r + def bufferEncoder: Encoder[Long] = Encoders.scalaLong + def outputEncoder: Encoder[jlLong] = Encoders.LONG +} + +@SQLUserDefinedType(udt = classOf[CountSerDeUDT]) +case class CountSerDeSQL(nSer: Int, nDeSer: Int, sum: Int) + +class CountSerDeUDT extends UserDefinedType[CountSerDeSQL] { + def userClass: Class[CountSerDeSQL] = classOf[CountSerDeSQL] + + override def typeName: String = "count-ser-de" + + private[spark] override def asNullable: CountSerDeUDT = this + + def sqlType: DataType = StructType( + StructField("nSer", IntegerType, false) :: + StructField("nDeSer", IntegerType, false) :: + StructField("sum", IntegerType, false) :: + Nil) + + def serialize(sql: CountSerDeSQL): Any = { + val row = new GenericInternalRow(3) + row.setInt(0, 1 + sql.nSer) + row.setInt(1, sql.nDeSer) + row.setInt(2, sql.sum) + row + } + + def deserialize(any: Any): CountSerDeSQL = any match { + case row: InternalRow if (row.numFields == 3) => + CountSerDeSQL(row.getInt(0), 1 + row.getInt(1), row.getInt(2)) + case u => throw new Exception(s"failed to deserialize: $u") + } + + override def equals(obj: Any): Boolean = { + obj match { + case _: CountSerDeUDT => true + case _ => false + } + } + + override def hashCode(): Int = classOf[CountSerDeUDT].getName.hashCode() +} + +case object CountSerDeUDT extends CountSerDeUDT + +object CountSerDeAgg extends Aggregator[Int, CountSerDeSQL, CountSerDeSQL] { + def zero: CountSerDeSQL = CountSerDeSQL(0, 0, 
0) + def reduce(b: CountSerDeSQL, a: Int): CountSerDeSQL = b.copy(sum = b.sum + a) + def merge(b1: CountSerDeSQL, b2: CountSerDeSQL): CountSerDeSQL = + CountSerDeSQL(b1.nSer + b2.nSer, b1.nDeSer + b2.nDeSer, b1.sum + b2.sum) + def finish(r: CountSerDeSQL): CountSerDeSQL = r + def bufferEncoder: Encoder[CountSerDeSQL] = ExpressionEncoder[CountSerDeSQL]() + def outputEncoder: Encoder[CountSerDeSQL] = ExpressionEncoder[CountSerDeSQL]() +} + +abstract class UDAQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + val data1 = Seq[(Integer, Integer)]( + (1, 10), + (null, -60), + (1, 20), + (1, 30), + (2, 0), + (null, -10), + (2, -1), + (2, null), + (2, null), + (null, 100), + (3, null), + (null, null), + (3, null)).toDF("key", "value") + data1.write.saveAsTable("agg1") + + val data2 = Seq[(Integer, Integer, Integer)]( + (1, 10, -10), + (null, -60, 60), + (1, 30, -30), + (1, 30, 30), + (2, 1, 1), + (null, -10, 10), + (2, -1, null), + (2, 1, 1), + (2, null, 1), + (null, 100, -10), + (3, null, 3), + (null, null, null), + (3, null, null)).toDF("key", "value1", "value2") + data2.write.saveAsTable("agg2") + + val data3 = Seq[(Seq[Integer], Integer, Integer)]( + (Seq[Integer](1, 1), 10, -10), + (Seq[Integer](null), -60, 60), + (Seq[Integer](1, 1), 30, -30), + (Seq[Integer](1), 30, 30), + (Seq[Integer](2), 1, 1), + (null, -10, 10), + (Seq[Integer](2, 3), -1, null), + (Seq[Integer](2, 3), 1, 1), + (Seq[Integer](2, 3, 4), null, 1), + (Seq[Integer](null), 100, -10), + (Seq[Integer](3), null, 3), + (null, null, null), + (Seq[Integer](3), null, null)).toDF("key", "value1", "value2") + data3.write.saveAsTable("agg3") + + val data4 = Seq[Boolean](true, false, true).toDF("boolvalues") + data4.write.saveAsTable("agg4") + + val emptyDF = spark.createDataFrame( + sparkContext.emptyRDD[Row], + StructType(StructField("key", StringType) :: StructField("value", IntegerType) :: Nil)) + 
emptyDF.createOrReplaceTempView("emptyTable") + + // Register UDAs + spark.udf.register("mydoublesum", udaf(MyDoubleSumAgg)) + spark.udf.register("mydoubleavg", udaf(MyDoubleAvgAgg)) + spark.udf.register("longProductSum", udaf(LongProductSumAgg)) + } + + override def afterAll(): Unit = { + try { + spark.sql("DROP TABLE IF EXISTS agg1") + spark.sql("DROP TABLE IF EXISTS agg2") + spark.sql("DROP TABLE IF EXISTS agg3") + spark.sql("DROP TABLE IF EXISTS agg4") + spark.catalog.dropTempView("emptyTable") + } finally { + super.afterAll() + } + } + + test("aggregators") { + checkAnswer( + spark.sql( + """ + |SELECT + | key, + | mydoublesum(value + 1.5 * key), + | mydoubleavg(value), + | avg(value - key), + | mydoublesum(value - 1.5 * key), + | avg(value) + |FROM agg1 + |GROUP BY key + """.stripMargin), + Row(1, 64.5, 120.0, 19.0, 55.5, 20.0) :: + Row(2, 5.0, 99.5, -2.5, -7.0, -0.5) :: + Row(3, null, null, null, null, null) :: + Row(null, null, 110.0, null, null, 10.0) :: Nil) + } + + test("non-deterministic children expressions of aggregator") { + val e = intercept[AnalysisException] { + spark.sql( + """ + |SELECT mydoublesum(value + 1.5 * key + rand()) + |FROM agg1 + |GROUP BY key + """.stripMargin) + }.getMessage + assert(Seq("nondeterministic expression", + "should not appear in the arguments of an aggregate function").forall(e.contains)) + } + + test("interpreted aggregate function") { + checkAnswer( + spark.sql( + """ + |SELECT mydoublesum(value), key + |FROM agg1 + |GROUP BY key + """.stripMargin), + Row(60.0, 1) :: Row(-1.0, 2) :: Row(null, 3) :: Row(30.0, null) :: Nil) + + checkAnswer( + spark.sql( + """ + |SELECT mydoublesum(value) FROM agg1 + """.stripMargin), + Row(89.0) :: Nil) + + checkAnswer( + spark.sql( + """ + |SELECT mydoublesum(null) + """.stripMargin), + Row(null) :: Nil) + } + + test("interpreted and expression-based aggregation functions") { + checkAnswer( + spark.sql( + """ + |SELECT mydoublesum(value), key, avg(value) + |FROM agg1 + |GROUP BY key + 
""".stripMargin), + Row(60.0, 1, 20.0) :: + Row(-1.0, 2, -0.5) :: + Row(null, 3, null) :: + Row(30.0, null, 10.0) :: Nil) + + checkAnswer( + spark.sql( + """ + |SELECT + | mydoublesum(value + 1.5 * key), + | avg(value - key), + | key, + | mydoublesum(value - 1.5 * key), + | avg(value) + |FROM agg1 + |GROUP BY key + """.stripMargin), + Row(64.5, 19.0, 1, 55.5, 20.0) :: + Row(5.0, -2.5, 2, -7.0, -0.5) :: + Row(null, null, 3, null, null) :: + Row(null, null, null, null, 10.0) :: Nil) + } + + test("single distinct column set") { + checkAnswer( + spark.sql( + """ + |SELECT + | mydoubleavg(distinct value1), + | avg(value1), + | avg(value2), + | key, + | mydoubleavg(value1 - 1), + | mydoubleavg(distinct value1) * 0.1, + | avg(value1 + value2) + |FROM agg2 + |GROUP BY key + """.stripMargin), + Row(120.0, 70.0/3.0, -10.0/3.0, 1, 67.0/3.0 + 100.0, 12.0, 20.0) :: + Row(100.0, 1.0/3.0, 1.0, 2, -2.0/3.0 + 100.0, 10.0, 2.0) :: + Row(null, null, 3.0, 3, null, null, null) :: + Row(110.0, 10.0, 20.0, null, 109.0, 11.0, 30.0) :: Nil) + + checkAnswer( + spark.sql( + """ + |SELECT + | key, + | mydoubleavg(distinct value1), + | mydoublesum(value2), + | mydoublesum(distinct value1), + | mydoubleavg(distinct value1), + | mydoubleavg(value1) + |FROM agg2 + |GROUP BY key + """.stripMargin), + Row(1, 120.0, -10.0, 40.0, 120.0, 70.0/3.0 + 100.0) :: + Row(2, 100.0, 3.0, 0.0, 100.0, 1.0/3.0 + 100.0) :: + Row(3, null, 3.0, null, null, null) :: + Row(null, 110.0, 60.0, 30.0, 110.0, 110.0) :: Nil) + } + + test("multiple distinct multiple columns sets") { + checkAnswer( + spark.sql( + """ + |SELECT + | key, + | count(distinct value1), + | sum(distinct value1), + | count(distinct value2), + | sum(distinct value2), + | count(distinct value1, value2), + | longProductSum(distinct value1, value2), + | count(value1), + | sum(value1), + | count(value2), + | sum(value2), + | longProductSum(value1, value2), + | count(*), + | count(1) + |FROM agg2 + |GROUP BY key + """.stripMargin), + Row(null, 3, 30, 3, 
60, 3, -4700, 3, 30, 3, 60, -4700, 4, 4) :: + Row(1, 2, 40, 3, -10, 3, -100, 3, 70, 3, -10, -100, 3, 3) :: + Row(2, 2, 0, 1, 1, 1, 1, 3, 1, 3, 3, 2, 4, 4) :: + Row(3, 0, null, 1, 3, 0, 0, 0, null, 1, 3, 0, 2, 2) :: Nil) + } + + test("verify aggregator ser/de behavior") { + val data = sparkContext.parallelize((1 to 100).toSeq, 3).toDF("value1") + val agg = udaf(CountSerDeAgg) + checkAnswer( + data.agg(agg($"value1")), + Row(CountSerDeSQL(4, 4, 5050)) :: Nil) + } + + test("verify type casting failure") { + assertThrows[org.apache.spark.sql.AnalysisException] { + spark.sql( + """ + |SELECT mydoublesum(boolvalues) FROM agg4 + """.stripMargin) + } + } +} + +class HashUDAQuerySuite extends UDAQuerySuite + +class HashUDAQueryWithControlledFallbackSuite extends UDAQuerySuite { + + override protected def checkAnswer(actual: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + super.checkAnswer(actual, expectedAnswer) + Seq("true", "false").foreach { enableTwoLevelMaps => + withSQLConf("spark.sql.codegen.aggregate.map.twolevel.enabled" -> + enableTwoLevelMaps) { + (1 to 3).foreach { fallbackStartsAt => + withSQLConf("spark.sql.TungstenAggregate.testFallbackStartsAt" -> + s"${(fallbackStartsAt - 1).toString}, ${fallbackStartsAt.toString}") { + QueryTest.getErrorMessageInCheckAnswer(actual, expectedAnswer) match { + case Some(errorMessage) => + val newErrorMessage = + s""" + |The following aggregation query failed when using HashAggregate with + |controlled fallback (it falls back to bytes to bytes map once it has processed + |${fallbackStartsAt - 1} input rows and to sort-based aggregation once it has + |processed $fallbackStartsAt input rows). The query is ${actual.queryExecution} + | + |$errorMessage + """.stripMargin + + fail(newErrorMessage) + case None => // Success + } + } + } + } + } + } + + // Override it to make sure we call the actually overridden checkAnswer. 
+ override protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(df, Seq(expectedAnswer)) + } + + // Override it to make sure we call the actually overridden checkAnswer. + override protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { + checkAnswer(df, expectedAnswer.collect()) + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala index 3f9485dd018b1..15712a18ce751 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala @@ -41,7 +41,7 @@ class WindowQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleto | p_size INT, | p_container STRING, | p_retailprice DOUBLE, - | p_comment STRING) + | p_comment STRING) USING hive """.stripMargin) val testData1 = TestHive.getHiveFile("data/files/part_tiny.txt").toURI sql( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala index b5e50915c7c89..5fc41067f661d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala @@ -124,154 +124,154 @@ class HiveOrcFilterSuite extends OrcTest with TestHiveSingleton { test("filter pushdown - integer") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - 
checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - long") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, 
PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - float") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 
1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - double") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - 
checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - string") { withOrcDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df 
=> - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === "1", PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < "2", PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= "4", PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal("1") === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal("1") <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal("2") > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal("3") < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("1") >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal("4") <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === "1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < "2", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= "4", PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal("1") === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal("1") <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal("2") > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal("3") < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("1") >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("4") <= $"_1", 
PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - boolean") { withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === true, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < true, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= false, PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(false) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(false) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(false) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(true) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(true) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === true, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < true, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= false, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(false) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(false) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(false) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(true) < $"_1", 
PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } test("filter pushdown - decimal") { withOrcDataFrame((1 to 4).map(i => Tuple1.apply(BigDecimal.valueOf(i)))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + checkFilterPredicate($"_1".isNull, PredicateLeaf.Operator.IS_NULL) - checkFilterPredicate('_1 === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate($"_1" === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate('_1 < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) === '_1, PredicateLeaf.Operator.EQUALS) + Literal(BigDecimal.valueOf(1)) === $"_1", PredicateLeaf.Operator.EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + Literal(BigDecimal.valueOf(1)) <=> $"_1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(2)) > '_1, 
PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(2)) > $"_1", PredicateLeaf.Operator.LESS_THAN) checkFilterPredicate( - Literal(BigDecimal.valueOf(3)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(3)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(1)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + Literal(BigDecimal.valueOf(1)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) checkFilterPredicate( - Literal(BigDecimal.valueOf(4)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + Literal(BigDecimal.valueOf(4)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -282,22 +282,23 @@ class HiveOrcFilterSuite extends OrcTest with TestHiveSingleton { new Timestamp(milliseconds) } withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => - checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) - - checkFilterPredicate('_1 === timestamps(0), PredicateLeaf.Operator.EQUALS) - checkFilterPredicate('_1 <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) - - checkFilterPredicate('_1 < timestamps(1), PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate('_1 > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate('_1 >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) - - checkFilterPredicate(Literal(timestamps(0)) === '_1, PredicateLeaf.Operator.EQUALS) - checkFilterPredicate(Literal(timestamps(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) - checkFilterPredicate(Literal(timestamps(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) - checkFilterPredicate(Literal(timestamps(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) - checkFilterPredicate(Literal(timestamps(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1".isNull, 
PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate($"_1" === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate($"_1" <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate($"_1" < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate($"_1" > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate($"_1" >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === $"_1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(timestamps(0)) <=> $"_1", + PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > $"_1", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(timestamps(2)) < $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(0)) >= $"_1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= $"_1", PredicateLeaf.Operator.LESS_THAN) } } @@ -309,30 +310,30 @@ class HiveOrcFilterSuite extends OrcTest with TestHiveSingleton { // to produce string expression and then compare it to given string expression below. // This might have to be changed after Hive version is upgraded. 
checkFilterPredicateWithDiffHiveVersion( - '_1.isNotNull, + $"_1".isNotNull, """leaf-0 = (IS_NULL _1) |expr = (not leaf-0)""".stripMargin.trim ) checkFilterPredicateWithDiffHiveVersion( - '_1 =!= 1, + $"_1" =!= 1, """leaf-0 = (IS_NULL _1) |leaf-1 = (EQUALS _1 1) |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim ) checkFilterPredicateWithDiffHiveVersion( - !('_1 < 4), + !($"_1" < 4), """leaf-0 = (IS_NULL _1) |leaf-1 = (LESS_THAN _1 4) |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim ) checkFilterPredicateWithDiffHiveVersion( - '_1 < 2 || '_1 > 3, + $"_1" < 2 || $"_1" > 3, """leaf-0 = (LESS_THAN _1 2) |leaf-1 = (LESS_THAN_EQUALS _1 3) |expr = (or leaf-0 (not leaf-1))""".stripMargin.trim ) checkFilterPredicateWithDiffHiveVersion( - '_1 < 2 && '_1 > 3, + $"_1" < 2 && $"_1" > 3, """leaf-0 = (IS_NULL _1) |leaf-1 = (LESS_THAN _1 2) |leaf-2 = (LESS_THAN_EQUALS _1 3) @@ -347,22 +348,22 @@ class HiveOrcFilterSuite extends OrcTest with TestHiveSingleton { } // ArrayType withOrcDataFrame((1 to 4).map(i => Tuple1(Array(i)))) { implicit df => - checkNoFilterPredicate('_1.isNull) + checkNoFilterPredicate($"_1".isNull) } // BinaryType withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => - checkNoFilterPredicate('_1 <=> 1.b) + checkNoFilterPredicate($"_1" <=> 1.b) } // DateType if (!HiveUtils.isHive23) { val stringDate = "2015-01-01" withOrcDataFrame(Seq(Tuple1(Date.valueOf(stringDate)))) { implicit df => - checkNoFilterPredicate('_1 === Date.valueOf(stringDate)) + checkNoFilterPredicate($"_1" === Date.valueOf(stringDate)) } } // MapType withOrcDataFrame((1 to 4).map(i => Tuple1(Map(i -> i)))) { implicit df => - checkNoFilterPredicate('_1.isNotNull) + checkNoFilterPredicate($"_1".isNotNull) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala index 00333397e1fbb..990d9425fb7fc 100644 --- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -210,7 +210,10 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } - test("SPARK-23340 Empty float/double array columns raise EOFException") { + // SPARK-28885 String value is not allowed to be stored as numeric type with + // ANSI store assignment policy. + // TODO: re-enable the test case when SPARK-29462 is fixed. + ignore("SPARK-23340 Empty float/double array columns raise EOFException") { withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "false") { withTable("spark_23340") { sql("CREATE TABLE spark_23340(a array, b array) STORED AS ORC") @@ -271,8 +274,8 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { val orcPartitionedTable = TableIdentifier("dummy_orc_partitioned", Some("default")) if (conversion == "true") { - // if converted, it's cached as a datasource table. - checkCached(orcPartitionedTable) + // if converted, we refresh the cached relation. + assert(getCachedDataSourceTable(orcPartitionedTable) === null) } else { // otherwise, not cached. 
assert(getCachedDataSourceTable(orcPartitionedTable) === null) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index 0ea941c8e0d8e..f3e712d6c0a4a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -170,4 +170,154 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { test("SPARK-11412 read and merge orc schemas in parallel") { testMergeSchemasInParallel(OrcFileOperator.readOrcSchemasInParallel) } + + test("SPARK-25993 CREATE EXTERNAL TABLE with subdirectories") { + Seq(true, false).foreach { convertMetastore => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> s"$convertMetastore") { + withTempDir { dir => + withTable("orc_tbl1", "orc_tbl2", "orc_tbl3") { + val orcTblStatement1 = + s""" + |CREATE EXTERNAL TABLE orc_tbl1( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}/l1/"}'""".stripMargin + sql(orcTblStatement1) + + val orcTblInsertL1 = + s"INSERT INTO TABLE orc_tbl1 VALUES (1, 1, 'orc1'), (2, 2, 'orc2')".stripMargin + sql(orcTblInsertL1) + + val orcTblStatement2 = + s""" + |CREATE EXTERNAL TABLE orc_tbl2( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}/l1/l2/"}'""".stripMargin + sql(orcTblStatement2) + + val orcTblInsertL2 = + s"INSERT INTO TABLE orc_tbl2 VALUES (3, 3, 'orc3'), (4, 4, 'orc4')".stripMargin + sql(orcTblInsertL2) + + val orcTblStatement3 = + s""" + |CREATE EXTERNAL TABLE orc_tbl3( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}/l1/l2/l3/"}'""".stripMargin + sql(orcTblStatement3) + + val orcTblInsertL3 = + s"INSERT INTO TABLE orc_tbl3 VALUES (5, 5, 'orc5'), (6, 6, 'orc6')".stripMargin + sql(orcTblInsertL3) + + withTable("tbl1", "tbl2", 
"tbl3", "tbl4", "tbl5", "tbl6") { + val topDirStatement = + s""" + |CREATE EXTERNAL TABLE tbl1( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}"}'""".stripMargin + sql(topDirStatement) + val topDirSqlStatement = s"SELECT * FROM tbl1" + if (convertMetastore) { + checkAnswer(sql(topDirSqlStatement), Nil) + } else { + checkAnswer(sql(topDirSqlStatement), (1 to 6).map(i => Row(i, i, s"orc$i"))) + } + + val l1DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl2( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}/l1/"}'""".stripMargin + sql(l1DirStatement) + val l1DirSqlStatement = s"SELECT * FROM tbl2" + if (convertMetastore) { + checkAnswer(sql(l1DirSqlStatement), (1 to 2).map(i => Row(i, i, s"orc$i"))) + } else { + checkAnswer(sql(l1DirSqlStatement), (1 to 6).map(i => Row(i, i, s"orc$i"))) + } + + val l2DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl3( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${s"${dir.getCanonicalPath}/l1/l2/"}'""".stripMargin + sql(l2DirStatement) + val l2DirSqlStatement = s"SELECT * FROM tbl3" + if (convertMetastore) { + checkAnswer(sql(l2DirSqlStatement), (3 to 4).map(i => Row(i, i, s"orc$i"))) + } else { + checkAnswer(sql(l2DirSqlStatement), (3 to 6).map(i => Row(i, i, s"orc$i"))) + } + + val wildcardTopDirStatement = + s""" + |CREATE EXTERNAL TABLE tbl4( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${new File(s"${dir}/*").toURI}'""".stripMargin + sql(wildcardTopDirStatement) + val wildcardTopDirSqlStatement = s"SELECT * FROM tbl4" + if (convertMetastore) { + checkAnswer(sql(wildcardTopDirSqlStatement), (1 to 2).map(i => Row(i, i, s"orc$i"))) + } else { + checkAnswer(sql(wildcardTopDirSqlStatement), Nil) + } + + val wildcardL1DirStatement = + s""" + |CREATE EXTERNAL TABLE tbl5( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${new File(s"${dir}/l1/*").toURI}'""".stripMargin + 
sql(wildcardL1DirStatement) + val wildcardL1DirSqlStatement = s"SELECT * FROM tbl5" + if (convertMetastore) { + checkAnswer(sql(wildcardL1DirSqlStatement), (1 to 4).map(i => Row(i, i, s"orc$i"))) + } else { + checkAnswer(sql(wildcardL1DirSqlStatement), Nil) + } + + val wildcardL2Statement = + s""" + |CREATE EXTERNAL TABLE tbl6( + | c1 int, + | c2 int, + | c3 string) + |STORED AS orc + |LOCATION '${new File(s"${dir}/l1/l2/*").toURI}'""".stripMargin + sql(wildcardL2Statement) + val wildcardL2SqlStatement = s"SELECT * FROM tbl6" + if (convertMetastore) { + checkAnswer(sql(wildcardL2SqlStatement), (3 to 6).map(i => Row(i, i, s"orc$i"))) + } else { + checkAnswer(sql(wildcardL2SqlStatement), Nil) + } + } + } + } + } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index c03ae144a1595..a26412c5163ec 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -22,9 +22,9 @@ import java.io.File import scala.util.Random import org.apache.spark.SparkConf -import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} +import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.catalyst.plans.SQLHelper +import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -43,18 +43,23 @@ import org.apache.spark.sql.types._ * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. 
*/ // scalastyle:off line.size.limit -object OrcReadBenchmark extends BenchmarkBase with SQLHelper { - val conf = new SparkConf() - conf.set("orc.compression", "snappy") +object OrcReadBenchmark extends SqlBasedBenchmark { - private val spark = SparkSession.builder() - .master("local[1]") - .appName("OrcReadBenchmark") - .config(conf) - .getOrCreate() + override def getSparkSession: SparkSession = { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") - // Set default configs. Individual cases will change them if necessary. - spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + val sparkSession = SparkSession.builder() + .master("local[1]") + .appName("OrcReadBenchmark") + .config(conf) + .getOrCreate() + + // Set default configs. Individual cases will change them if necessary. + sparkSession.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + + sparkSession + } def withTempTable(tableNames: String*)(f: => Unit): Unit = { try f finally tableNames.foreach(spark.catalog.dropTempView) @@ -88,16 +93,16 @@ object OrcReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(id) FROM nativeOrcTable").noop() } } benchmark.addCase("Native ORC Vectorized") { _ => - spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(id) FROM nativeOrcTable").noop() } benchmark.addCase("Hive built-in ORC") { _ => - spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(id) FROM hiveOrcTable").noop() } benchmark.run() @@ -119,16 +124,16 @@ object OrcReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(c1), 
sum(length(c2)) FROM nativeOrcTable").noop() } } benchmark.addCase("Native ORC Vectorized") { _ => - spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").noop() } benchmark.addCase("Hive built-in ORC") { _ => - spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").noop() } benchmark.run() @@ -148,44 +153,44 @@ object OrcReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("Data column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(id) FROM nativeOrcTable").noop() } } benchmark.addCase("Data column - Native ORC Vectorized") { _ => - spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(id) FROM nativeOrcTable").noop() } benchmark.addCase("Data column - Hive built-in ORC") { _ => - spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(id) FROM hiveOrcTable").noop() } benchmark.addCase("Partition column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(p) FROM nativeOrcTable").noop() } } benchmark.addCase("Partition column - Native ORC Vectorized") { _ => - spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(p) FROM nativeOrcTable").noop() } benchmark.addCase("Partition column - Hive built-in ORC") { _ => - spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(p) FROM hiveOrcTable").noop() } benchmark.addCase("Both columns - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(p), sum(id) FROM 
nativeOrcTable").noop() } } benchmark.addCase("Both columns - Native ORC Vectorized") { _ => - spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").noop() } benchmark.addCase("Both columns - Hive built-in ORC") { _ => - spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").noop() } benchmark.run() @@ -204,16 +209,16 @@ object OrcReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").noop() } } benchmark.addCase("Native ORC Vectorized") { _ => - spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").noop() } benchmark.addCase("Hive built-in ORC") { _ => - spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() + spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").noop() } benchmark.run() @@ -239,18 +244,18 @@ object OrcReadBenchmark extends BenchmarkBase with SQLHelper { benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + - "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop() } } benchmark.addCase("Native ORC Vectorized") { _ => spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + - "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop() } benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + - "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").noop() } benchmark.run() @@ -273,16 +278,16 @@ object OrcReadBenchmark 
extends BenchmarkBase with SQLHelper { benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { - spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").noop() } } benchmark.addCase("Native ORC Vectorized") { _ => - spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").noop() } benchmark.addCase("Hive built-in ORC") { _ => - spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() + spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").noop() } benchmark.run() diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/security/HiveHadoopDelegationTokenManagerSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/security/HiveHadoopDelegationTokenManagerSuite.scala index ce40cf51746b2..97eab4f3f4f77 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/security/HiveHadoopDelegationTokenManagerSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/security/HiveHadoopDelegationTokenManagerSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.security import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration +import org.scalatest.Assertions._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.security.HadoopDelegationTokenManager diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala index d68a47053f18c..222244a04f5f5 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -328,10 +328,10 @@ private[hive] class TestHiveSparkSession( @transient val hiveQTestUtilTables: Seq[TestTable] = Seq( TestTable("src", - "CREATE TABLE src (key INT, value STRING)".cmd, + "CREATE TABLE src (key 
INT, value STRING) STORED AS TEXTFILE".cmd, s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv1.txt")}' INTO TABLE src".cmd), TestTable("src1", - "CREATE TABLE src1 (key INT, value STRING)".cmd, + "CREATE TABLE src1 (key INT, value STRING) STORED AS TEXTFILE".cmd, s"LOAD DATA LOCAL INPATH '${quoteHiveFile("data/files/kv3.txt")}' INTO TABLE src1".cmd), TestTable("srcpart", () => { "CREATE TABLE srcpart (key INT, value STRING) PARTITIONED BY (ds STRING, hr STRING)" @@ -489,7 +489,7 @@ private[hive] class TestHiveSparkSession( def getLoadedTables: collection.mutable.HashSet[String] = sharedState.loadedTables - def loadTestTable(name: String) { + def loadTestTable(name: String): Unit = { if (!sharedState.loadedTables.contains(name)) { // Marks the table as loaded first to prevent infinite mutually recursive table loading. sharedState.loadedTables += name @@ -501,7 +501,7 @@ private[hive] class TestHiveSparkSession( // has already set the execution id. if (sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) == null) { // We don't actually have a `QueryExecution` here, use a fake one instead. - SQLExecution.withNewExecutionId(this, new QueryExecution(this, OneRowRelation())) { + SQLExecution.withNewExecutionId(new QueryExecution(this, OneRowRelation())) { createCmds.foreach(_()) } } else { @@ -523,7 +523,7 @@ private[hive] class TestHiveSparkSession( /** * Resets the test instance by deleting any table, view, temp view, and UDF that have been created */ - def reset() { + def reset(): Unit = { try { // HACK: Hive is too noisy by default. 
org.apache.log4j.LogManager.getCurrentLoggers.asScala.foreach { log => @@ -647,3 +647,25 @@ private[sql] class TestHiveSessionStateBuilder( override protected def newBuilder: NewBuilder = new TestHiveSessionStateBuilder(_, _) } + +private[hive] object HiveTestJars { + private val repository = SQLConf.ADDITIONAL_REMOTE_REPOSITORIES.defaultValueString.split(",")(0) + private val hiveTestJarsDir = Utils.createTempDir() + + def getHiveContribJar(version: String = HiveUtils.builtinHiveVersion): File = + getJarFromUrl(s"${repository}org/apache/hive/hive-contrib/" + + s"$version/hive-contrib-$version.jar") + + def getHiveHcatalogCoreJar(version: String = HiveUtils.builtinHiveVersion): File = + getJarFromUrl(s"${repository}org/apache/hive/hcatalog/hive-hcatalog-core/" + + s"$version/hive-hcatalog-core-$version.jar") + + private def getJarFromUrl(urlString: String): File = { + val fileName = urlString.split("/").last + val targetFile = new File(hiveTestJarsDir, fileName) + if (!targetFile.exists()) { + Utils.doFetchFile(urlString, hiveTestJarsDir, fileName, new SparkConf, null, null) + } + targetFile + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala index 5db83c698ff15..4ada5077aec7f 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/HadoopFsRelationTest.scala @@ -73,22 +73,22 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes // Simple filtering and partition pruning checkAnswer( - df.filter('a > 1 && 'p1 === 2), + df.filter($"a" > 1 && $"p1" === 2), for (i <- 2 to 3; p2 <- Seq("foo", "bar")) yield Row(i, s"val_$i", 2, p2)) // Simple projection and filtering checkAnswer( - df.filter('a > 1).select('b, 'a + 1), + df.filter($"a" > 1).select($"b", $"a" + 1), for (i <- 2 to 3; _ <- 1 to 2; _ <- Seq("foo", "bar")) 
yield Row(s"val_$i", i + 1)) // Simple projection and partition pruning checkAnswer( - df.filter('a > 1 && 'p1 < 2).select('b, 'p1), + df.filter($"a" > 1 && $"p1" < 2).select($"b", $"p1"), for (i <- 2 to 3; _ <- Seq("foo", "bar")) yield Row(s"val_$i", 1)) // Project many copies of columns with different types (reproduction for SPARK-7858) checkAnswer( - df.filter('a > 1 && 'p1 < 2).select('b, 'b, 'b, 'b, 'p1, 'p1, 'p1, 'p1), + df.filter($"a" > 1 && $"p1" < 2).select($"b", $"b", $"b", $"b", $"p1", $"p1", $"p1", $"p1"), for (i <- 2 to 3; _ <- Seq("foo", "bar")) yield Row(s"val_$i", s"val_$i", s"val_$i", s"val_$i", 1, 1, 1, 1)) @@ -384,12 +384,12 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes test("saveAsTable()/load() - partitioned table - boolean type") { spark.range(2) - .select('id, ('id % 2 === 0).as("b")) + .select($"id", ($"id" % 2 === 0).as("b")) .write.partitionBy("b").saveAsTable("t") withTable("t") { checkAnswer( - spark.table("t").sort('id), + spark.table("t").sort($"id"), Row(0, true) :: Row(1, false) :: Nil ) } @@ -731,12 +731,12 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes } yield (i, s"val_$i", 1.0d, p2, 123, 123.123f)).toDF("a", "b", "p1", "p2", "p3", "f") val input = df.select( - 'a, - 'b, - 'p1.cast(StringType).as('ps1), - 'p2, - 'p3.cast(FloatType).as('pf1), - 'f) + $"a", + $"b", + $"p1".cast(StringType).as("ps1"), + $"p2", + $"p3".cast(FloatType).as("pf1"), + $"f") withTempView("t") { input @@ -770,7 +770,7 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes .saveAsTable("t") withTable("t") { - checkAnswer(spark.table("t").select('b, 'c, 'a), df.select('b, 'c, 'a).collect()) + checkAnswer(spark.table("t").select("b", "c", "a"), df.select("b", "c", "a").collect()) } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala index 6ebc1d145848c..2e6b86206a631 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/ParquetHadoopFsRelationSuite.scala @@ -152,8 +152,8 @@ class ParquetHadoopFsRelationSuite extends HadoopFsRelationTest { withTempPath { dir => val path = dir.getCanonicalPath - spark.range(2).select('id as 'a, 'id as 'b).write.partitionBy("b").parquet(path) - val df = spark.read.parquet(path).filter('a === 0).select('b) + spark.range(2).select($"id" as "a", $"id" as "b").write.partitionBy("b").parquet(path) + val df = spark.read.parquet(path).filter($"a" === 0).select("b") val physicalPlan = df.queryExecution.sparkPlan assert(physicalPlan.collect { case p: execution.ProjectExec => p }.length === 1) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala index 60a4638f610b3..d1b97b2852fbc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/SimpleTextRelation.scala @@ -23,7 +23,7 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.sql.{sources, SparkSession} import org.apache.spark.sql.catalyst.{expressions, InternalRow} -import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericInternalRow, InterpretedPredicate, InterpretedProjection, JoinedRow, Literal} +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, GenericInternalRow, InterpretedProjection, JoinedRow, Literal, Predicate} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.types.{DataType, StructType} @@ -88,7 +88,7 @@ class SimpleTextSource 
extends TextBasedFileFormat with DataSourceRegister { val attribute = inputAttributes.find(_.name == column).get expressions.GreaterThan(attribute, literal) }.reduceOption(expressions.And).getOrElse(Literal(true)) - InterpretedPredicate.create(filterCondition, inputAttributes) + Predicate.create(filterCondition, inputAttributes) } // Uses a simple projection to simulate column pruning diff --git a/streaming/pom.xml b/streaming/pom.xml index 1d1ea469f7d18..87af6388e1118 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -131,4 +131,16 @@ + + + + scala-2.13 + + + org.scala-lang.modules + scala-parallel-collections_${scala.binary.version} + + + + diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 54f91ff1c69d5..5d81d36dfe357 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -55,6 +55,8 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) "spark.driver.bindAddress", "spark.driver.port", "spark.master", + "spark.ui.port", + "spark.blockManager.port", "spark.kubernetes.driver.pod.name", "spark.kubernetes.executor.podNamePrefix", "spark.yarn.jars", @@ -69,6 +71,8 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) .remove("spark.driver.host") .remove("spark.driver.bindAddress") .remove("spark.driver.port") + .remove("spark.ui.port") + .remove("spark.blockManager.port") .remove("spark.kubernetes.driver.pod.name") .remove("spark.kubernetes.executor.podNamePrefix") val newReloadConf = new SparkConf(loadDefaults = true) @@ -90,7 +94,7 @@ class Checkpoint(ssc: StreamingContext, val checkpointTime: Time) newSparkConf } - def validate() { + def validate(): Unit = { assert(master != null, "Checkpoint.master is null") assert(framework != null, "Checkpoint.framework is null") assert(graph != null, "Checkpoint.graph is null") @@ 
-131,8 +135,8 @@ object Checkpoint extends Logging { try { val statuses = fs.listStatus(path) if (statuses != null) { - val paths = statuses.map(_.getPath) - val filtered = paths.filter(p => REGEX.findFirstIn(p.toString).nonEmpty) + val paths = statuses.filterNot(_.isDirectory).map(_.getPath) + val filtered = paths.filter(p => REGEX.findFirstIn(p.getName).nonEmpty) filtered.sortWith(sortFunc) } else { logWarning(s"Listing $path returned null") @@ -213,7 +217,7 @@ class CheckpointWriter( checkpointTime: Time, bytes: Array[Byte], clearCheckpointDataLater: Boolean) extends Runnable { - def run() { + def run(): Unit = { if (latestCheckpointTime == null || latestCheckpointTime < checkpointTime) { latestCheckpointTime = checkpointTime } @@ -288,7 +292,7 @@ class CheckpointWriter( } } - def write(checkpoint: Checkpoint, clearCheckpointDataLater: Boolean) { + def write(checkpoint: Checkpoint, clearCheckpointDataLater: Boolean): Unit = { try { val bytes = Checkpoint.serialize(checkpoint, conf) executor.execute(new CheckpointWriteHandler( diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index dce2028b48878..683db21d3f0e1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -20,6 +20,7 @@ package org.apache.spark.streaming import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer +import scala.collection.parallel.immutable.ParVector import org.apache.spark.internal.Logging import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream} @@ -41,7 +42,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { var batchDuration: Duration = null @volatile private var numReceivers: Int = 0 - def start(time: Time) { + def start(time: Time): Unit = { 
this.synchronized { require(zeroTime == null, "DStream graph computation already started") zeroTime = time @@ -50,28 +51,28 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { outputStreams.foreach(_.remember(rememberDuration)) outputStreams.foreach(_.validateAtStart()) numReceivers = inputStreams.count(_.isInstanceOf[ReceiverInputDStream[_]]) - inputStreamNameAndID = inputStreams.map(is => (is.name, is.id)) - inputStreams.par.foreach(_.start()) + inputStreamNameAndID = inputStreams.map(is => (is.name, is.id)).toSeq + new ParVector(inputStreams.toVector).foreach(_.start()) } } - def restart(time: Time) { + def restart(time: Time): Unit = { this.synchronized { startTime = time } } - def stop() { + def stop(): Unit = { this.synchronized { - inputStreams.par.foreach(_.stop()) + new ParVector(inputStreams.toVector).foreach(_.stop()) } } - def setContext(ssc: StreamingContext) { + def setContext(ssc: StreamingContext): Unit = { this.synchronized { outputStreams.foreach(_.setContext(ssc)) } } - def setBatchDuration(duration: Duration) { + def setBatchDuration(duration: Duration): Unit = { this.synchronized { require(batchDuration == null, s"Batch duration already set as $batchDuration. Cannot set it again.") @@ -79,7 +80,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { } } - def remember(duration: Duration) { + def remember(duration: Duration): Unit = { this.synchronized { require(rememberDuration == null, s"Remember duration already set as $rememberDuration. 
Cannot set it again.") @@ -87,14 +88,14 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { } } - def addInputStream(inputStream: InputDStream[_]) { + def addInputStream(inputStream: InputDStream[_]): Unit = { this.synchronized { inputStream.setGraph(this) inputStreams += inputStream } } - def addOutputStream(outputStream: DStream[_]) { + def addOutputStream(outputStream: DStream[_]): Unit = { this.synchronized { outputStream.setGraph(this) outputStreams += outputStream @@ -128,7 +129,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { jobs } - def clearMetadata(time: Time) { + def clearMetadata(time: Time): Unit = { logDebug("Clearing metadata for time " + time) this.synchronized { outputStreams.foreach(_.clearMetadata(time)) @@ -136,7 +137,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { logDebug("Cleared old metadata for time " + time) } - def updateCheckpointData(time: Time) { + def updateCheckpointData(time: Time): Unit = { logInfo("Updating checkpoint data for time " + time) this.synchronized { outputStreams.foreach(_.updateCheckpointData(time)) @@ -144,7 +145,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { logInfo("Updated checkpoint data for time " + time) } - def clearCheckpointData(time: Time) { + def clearCheckpointData(time: Time): Unit = { logInfo("Clearing checkpoint data for time " + time) this.synchronized { outputStreams.foreach(_.clearCheckpointData(time)) @@ -152,7 +153,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { logInfo("Cleared checkpoint data for time " + time) } - def restoreCheckpointData() { + def restoreCheckpointData(): Unit = { logInfo("Restoring checkpoint data") this.synchronized { outputStreams.foreach(_.restoreCheckpointData()) @@ -160,7 +161,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { logInfo("Restored checkpoint 
data") } - def validate() { + def validate(): Unit = { this.synchronized { require(batchDuration != null, "Batch duration has not been set") // assert(batchDuration >= Milliseconds(100), "Batch duration of " + batchDuration + diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 589dd877c8c97..440b653e45de1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -26,7 +26,6 @@ import scala.collection.mutable.Queue import scala.reflect.ClassTag import scala.util.control.NonFatal -import org.apache.commons.lang3.SerializationUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, LongWritable, Text} @@ -222,7 +221,7 @@ class StreamingContext private[streaming] ( * if the developer wishes to query old data outside the DStream computation). * @param duration Minimum duration that each DStream should remember its RDDs */ - def remember(duration: Duration) { + def remember(duration: Duration): Unit = { graph.remember(duration) } @@ -232,7 +231,7 @@ class StreamingContext private[streaming] ( * @param directory HDFS-compatible directory where the checkpoint data will be reliably stored. * Note that this must be a fault-tolerant file system like HDFS. */ - def checkpoint(directory: String) { + def checkpoint(directory: String): Unit = { if (directory != null) { val path = new Path(directory) val fs = path.getFileSystem(sparkContext.hadoopConfiguration) @@ -505,7 +504,7 @@ class StreamingContext private[streaming] ( * Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for * receiving system events related to streaming. 
*/ - def addStreamingListener(streamingListener: StreamingListener) { + def addStreamingListener(streamingListener: StreamingListener): Unit = { scheduler.listenerBus.addListener(streamingListener) } @@ -513,7 +512,7 @@ class StreamingContext private[streaming] ( scheduler.listenerBus.removeListener(streamingListener) } - private def validate() { + private def validate(): Unit = { assert(graph != null, "Graph is null") graph.validate() @@ -586,7 +585,7 @@ class StreamingContext private[streaming] ( sparkContext.setCallSite(startSite.get) sparkContext.clearJobGroup() sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false") - savedProperties.set(SerializationUtils.clone(sparkContext.localProperties.get())) + savedProperties.set(Utils.cloneProperties(sparkContext.localProperties.get())) scheduler.start() } state = StreamingContextState.ACTIVE @@ -621,7 +620,7 @@ class StreamingContext private[streaming] ( * Wait for the execution to stop. Any exceptions that occurs during the execution * will be thrown in this thread. */ - def awaitTermination() { + def awaitTermination(): Unit = { waiter.waitForStopOrError() } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index 4a0ec31b5f3c8..51141212f9ecb 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -268,7 +268,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * Apply a function to each RDD in this DStream. This is an output operator, so * 'this' DStream will be registered as an output stream and therefore materialized. 
*/ - def foreachRDD(foreachFunc: JVoidFunction[R]) { + def foreachRDD(foreachFunc: JVoidFunction[R]): Unit = { dstream.foreachRDD(rdd => foreachFunc.call(wrapRDD(rdd))) } @@ -276,7 +276,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T * Apply a function to each RDD in this DStream. This is an output operator, so * 'this' DStream will be registered as an output stream and therefore materialized. */ - def foreachRDD(foreachFunc: JVoidFunction2[R, Time]) { + def foreachRDD(foreachFunc: JVoidFunction2[R, Time]): Unit = { dstream.foreachRDD((rdd, time) => foreachFunc.call(wrapRDD(rdd), time)) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index 3f88fe0817c57..650d8c7f4d1a7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -759,7 +759,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". 
*/ - def saveAsHadoopFiles(prefix: String, suffix: String) { + def saveAsHadoopFiles(prefix: String, suffix: String): Unit = { dstream.saveAsHadoopFiles(prefix, suffix) } @@ -772,7 +772,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[F]) { + outputFormatClass: Class[F]): Unit = { dstream.saveAsHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass) } @@ -786,7 +786,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[F], - conf: JobConf) { + conf: JobConf): Unit = { dstream.saveAsHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass, conf) } @@ -794,7 +794,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( * Save each RDD in `this` DStream as a Hadoop file. The file name at each batch interval is * generated based on `prefix` and `suffix`: "prefix-TIME_IN_MS.suffix". */ - def saveAsNewAPIHadoopFiles(prefix: String, suffix: String) { + def saveAsNewAPIHadoopFiles(prefix: String, suffix: String): Unit = { dstream.saveAsNewAPIHadoopFiles(prefix, suffix) } @@ -807,7 +807,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( suffix: String, keyClass: Class[_], valueClass: Class[_], - outputFormatClass: Class[F]) { + outputFormatClass: Class[F]): Unit = { dstream.saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass) } @@ -821,7 +821,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( keyClass: Class[_], valueClass: Class[_], outputFormatClass: Class[F], - conf: Configuration = dstream.context.sparkContext.hadoopConfiguration) { + conf: Configuration = dstream.context.sparkContext.hadoopConfiguration): Unit = { dstream.saveAsNewAPIHadoopFiles(prefix, suffix, keyClass, valueClass, outputFormatClass, conf) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index d4f03bedc7ed6..2d53a1b4c78b6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -505,7 +505,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * fault-tolerance. The graph will be checkpointed every batch interval. * @param directory HDFS-compatible directory where the checkpoint data will be reliably stored */ - def checkpoint(directory: String) { + def checkpoint(directory: String): Unit = { ssc.checkpoint(directory) } @@ -516,7 +516,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * if the developer wishes to query old data outside the DStream computation). * @param duration Minimum duration that each DStream should remember its RDDs */ - def remember(duration: Duration) { + def remember(duration: Duration): Unit = { ssc.remember(duration) } @@ -524,7 +524,7 @@ class JavaStreamingContext(val ssc: StreamingContext) extends Closeable { * Add a [[org.apache.spark.streaming.scheduler.StreamingListener]] object for * receiving system events related to streaming. 
*/ - def addStreamingListener(streamingListener: StreamingListener) { + def addStreamingListener(streamingListener: StreamingListener): Unit = { ssc.addStreamingListener(streamingListener) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala index 28cb86c9f31fd..ce1afad7a91d8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingListener.scala @@ -22,33 +22,33 @@ import org.apache.spark.streaming.Time private[streaming] trait PythonStreamingListener{ /** Called when the streaming has been started */ - def onStreamingStarted(streamingStarted: JavaStreamingListenerStreamingStarted) { } + def onStreamingStarted(streamingStarted: JavaStreamingListenerStreamingStarted): Unit = { } /** Called when a receiver has been started */ - def onReceiverStarted(receiverStarted: JavaStreamingListenerReceiverStarted) { } + def onReceiverStarted(receiverStarted: JavaStreamingListenerReceiverStarted): Unit = { } /** Called when a receiver has reported an error */ - def onReceiverError(receiverError: JavaStreamingListenerReceiverError) { } + def onReceiverError(receiverError: JavaStreamingListenerReceiverError): Unit = { } /** Called when a receiver has been stopped */ - def onReceiverStopped(receiverStopped: JavaStreamingListenerReceiverStopped) { } + def onReceiverStopped(receiverStopped: JavaStreamingListenerReceiverStopped): Unit = { } /** Called when a batch of jobs has been submitted for processing. */ - def onBatchSubmitted(batchSubmitted: JavaStreamingListenerBatchSubmitted) { } + def onBatchSubmitted(batchSubmitted: JavaStreamingListenerBatchSubmitted): Unit = { } /** Called when processing of a batch of jobs has started. 
*/ - def onBatchStarted(batchStarted: JavaStreamingListenerBatchStarted) { } + def onBatchStarted(batchStarted: JavaStreamingListenerBatchStarted): Unit = { } /** Called when processing of a batch of jobs has completed. */ - def onBatchCompleted(batchCompleted: JavaStreamingListenerBatchCompleted) { } + def onBatchCompleted(batchCompleted: JavaStreamingListenerBatchCompleted): Unit = { } /** Called when processing of a job of a batch has started. */ def onOutputOperationStarted( - outputOperationStarted: JavaStreamingListenerOutputOperationStarted) { } + outputOperationStarted: JavaStreamingListenerOutputOperationStarted): Unit = { } /** Called when processing of a job of a batch has completed. */ def onOutputOperationCompleted( - outputOperationCompleted: JavaStreamingListenerOutputOperationCompleted) { } + outputOperationCompleted: JavaStreamingListenerOutputOperationCompleted): Unit = { } } private[streaming] class PythonStreamingListenerWrapper(listener: PythonStreamingListener) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala index 46bfc60856453..570663c6f6ad3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/python/PythonDStream.scala @@ -174,7 +174,7 @@ private[streaming] object PythonDStream { * helper function for DStream.foreachRDD(), * cannot be `foreachRDD`, it will confusing py4j */ - def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonTransformFunction) { + def callForeachRDD(jdstream: JavaDStream[Array[Byte]], pfunc: PythonTransformFunction): Unit = { val func = new TransformFunction((pfunc)) jdstream.dstream.foreachRDD((rdd, time) => func(Some(rdd), time)) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala index 995470ec8deae..ed2ddf9e25572 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala @@ -31,9 +31,9 @@ class ConstantInputDStream[T: ClassTag](_ssc: StreamingContext, rdd: RDD[T]) require(rdd != null, "parameter rdd null is illegal, which will lead to NPE in the following transformation") - override def start() {} + override def start(): Unit = {} - override def stop() {} + override def stop(): Unit = {} override def compute(validTime: Time): Option[RDD[T]] = { Some(rdd) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 41374b5e370f8..6c981b293ac76 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -33,7 +33,7 @@ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.StreamingContext.rddToFileName import org.apache.spark.streaming.scheduler.Job -import org.apache.spark.streaming.ui.UIUtils +import org.apache.spark.ui.{UIUtils => SparkUIUtils} import org.apache.spark.util.{CallSite, Utils} /** @@ -138,7 +138,7 @@ abstract class DStream[T: ClassTag] ( */ private def makeScope(time: Time): Option[RDDOperationScope] = { baseScope.map { bsJson => - val formattedBatchTime = UIUtils.formatBatchTime( + val formattedBatchTime = SparkUIUtils.formatBatchTime( time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false) val bs = RDDOperationScope.fromJson(bsJson) val baseName = bs.name // e.g. countByWindow, "kafka stream [0]" @@ -189,7 +189,7 @@ abstract class DStream[T: ClassTag] ( * the validity of future times is calculated. 
This method also recursively initializes * its parent DStreams. */ - private[streaming] def initialize(time: Time) { + private[streaming] def initialize(time: Time): Unit = { if (zeroTime != null && zeroTime != time) { throw new SparkException(s"ZeroTime is already initialized to $zeroTime" + s", cannot initialize it again to $time") @@ -231,7 +231,7 @@ abstract class DStream[T: ClassTag] ( } } - private[streaming] def validateAtStart() { + private[streaming] def validateAtStart(): Unit = { require(rememberDuration != null, "Remember duration is set to null") require( @@ -282,7 +282,7 @@ abstract class DStream[T: ClassTag] ( logInfo(s"Initialized and validated $this") } - private[streaming] def setContext(s: StreamingContext) { + private[streaming] def setContext(s: StreamingContext): Unit = { if (ssc != null && ssc != s) { throw new SparkException(s"Context must not be set again for $this") } @@ -291,7 +291,7 @@ abstract class DStream[T: ClassTag] ( dependencies.foreach(_.setContext(ssc)) } - private[streaming] def setGraph(g: DStreamGraph) { + private[streaming] def setGraph(g: DStreamGraph): Unit = { if (graph != null && graph != g) { throw new SparkException(s"Graph must not be set again for $this") } @@ -299,7 +299,7 @@ abstract class DStream[T: ClassTag] ( dependencies.foreach(_.setGraph(graph)) } - private[streaming] def remember(duration: Duration) { + private[streaming] def remember(duration: Duration): Unit = { if (duration != null && (rememberDuration == null || duration > rememberDuration)) { rememberDuration = duration logInfo(s"Duration for remembering RDDs set to $rememberDuration for $this") @@ -446,7 +446,7 @@ abstract class DStream[T: ClassTag] ( * implementation clears the old generated RDDs. Subclasses of DStream may override * this to clear their own metadata along with the generated RDDs. 
*/ - private[streaming] def clearMetadata(time: Time) { + private[streaming] def clearMetadata(time: Time): Unit = { val unpersistData = ssc.conf.getBoolean("spark.streaming.unpersist", true) val oldRDDs = generatedRDDs.filter(_._1 <= (time - rememberDuration)) logDebug("Clearing references to old RDDs: [" + @@ -477,14 +477,14 @@ abstract class DStream[T: ClassTag] ( * checkpointData. Subclasses of DStream (especially those of InputDStream) may override * this method to save custom checkpoint data. */ - private[streaming] def updateCheckpointData(currentTime: Time) { + private[streaming] def updateCheckpointData(currentTime: Time): Unit = { logDebug(s"Updating checkpoint data for time $currentTime") checkpointData.update(currentTime) dependencies.foreach(_.updateCheckpointData(currentTime)) logDebug(s"Updated checkpoint data for time $currentTime: $checkpointData") } - private[streaming] def clearCheckpointData(time: Time) { + private[streaming] def clearCheckpointData(time: Time): Unit = { logDebug("Clearing checkpoint data") checkpointData.cleanup(time) dependencies.foreach(_.clearCheckpointData(time)) @@ -497,7 +497,7 @@ abstract class DStream[T: ClassTag] ( * from the checkpoint file names stored in checkpointData. Subclasses of DStream that * override the updateCheckpointData() method would also need to override this method. 
*/ - private[streaming] def restoreCheckpointData() { + private[streaming] def restoreCheckpointData(): Unit = { if (!restoredFromCheckpointData) { // Create RDDs from the checkpoint data logInfo("Restoring checkpoint data") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala index b35f7d97233e2..667edf3713d43 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala @@ -46,7 +46,7 @@ class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) * the graph checkpoint is initiated. Default implementation records the * checkpoint files at which the generated RDDs of the DStream have been saved. */ - def update(time: Time) { + def update(time: Time): Unit = { // Get the checkpointed RDDs from the generated RDDs val checkpointFiles = dstream.generatedRDDs.filter(_._2.getCheckpointFile.isDefined) @@ -69,7 +69,7 @@ class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) * Cleanup old checkpoint data. This gets called after a checkpoint of `time` has been * written to the checkpoint directory. */ - def cleanup(time: Time) { + def cleanup(time: Time): Unit = { // Get the time of the oldest checkpointed RDD that was written as part of the // checkpoint of `time` timeToOldestCheckpointFileTime.remove(time) match { @@ -109,7 +109,7 @@ class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) * (along with its output DStreams) is being restored from a graph checkpoint file. * Default implementation restores the RDDs from their checkpoint files. 
*/ - def restore() { + def restore(): Unit = { // Create RDDs from the checkpoint data currentCheckpointFiles.foreach { case(time, file) => diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala index 438847caf0c3a..d46c9a22379d3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala @@ -128,9 +128,9 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( @transient private var _path: Path = null @transient private var _fs: FileSystem = null - override def start() { } + override def start(): Unit = { } - override def stop() { } + override def stop(): Unit = { } /** * Finds the files that were modified since the last time this method was called and makes @@ -160,7 +160,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( } /** Clear the old time-to-files mappings along with old RDDs */ - protected[streaming] override def clearMetadata(time: Time) { + protected[streaming] override def clearMetadata(time: Time): Unit = { super.clearMetadata(time) batchTimeToSelectedFiles.synchronized { val oldFiles = batchTimeToSelectedFiles.filter(_._1 < (time - rememberDuration)) @@ -306,7 +306,7 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( _fs } - private def reset() { + private def reset(): Unit = { _fs = null } @@ -328,14 +328,14 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( private def hadoopFiles = data.asInstanceOf[mutable.HashMap[Time, Array[String]]] - override def update(time: Time) { + override def update(time: Time): Unit = { hadoopFiles.clear() batchTimeToSelectedFiles.synchronized { hadoopFiles ++= batchTimeToSelectedFiles } } - override def cleanup(time: Time) { } + override def cleanup(time: Time): Unit = { } - override def restore() { + override def restore(): Unit = { 
hadoopFiles.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, f) => // Restore the metadata in both files and generatedRDDs diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala index 6495c91247047..5a75b77659960 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.Utils * Input streams that can generate RDDs from new data by running a service/thread only on * the driver node (that is, without running a receiver on worker nodes), can be * implemented by directly inheriting this InputDStream. For example, - * FileInputDStream, a subclass of InputDStream, monitors a HDFS directory from the driver for + * FileInputDStream, a subclass of InputDStream, monitors an HDFS directory from the driver for * new files and generates RDDs with the new files. For implementing input streams * that requires running a receiver on the worker nodes, use * [[org.apache.spark.streaming.dstream.ReceiverInputDStream]] as the parent class. @@ -48,7 +48,7 @@ abstract class InputDStream[T: ClassTag](_ssc: StreamingContext) ssc.graph.addInputStream(this) - /** This is an unique identifier for the input stream. */ + /** This is a unique identifier for the input stream. 
*/ val id = ssc.getNewInputStreamId() // Keep track of the freshest rate for this stream using the rateEstimator diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index f9c78699164ab..d3e6e766bea4a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -33,9 +33,9 @@ class QueueInputDStream[T: ClassTag]( defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { - override def start() { } + override def start(): Unit = { } - override def stop() { } + override def stop(): Unit = { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala index b22bbb79a5cc9..671ac7b97f9d2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala @@ -55,7 +55,7 @@ class RawNetworkReceiver(host: String, port: Int, storageLevel: StorageLevel) var blockPushingThread: Thread = null - def onStart() { + def onStart(): Unit = { // Open a socket to the target address and keep reading from it logInfo("Connecting to " + host + ":" + port) val channel = SocketChannel.open() @@ -67,7 +67,7 @@ class RawNetworkReceiver(host: String, port: Int, storageLevel: StorageLevel) blockPushingThread = new Thread { setDaemon(true) - override def run() { + override def run(): Unit = { var nextBlockNumber = 0 while (true) { val buffer = queue.take() @@ -92,12 +92,12 @@ class RawNetworkReceiver(host: String, port: Int, storageLevel: StorageLevel) } } - def onStop() { + def 
onStop(): Unit = { if (blockPushingThread != null) blockPushingThread.interrupt() } /** Read a buffer fully from a given Channel */ - private def readFully(channel: ReadableByteChannel, dest: ByteBuffer) { + private def readFully(channel: ReadableByteChannel, dest: ByteBuffer): Unit = { while (dest.position() < dest.limit()) { if (channel.read(dest) == -1) { throw new EOFException("End of channel") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala index fd3e72e41be26..983ac09cd435e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala @@ -60,9 +60,9 @@ abstract class ReceiverInputDStream[T: ClassTag](_ssc: StreamingContext) def getReceiver(): Receiver[T] // Nothing to start or stop as both taken care of by the ReceiverTracker. - def start() {} + def start(): Unit = {} - def stop() {} + def stop(): Unit = {} /** * Generates RDDs with blocks received by the receiver of this stream. 
*/ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala index 7853af562368e..9d3facc68e0c6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/SocketInputDStream.scala @@ -54,7 +54,7 @@ class SocketReceiver[T: ClassTag]( private var socket: Socket = _ - def onStart() { + def onStart(): Unit = { logInfo(s"Connecting to $host:$port") try { @@ -69,11 +69,11 @@ class SocketReceiver[T: ClassTag]( // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) - override def run() { receive() } + override def run(): Unit = { receive() } }.start() } - def onStop() { + def onStop(): Unit = { // in case restart thread close it twice synchronized { if (socket != null) { @@ -85,7 +85,7 @@ class SocketReceiver[T: ClassTag]( } /** Create a socket connection and receive data until receiver is stopped */ - def receive() { + def receive(): Unit = { try { val iterator = bytesToObjects(socket.getInputStream()) while(!isStopped && iterator.hasNext) { @@ -125,7 +125,7 @@ object SocketReceiver { nextValue } - protected override def close() { + protected override def close(): Unit = { dataInputStream.close() } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala index 90309c0145ae1..2533c53883cac 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/BlockGenerator.scala @@ -107,7 +107,8 @@ private[streaming] class BlockGenerator( new RecurringTimer(clock, blockIntervalMs, updateCurrentBuffer, "BlockGenerator") private val blockQueueSize = 
conf.getInt("spark.streaming.blockQueueSize", 10) private val blocksForPushing = new ArrayBlockingQueue[Block](blockQueueSize) - private val blockPushingThread = new Thread() { override def run() { keepPushingBlocks() } } + private val blockPushingThread = + new Thread() { override def run(): Unit = keepPushingBlocks() } @volatile private var currentBuffer = new ArrayBuffer[Any] @volatile private var state = Initialized @@ -255,7 +256,7 @@ private[streaming] class BlockGenerator( } /** Keep pushing blocks to the BlockManager. */ - private def keepPushingBlocks() { + private def keepPushingBlocks(): Unit = { logInfo("Started block pushing thread") def areBlocksBeingGenerated: Boolean = synchronized { @@ -288,12 +289,12 @@ private[streaming] class BlockGenerator( } } - private def reportError(message: String, t: Throwable) { + private def reportError(message: String, t: Throwable): Unit = { logError(message, t) listener.onError(message, t) } - private def pushBlock(block: Block) { + private def pushBlock(block: Block): Unit = { listener.onPushBlock(block.id, block.buffer) logInfo("Pushed block " + block.id) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala index fbac4880bdf65..c620074b4e44d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/RateLimiter.scala @@ -40,7 +40,7 @@ private[receiver] abstract class RateLimiter(conf: SparkConf) extends Logging { private val maxRateLimit = conf.getLong("spark.streaming.receiver.maxRate", Long.MaxValue) private lazy val rateLimiter = GuavaRateLimiter.create(getInitialRateLimit().toDouble) - def waitToPush() { + def waitToPush(): Unit = { rateLimiter.acquire() } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala index eb70232a7452e..12ed8015117e5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceivedBlockHandler.scala @@ -98,7 +98,7 @@ private[streaming] class BlockManagerBasedBlockHandler( BlockManagerBasedStoreResult(blockId, numRecords) } - def cleanupOldBlocks(threshTime: Long) { + def cleanupOldBlocks(threshTime: Long): Unit = { // this is not used as blocks inserted into the BlockManager are cleared by DStream's clearing // of BlockRDDs. } @@ -210,11 +210,11 @@ private[streaming] class WriteAheadLogBasedBlockHandler( WriteAheadLogBasedStoreResult(blockId, numRecords, walRecordHandle) } - def cleanupOldBlocks(threshTime: Long) { + def cleanupOldBlocks(threshTime: Long): Unit = { writeAheadLog.clean(threshTime, false) } - def stop() { + def stop(): Unit = { writeAheadLog.close() executionContext.shutdown() } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala index 31a88730d163e..dde074c7e324b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/Receiver.scala @@ -115,12 +115,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * These single items will be aggregated together into data blocks before * being pushed into Spark's memory. */ - def store(dataItem: T) { + def store(dataItem: T): Unit = { supervisor.pushSingle(dataItem) } /** Store an ArrayBuffer of received data as a data block into Spark's memory. 
*/ - def store(dataBuffer: ArrayBuffer[T]) { + def store(dataBuffer: ArrayBuffer[T]): Unit = { supervisor.pushArrayBuffer(dataBuffer, None, None) } @@ -129,12 +129,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The metadata will be associated with this block of data * for being used in the corresponding InputDStream. */ - def store(dataBuffer: ArrayBuffer[T], metadata: Any) { + def store(dataBuffer: ArrayBuffer[T], metadata: Any): Unit = { supervisor.pushArrayBuffer(dataBuffer, Some(metadata), None) } /** Store an iterator of received data as a data block into Spark's memory. */ - def store(dataIterator: Iterator[T]) { + def store(dataIterator: Iterator[T]): Unit = { supervisor.pushIterator(dataIterator, None, None) } @@ -143,12 +143,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The metadata will be associated with this block of data * for being used in the corresponding InputDStream. */ - def store(dataIterator: java.util.Iterator[T], metadata: Any) { + def store(dataIterator: java.util.Iterator[T], metadata: Any): Unit = { supervisor.pushIterator(dataIterator.asScala, Some(metadata), None) } /** Store an iterator of received data as a data block into Spark's memory. */ - def store(dataIterator: java.util.Iterator[T]) { + def store(dataIterator: java.util.Iterator[T]): Unit = { supervisor.pushIterator(dataIterator.asScala, None, None) } @@ -157,7 +157,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The metadata will be associated with this block of data * for being used in the corresponding InputDStream. 
*/ - def store(dataIterator: Iterator[T], metadata: Any) { + def store(dataIterator: Iterator[T], metadata: Any): Unit = { supervisor.pushIterator(dataIterator, Some(metadata), None) } @@ -166,7 +166,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * that the data in the ByteBuffer must be serialized using the same serializer * that Spark is configured to use. */ - def store(bytes: ByteBuffer) { + def store(bytes: ByteBuffer): Unit = { supervisor.pushBytes(bytes, None, None) } @@ -175,12 +175,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * The metadata will be associated with this block of data * for being used in the corresponding InputDStream. */ - def store(bytes: ByteBuffer, metadata: Any) { + def store(bytes: ByteBuffer, metadata: Any): Unit = { supervisor.pushBytes(bytes, Some(metadata), None) } /** Report exceptions in receiving data. */ - def reportError(message: String, throwable: Throwable) { + def reportError(message: String, throwable: Throwable): Unit = { supervisor.reportError(message, throwable) } @@ -192,7 +192,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * is defined by the Spark configuration `spark.streaming.receiverRestartDelay`. * The `message` will be reported to the driver. */ - def restart(message: String) { + def restart(message: String): Unit = { supervisor.restartReceiver(message) } @@ -204,7 +204,7 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * is defined by the Spark configuration `spark.streaming.receiverRestartDelay`. * The `message` and `exception` will be reported to the driver. 
*/ - def restart(message: String, error: Throwable) { + def restart(message: String, error: Throwable): Unit = { supervisor.restartReceiver(message, Some(error)) } @@ -214,17 +214,17 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable * (by calling `onStop()` and `onStart()`) is performed asynchronously * in a background thread. */ - def restart(message: String, error: Throwable, millisecond: Int) { + def restart(message: String, error: Throwable, millisecond: Int): Unit = { supervisor.restartReceiver(message, Some(error), millisecond) } /** Stop the receiver completely. */ - def stop(message: String) { + def stop(message: String): Unit = { supervisor.stop(message, None) } /** Stop the receiver completely due to an exception */ - def stop(message: String, error: Throwable) { + def stop(message: String, error: Throwable): Unit = { supervisor.stop(message, Some(error)) } @@ -260,12 +260,12 @@ abstract class Receiver[T](val storageLevel: StorageLevel) extends Serializable @transient private var _supervisor: ReceiverSupervisor = null /** Set the ID of the DStream that this receiver is associated with. */ - private[streaming] def setReceiverId(_id: Int) { + private[streaming] def setReceiverId(_id: Int): Unit = { id = _id } /** Attach Network Receiver executor to this receiver. 
*/ - private[streaming] def attachSupervisor(exec: ReceiverSupervisor) { + private[streaming] def attachSupervisor(exec: ReceiverSupervisor): Unit = { assert(_supervisor == null) _supervisor = exec } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala index faf6db82d5b18..b464dccb760f6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisor.scala @@ -110,29 +110,29 @@ private[streaming] abstract class ReceiverSupervisor( * Note that this must be called before the receiver.onStart() is called to ensure * things like [[BlockGenerator]]s are started before the receiver starts sending data. */ - protected def onStart() { } + protected def onStart(): Unit = { } /** * Called when supervisor is stopped. * Note that this must be called after the receiver.onStop() is called to ensure * things like [[BlockGenerator]]s are cleaned up after the receiver stops sending data. */ - protected def onStop(message: String, error: Option[Throwable]) { } + protected def onStop(message: String, error: Option[Throwable]): Unit = { } /** Called when receiver is started. 
Return true if the driver accepts us */ protected def onReceiverStart(): Boolean /** Called when receiver is stopped */ - protected def onReceiverStop(message: String, error: Option[Throwable]) { } + protected def onReceiverStop(message: String, error: Option[Throwable]): Unit = { } /** Start the supervisor */ - def start() { + def start(): Unit = { onStart() startReceiver() } /** Mark the supervisor and the receiver for stopping */ - def stop(message: String, error: Option[Throwable]) { + def stop(message: String, error: Option[Throwable]): Unit = { stoppingError = error.orNull stopReceiver(message, error) onStop(message, error) @@ -180,12 +180,12 @@ private[streaming] abstract class ReceiverSupervisor( } /** Restart receiver with delay */ - def restartReceiver(message: String, error: Option[Throwable] = None) { + def restartReceiver(message: String, error: Option[Throwable] = None): Unit = { restartReceiver(message, error, defaultRestartDelay) } /** Restart receiver with delay */ - def restartReceiver(message: String, error: Option[Throwable], delay: Int) { + def restartReceiver(message: String, error: Option[Throwable], delay: Int): Unit = { Future { // This is a blocking action so we should use "futureExecutionContext" which is a cached // thread pool. 
@@ -214,7 +214,7 @@ private[streaming] abstract class ReceiverSupervisor( /** Wait the thread until the supervisor is stopped */ - def awaitTermination() { + def awaitTermination(): Unit = { logInfo("Waiting for receiver to be stopped") stopLatch.await() if (stoppingError != null) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala index 5d38c56aa5873..13c80841d4d14 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receiver/ReceiverSupervisorImpl.scala @@ -102,11 +102,11 @@ private[streaming] class ReceiverSupervisorImpl( def onGenerateBlock(blockId: StreamBlockId): Unit = { } - def onError(message: String, throwable: Throwable) { + def onError(message: String, throwable: Throwable): Unit = { reportError(message, throwable) } - def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) { + def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = { pushArrayBuffer(arrayBuffer, None, Some(blockId)) } } @@ -116,7 +116,7 @@ private[streaming] class ReceiverSupervisorImpl( override private[streaming] def getCurrentRateLimit: Long = defaultBlockGenerator.getCurrentLimit /** Push a single record of received data into block generator. 
*/ - def pushSingle(data: Any) { + def pushSingle(data: Any): Unit = { defaultBlockGenerator.addData(data) } @@ -125,7 +125,7 @@ private[streaming] class ReceiverSupervisorImpl( arrayBuffer: ArrayBuffer[_], metadataOption: Option[Any], blockIdOption: Option[StreamBlockId] - ) { + ): Unit = { pushAndReportBlock(ArrayBufferBlock(arrayBuffer), metadataOption, blockIdOption) } @@ -134,7 +134,7 @@ private[streaming] class ReceiverSupervisorImpl( iterator: Iterator[_], metadataOption: Option[Any], blockIdOption: Option[StreamBlockId] - ) { + ): Unit = { pushAndReportBlock(IteratorBlock(iterator), metadataOption, blockIdOption) } @@ -143,7 +143,7 @@ private[streaming] class ReceiverSupervisorImpl( bytes: ByteBuffer, metadataOption: Option[Any], blockIdOption: Option[StreamBlockId] - ) { + ): Unit = { pushAndReportBlock(ByteBufferBlock(bytes), metadataOption, blockIdOption) } @@ -152,7 +152,7 @@ private[streaming] class ReceiverSupervisorImpl( receivedBlock: ReceivedBlock, metadataOption: Option[Any], blockIdOption: Option[StreamBlockId] - ) { + ): Unit = { val blockId = blockIdOption.getOrElse(nextBlockId) val time = System.currentTimeMillis val blockStoreResult = receivedBlockHandler.storeBlock(blockId, receivedBlock) @@ -166,17 +166,17 @@ private[streaming] class ReceiverSupervisorImpl( } /** Report error to the receiver tracker */ - def reportError(message: String, error: Throwable) { + def reportError(message: String, error: Throwable): Unit = { val errorString = Option(error).map(Throwables.getStackTraceAsString).getOrElse("") trackerEndpoint.send(ReportError(streamId, message, errorString)) logWarning("Reported error " + message + " - " + error) } - override protected def onStart() { + override protected def onStart(): Unit = { registeredBlockGenerators.asScala.foreach { _.start() } } - override protected def onStop(message: String, error: Option[Throwable]) { + override protected def onStop(message: String, error: Option[Throwable]): Unit = { receivedBlockHandler 
match { case handler: WriteAheadLogBasedBlockHandler => // Write ahead log should be closed. @@ -193,7 +193,7 @@ private[streaming] class ReceiverSupervisorImpl( trackerEndpoint.askSync[Boolean](msg) } - override protected def onReceiverStop(message: String, error: Option[Throwable]) { + override protected def onReceiverStop(message: String, error: Option[Throwable]): Unit = { logInfo("Deregistering receiver " + streamId) val errorString = error.map(Throwables.getStackTraceAsString).getOrElse("") trackerEndpoint.askSync[Boolean](DeregisterReceiver(streamId, message, errorString)) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala index e85a3b9009c32..58bd56c591d04 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManager.scala @@ -23,6 +23,7 @@ import scala.util.Random import org.apache.spark.{ExecutorAllocationClient, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Streaming._ +import org.apache.spark.resource.ResourceProfile import org.apache.spark.streaming.util.RecurringTimer import org.apache.spark.util.{Clock, Utils} @@ -111,7 +112,11 @@ private[streaming] class ExecutorAllocationManager( logDebug(s"Executors (${allExecIds.size}) = ${allExecIds}") val targetTotalExecutors = math.max(math.min(maxNumExecutors, allExecIds.size + numNewExecutors), minNumExecutors) - client.requestTotalExecutors(targetTotalExecutors, 0, Map.empty) + // Just map the targetTotalExecutors to the default ResourceProfile + client.requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> targetTotalExecutors), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map.empty) logInfo(s"Requested total $targetTotalExecutors 
executors") } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala index 7050d7ef45240..88e7b56895993 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala @@ -35,7 +35,7 @@ class Job(val time: Time, func: () => _) { private var _startTime: Option[Long] = None private var _endTime: Option[Long] = None - def run() { + def run(): Unit = { _result = Try(func()) } @@ -66,7 +66,7 @@ class Job(val time: Time, func: () => _) { _outputOpId } - def setOutputOpId(outputOpId: Int) { + def setOutputOpId(outputOpId: Int): Unit = { if (isSet) { throw new IllegalStateException("Cannot call setOutputOpId more than once") } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala index ddeb3d4547c55..7e8449ee5aa7e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobGenerator.scala @@ -77,7 +77,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { private var eventLoop: EventLoop[JobGeneratorEvent] = null // last batch whose completion,checkpointing and metadata cleanup has been completed - private var lastProcessedBatch: Time = null + @volatile private[streaming] var lastProcessedBatch: Time = null /** Start generation of jobs */ def start(): Unit = synchronized { @@ -166,21 +166,21 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { /** * Callback called when a batch has been completely processed. */ - def onBatchCompletion(time: Time) { + def onBatchCompletion(time: Time): Unit = { eventLoop.post(ClearMetadata(time)) } /** * Callback called when the checkpoint of a batch has been written. 
*/ - def onCheckpointCompletion(time: Time, clearCheckpointDataLater: Boolean) { + def onCheckpointCompletion(time: Time, clearCheckpointDataLater: Boolean): Unit = { if (clearCheckpointDataLater) { eventLoop.post(ClearCheckpointData(time)) } } /** Processes all events */ - private def processEvent(event: JobGeneratorEvent) { + private def processEvent(event: JobGeneratorEvent): Unit = { logDebug("Got event " + event) event match { case GenerateJobs(time) => generateJobs(time) @@ -192,7 +192,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Starts the generator for the first time */ - private def startFirstTime() { + private def startFirstTime(): Unit = { val startTime = new Time(timer.getStartTime()) graph.start(startTime - graph.batchDuration) timer.start(startTime.milliseconds) @@ -200,7 +200,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Restarts the generator based on the information in checkpoint */ - private def restart() { + private def restart(): Unit = { // If manual clock is being used for testing, then // either set the manual clock to the last checkpointed time, // or if the property is defined set it to that time @@ -243,7 +243,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Generate jobs and perform checkpointing for the given `time`. */ - private def generateJobs(time: Time) { + private def generateJobs(time: Time): Unit = { // Checkpoint all RDDs marked for checkpointing to ensure their lineages are // truncated periodically. Otherwise, we may run into stack overflows (SPARK-6847). ssc.sparkContext.setLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS, "true") @@ -262,7 +262,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Clear DStream metadata for the given `time`. 
*/ - private def clearMetadata(time: Time) { + private def clearMetadata(time: Time): Unit = { ssc.graph.clearMetadata(time) // If checkpointing is enabled, then checkpoint, @@ -281,7 +281,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Clear DStream checkpoint data for the given `time`. */ - private def clearCheckpointData(time: Time) { + private def clearCheckpointData(time: Time): Unit = { ssc.graph.clearCheckpointData(time) // All the checkpoint information about which batches have been processed, etc have @@ -293,7 +293,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } /** Perform checkpoint for the given `time`. */ - private def doCheckpoint(time: Time, clearCheckpointDataLater: Boolean) { + private def doCheckpoint(time: Time, clearCheckpointDataLater: Boolean): Unit = { if (shouldCheckpoint && (time - graph.zeroTime).isMultipleOf(ssc.checkpointDuration)) { logInfo("Checkpointing graph for time " + time) ssc.graph.updateCheckpointData(time) @@ -303,7 +303,7 @@ class JobGenerator(jobScheduler: JobScheduler) extends Logging { } } - private def markBatchFullyProcessed(time: Time) { + private def markBatchFullyProcessed(time: Time): Unit = { lastProcessedBatch = time } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 2fa3bf7d5230b..7eea57cc083ed 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -22,16 +22,14 @@ import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.collection.JavaConverters._ import scala.util.Failure -import org.apache.commons.lang3.SerializationUtils - import org.apache.spark.ExecutorAllocationClient import org.apache.spark.internal.Logging import org.apache.spark.internal.io.SparkHadoopWriterUtils import 
org.apache.spark.rdd.RDD import org.apache.spark.streaming._ import org.apache.spark.streaming.api.python.PythonDStream -import org.apache.spark.streaming.ui.UIUtils -import org.apache.spark.util.{EventLoop, ThreadUtils} +import org.apache.spark.ui.{UIUtils => SparkUIUtils} +import org.apache.spark.util.{EventLoop, ThreadUtils, Utils} private[scheduler] sealed trait JobSchedulerEvent @@ -52,8 +50,9 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1) private val jobExecutor = ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor") - private val jobGenerator = new JobGenerator(this) + private[streaming] val jobGenerator = new JobGenerator(this) val clock = jobGenerator.clock + val listenerBus = new StreamingListenerBus(ssc.sparkContext.listenerBus) // These two are created only when scheduler starts. @@ -144,7 +143,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { logInfo("Stopped JobScheduler") } - def submitJobSet(jobSet: JobSet) { + def submitJobSet(jobSet: JobSet): Unit = { if (jobSet.jobs.isEmpty) { logInfo("No jobs added for time " + jobSet.time) } else { @@ -159,7 +158,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { jobSets.asScala.keys.toSeq } - def reportError(msg: String, e: Throwable) { + def reportError(msg: String, e: Throwable): Unit = { eventLoop.post(ErrorReported(msg, e)) } @@ -167,7 +166,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { eventLoop != null } - private def processEvent(event: JobSchedulerEvent) { + private def processEvent(event: JobSchedulerEvent): Unit = { try { event match { case JobStarted(job, startTime) => handleJobStart(job, startTime) @@ -180,7 +179,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { } } - private def handleJobStart(job: Job, startTime: Long) { + private def handleJobStart(job: Job, startTime: Long): Unit = { val 
jobSet = jobSets.get(job.time) val isFirstJobOfJobSet = !jobSet.hasStarted jobSet.handleJobStart(job) @@ -194,7 +193,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { logInfo("Starting job " + job.id + " from job set of time " + jobSet.time) } - private def handleJobCompletion(job: Job, completedTime: Long) { + private def handleJobCompletion(job: Job, completedTime: Long): Unit = { val jobSet = jobSets.get(job.time) jobSet.handleJobCompletion(job) job.setEndTime(completedTime) @@ -218,7 +217,7 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { } } - private def handleError(msg: String, e: Throwable) { + private def handleError(msg: String, e: Throwable): Unit = { logError(msg, e) ssc.waiter.notifyError(e) PythonDStream.stopStreamingContextIfPythonProcessIsDead(e) @@ -227,11 +226,11 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { private class JobHandler(job: Job) extends Runnable with Logging { import JobScheduler._ - def run() { + def run(): Unit = { val oldProps = ssc.sparkContext.getLocalProperties try { - ssc.sparkContext.setLocalProperties(SerializationUtils.clone(ssc.savedProperties.get())) - val formattedTime = UIUtils.formatBatchTime( + ssc.sparkContext.setLocalProperties(Utils.cloneProperties(ssc.savedProperties.get())) + val formattedTime = SparkUIUtils.formatBatchTime( job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false) val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}" val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]" diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 0baedaf275d67..5a5469ac6543a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -39,11 +39,11 @@ case class JobSet( 
jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs - def handleJobStart(job: Job) { + def handleJobStart(job: Job): Unit = { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } - def handleJobCompletion(job: Job) { + def handleJobCompletion(job: Job): Unit = { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala index a46c0c1b25e74..7774e85f778a6 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/RateController.scala @@ -47,7 +47,7 @@ private[streaming] abstract class RateController(val streamUID: Int, rateEstimat /** * An initialization method called both from the constructor and Serialization code. 
*/ - private def init() { + private def init(): Unit = { executionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonSingleThreadExecutor("stream-rate-update")) rateLimit = new AtomicLong(-1L) @@ -72,7 +72,7 @@ private[streaming] abstract class RateController(val streamUID: Int, rateEstimat def getLatestRate(): Long = rateLimit.get() - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { val elements = batchCompleted.batchInfo.streamIdToInputInfo for { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala index a9763cfe04539..6c71b18b46213 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceivedBlockTracker.scala @@ -182,7 +182,7 @@ private[streaming] class ReceivedBlockTracker( } /** Stop the block tracker. */ - def stop() { + def stop(): Unit = { writeAheadLogOption.foreach { _.close() } } @@ -192,7 +192,7 @@ private[streaming] class ReceivedBlockTracker( */ private def recoverPastEvents(): Unit = synchronized { // Insert the recovered block information - def insertAddedBlock(receivedBlockInfo: ReceivedBlockInfo) { + def insertAddedBlock(receivedBlockInfo: ReceivedBlockInfo): Unit = { logTrace(s"Recovery: Inserting added block $receivedBlockInfo") receivedBlockInfo.setBlockIdInvalid() getReceivedBlockQueue(receivedBlockInfo.streamId) += receivedBlockInfo @@ -200,7 +200,7 @@ private[streaming] class ReceivedBlockTracker( // Insert the recovered block-to-batch allocations and removes them from queue of // received blocks. 
- def insertAllocatedBatch(batchTime: Time, allocatedBlocks: AllocatedBlocks) { + def insertAllocatedBatch(batchTime: Time, allocatedBlocks: AllocatedBlocks): Unit = { logTrace(s"Recovery: Inserting allocated batch for time $batchTime to " + s"${allocatedBlocks.streamIdToAllocatedBlocks}") allocatedBlocks.streamIdToAllocatedBlocks.foreach { @@ -212,7 +212,7 @@ private[streaming] class ReceivedBlockTracker( } // Cleanup the batch allocations - def cleanupBatches(batchTimes: Seq[Time]) { + def cleanupBatches(batchTimes: Seq[Time]): Unit = { logTrace(s"Recovery: Cleaning up batches $batchTimes") timeToAllocatedBlocks --= batchTimes } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index 551d376fbc1e7..13cf5cc0e71ea 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -223,7 +223,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false * Clean up the data and metadata of blocks and batches that are strictly * older than the threshold time. 
Note that this does not */ - def cleanupOldBlocksAndBatches(cleanupThreshTime: Time) { + def cleanupOldBlocksAndBatches(cleanupThreshTime: Time): Unit = { // Clean up old block and batch metadata receivedBlockTracker.cleanupOldBatches(cleanupThreshTime, waitForCompletion = false) @@ -309,7 +309,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } /** Deregister a receiver */ - private def deregisterReceiver(streamId: Int, message: String, error: String) { + private def deregisterReceiver(streamId: Int, message: String, error: String): Unit = { val lastErrorTime = if (error == null || error == "") -1 else ssc.scheduler.clock.getTimeMillis() val errorInfo = ReceiverErrorInfo( @@ -345,7 +345,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } /** Report error sent by a receiver */ - private def reportError(streamId: Int, message: String, error: String) { + private def reportError(streamId: Int, message: String, error: String): Unit = { val newReceiverTrackingInfo = receiverTrackingInfos.get(streamId) match { case Some(oldInfo) => val errorInfo = ReceiverErrorInfo(lastErrorMessage = message, lastError = error, @@ -613,7 +613,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false ssc.sparkContext.setCallSite(Option(ssc.getStartSite()).getOrElse(Utils.getCallSite())) val future = ssc.sparkContext.submitJob[Receiver[_], Unit, Unit]( - receiverRDD, startReceiverFunc, Seq(0), (_, _) => Unit, ()) + receiverRDD, startReceiverFunc, Seq(0), (_, _) => (), ()) // We will keep restarting the receiver job until ReceiverTracker is stopped future.onComplete { case Success(_) => @@ -653,7 +653,7 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false } /** Send stop signal to the receivers. 
*/ - private def stopReceivers() { + private def stopReceivers(): Unit = { receiverTrackingInfos.values.flatMap(_.endpoint).foreach { _.send(StopReceiver) } logInfo("Sent stop signal to all " + receiverTrackingInfos.size + " receivers") } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala index b57f9b772f8c6..cc961bb268c9d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListener.scala @@ -70,33 +70,33 @@ case class StreamingListenerReceiverStopped(receiverInfo: ReceiverInfo) trait StreamingListener { /** Called when the streaming has been started */ - def onStreamingStarted(streamingStarted: StreamingListenerStreamingStarted) { } + def onStreamingStarted(streamingStarted: StreamingListenerStreamingStarted): Unit = { } /** Called when a receiver has been started */ - def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { } + def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { } /** Called when a receiver has reported an error */ - def onReceiverError(receiverError: StreamingListenerReceiverError) { } + def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = { } /** Called when a receiver has been stopped */ - def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { } + def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = { } /** Called when a batch of jobs has been submitted for processing. */ - def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted) { } + def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = { } /** Called when processing of a batch of jobs has started. 
*/ - def onBatchStarted(batchStarted: StreamingListenerBatchStarted) { } + def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = { } /** Called when processing of a batch of jobs has completed. */ - def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { } + def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { } /** Called when processing of a job of a batch has started. */ def onOutputOperationStarted( - outputOperationStarted: StreamingListenerOutputOperationStarted) { } + outputOperationStarted: StreamingListenerOutputOperationStarted): Unit = { } /** Called when processing of a job of a batch has completed. */ def onOutputOperationCompleted( - outputOperationCompleted: StreamingListenerOutputOperationCompleted) { } + outputOperationCompleted: StreamingListenerOutputOperationCompleted): Unit = { } } @@ -110,18 +110,18 @@ class StatsReportListener(numBatchInfos: Int = 10) extends StreamingListener { // Queue containing latest completed batches val batchInfos = new Queue[BatchInfo]() - override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted) { + override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted): Unit = { batchInfos.enqueue(batchStarted.batchInfo) if (batchInfos.size > numBatchInfos) batchInfos.dequeue() printStats() } - def printStats() { + def printStats(): Unit = { showMillisDistribution("Total delay: ", _.totalDelay) showMillisDistribution("Processing time: ", _.processingDelay) } - def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]) { + def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]): Unit = { org.apache.spark.scheduler.StatsReportListener.showMillisDistribution( heading, extractDistribution(getMetric)) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala 
b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala index 6a70bf7406b3c..8a10a62f0180b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/StreamingListenerBus.scala @@ -33,7 +33,7 @@ private[streaming] class StreamingListenerBus(sparkListenerBus: LiveListenerBus) * Post a StreamingListenerEvent to the Spark listener bus asynchronously. This event will be * dispatched to all StreamingListeners in the thread of the Spark listener bus. */ - def post(event: StreamingListenerEvent) { + def post(event: StreamingListenerEvent): Unit = { sparkListenerBus.post(new WrappedStreamingListenerEvent(event)) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala index f1070e9029cb5..b5a0e92e69c04 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala @@ -51,7 +51,7 @@ private[ui] abstract class BatchTableBase(tableId: String, batchInterval: Long) protected def baseRow(batch: BatchUIData): Seq[Node] = { val batchTime = batch.batchTime.milliseconds - val formattedBatchTime = UIUtils.formatBatchTime(batchTime, batchInterval) + val formattedBatchTime = SparkUIUtils.formatBatchTime(batchTime, batchInterval) val numRecords = batch.numRecords val schedulingDelay = batch.schedulingDelay val formattedSchedulingDelay = schedulingDelay.map(SparkUIUtils.formatDuration).getOrElse("-") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index f3d2e478e9b2d..04cd063a28713 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ 
b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -37,10 +37,13 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { private def columns: Seq[Node] = { Output Op Id Description - Output Op Duration + Output Op Duration {SparkUIUtils.tooltip("Time taken for all the jobs of this batch to" + + " finish processing from the time they were submitted.", + "top")} Status Job Id - Job Duration + Job Duration {SparkUIUtils.tooltip("Time taken from submission time to completion " + + "time of the job", "top")} Stages: Succeeded/Total Tasks (for all stages): Succeeded/Total Error @@ -322,7 +325,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { throw new IllegalArgumentException(s"Missing id parameter") } val formattedBatchTime = - UIUtils.formatBatchTime(batchTime.milliseconds, streamingListener.batchDuration) + SparkUIUtils.formatBatchTime(batchTime.milliseconds, streamingListener.batchDuration) val batchUIData = streamingListener.getBatchUIData(batchTime).getOrElse { throw new IllegalArgumentException(s"Batch $formattedBatchTime does not exist") @@ -381,7 +384,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { Input - Metadata + Metadata {SparkUIUtils.tooltip("Batch Input Details", "right")} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index ed4c1e484efd2..de73762beb860 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -68,23 +68,23 @@ private[spark] class StreamingJobProgressListener(ssc: StreamingContext) val batchDuration = ssc.graph.batchDuration.milliseconds - override def onStreamingStarted(streamingStarted: StreamingListenerStreamingStarted) { + 
override def onStreamingStarted(streamingStarted: StreamingListenerStreamingStarted): Unit = { _startTime = streamingStarted.time } - override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { + override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { synchronized { receiverInfos(receiverStarted.receiverInfo.streamId) = receiverStarted.receiverInfo } } - override def onReceiverError(receiverError: StreamingListenerReceiverError) { + override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = { synchronized { receiverInfos(receiverError.receiverInfo.streamId) = receiverError.receiverInfo } } - override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { + override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = { synchronized { receiverInfos(receiverStopped.receiverInfo.streamId) = receiverStopped.receiverInfo } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index d16611f412034..d47287b6077f8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -20,79 +20,10 @@ package org.apache.spark.streaming.ui import java.util.concurrent.TimeUnit import javax.servlet.http.HttpServletRequest -import scala.collection.mutable.ArrayBuffer import scala.xml.{Node, Unparsed} import org.apache.spark.internal.Logging -import org.apache.spark.ui._ -import org.apache.spark.ui.{UIUtils => SparkUIUtils} - -/** - * A helper class to generate JavaScript and HTML for both timeline and histogram graphs. 
- * - * @param timelineDivId the timeline `id` used in the html `div` tag - * @param histogramDivId the timeline `id` used in the html `div` tag - * @param data the data for the graph - * @param minX the min value of X axis - * @param maxX the max value of X axis - * @param minY the min value of Y axis - * @param maxY the max value of Y axis - * @param unitY the unit of Y axis - * @param batchInterval if `batchInterval` is not None, we will draw a line for `batchInterval` in - * the graph - */ -private[ui] class GraphUIData( - timelineDivId: String, - histogramDivId: String, - data: Seq[(Long, Double)], - minX: Long, - maxX: Long, - minY: Double, - maxY: Double, - unitY: String, - batchInterval: Option[Double] = None) { - - private var dataJavaScriptName: String = _ - - def generateDataJs(jsCollector: JsCollector): Unit = { - val jsForData = data.map { case (x, y) => - s"""{"x": $x, "y": $y}""" - }.mkString("[", ",", "]") - dataJavaScriptName = jsCollector.nextVariableName - jsCollector.addPreparedStatement(s"var $dataJavaScriptName = $jsForData;") - } - - def generateTimelineHtml(jsCollector: JsCollector): Seq[Node] = { - jsCollector.addPreparedStatement(s"registerTimeline($minY, $maxY);") - if (batchInterval.isDefined) { - jsCollector.addStatement( - "drawTimeline(" + - s"'#$timelineDivId', $dataJavaScriptName, $minX, $maxX, $minY, $maxY, '$unitY'," + - s" ${batchInterval.get}" + - ");") - } else { - jsCollector.addStatement( - s"drawTimeline('#$timelineDivId', $dataJavaScriptName, $minX, $maxX, $minY, $maxY," + - s" '$unitY');") - } -
    - } - - def generateHistogramHtml(jsCollector: JsCollector): Seq[Node] = { - val histogramData = s"$dataJavaScriptName.map(function(d) { return d.y; })" - jsCollector.addPreparedStatement(s"registerHistogram($histogramData, $minY, $maxY);") - if (batchInterval.isDefined) { - jsCollector.addStatement( - "drawHistogram(" + - s"'#$histogramDivId', $histogramData, $minY, $maxY, '$unitY', ${batchInterval.get}" + - ");") - } else { - jsCollector.addStatement( - s"drawHistogram('#$histogramDivId', $histogramData, $minY, $maxY, '$unitY');") - } -
    - } -} +import org.apache.spark.ui.{GraphUIData, JsCollector, UIUtils => SparkUIUtils, WebUIPage} /** * A helper class for "scheduling delay", "processing time" and "total delay" to generate data that @@ -165,8 +96,8 @@ private[ui] class StreamingPage(parent: StreamingTab) private def generateLoadResources(request: HttpServletRequest): Seq[Node] = { // scalastyle:off - - + + // scalastyle:on } @@ -202,7 +133,7 @@ private[ui] class StreamingPage(parent: StreamingTab) private def generateTimeMap(times: Seq[Long]): Seq[Node] = { val js = "var timeFormat = {};\n" + times.map { time => val formattedTime = - UIUtils.formatBatchTime(time, listener.batchDuration, showYYYYMMSS = false) + SparkUIUtils.formatBatchTime(time, listener.batchDuration, showYYYYMMSS = false) s"timeFormat[$time] = '$formattedTime';" }.mkString("\n") @@ -321,7 +252,7 @@ private[ui] class StreamingPage(parent: StreamingTab) if (hasStream) { -
    + Input Rate @@ -351,7 +282,7 @@ private[ui] class StreamingPage(parent: StreamingTab)
    -
    Scheduling Delay {SparkUIUtils.tooltip("Time taken by Streaming scheduler to submit jobs of a batch", "right")}
    +
    Scheduling Delay {SparkUIUtils.tooltip("Time taken by Streaming scheduler to submit jobs of a batch", "top")}
    Avg: {schedulingDelay.formattedAvg}
    @@ -361,7 +292,7 @@ private[ui] class StreamingPage(parent: StreamingTab)
    -
    Processing Time {SparkUIUtils.tooltip("Time taken to process all jobs of a batch", "right")}
    +
    Processing Time {SparkUIUtils.tooltip("Time taken to process all jobs of a batch", "top")}
    Avg: {processingTime.formattedAvg}
    @@ -371,7 +302,7 @@ private[ui] class StreamingPage(parent: StreamingTab)
    -
    Total Delay {SparkUIUtils.tooltip("Total time taken to handle a batch", "right")}
    +
    Total Delay {SparkUIUtils.tooltip("Total time taken to handle a batch", "top")}
    Avg: {totalDelay.formattedAvg}
    @@ -545,52 +476,3 @@ private[ui] object StreamingPage { } -/** - * A helper class that allows the user to add JavaScript statements which will be executed when the - * DOM has finished loading. - */ -private[ui] class JsCollector { - - private var variableId = 0 - - /** - * Return the next unused JavaScript variable name - */ - def nextVariableName: String = { - variableId += 1 - "v" + variableId - } - - /** - * JavaScript statements that will execute before `statements` - */ - private val preparedStatements = ArrayBuffer[String]() - - /** - * JavaScript statements that will execute after `preparedStatements` - */ - private val statements = ArrayBuffer[String]() - - def addPreparedStatement(js: String): Unit = { - preparedStatements += js - } - - def addStatement(js: String): Unit = { - statements += js - } - - /** - * Generate a html snippet that will execute all scripts when the DOM has finished loading. - */ - def toHtml: Seq[Node] = { - val js = - s""" - |$$(document).ready(function() { - | ${preparedStatements.mkString("\n")} - | ${statements.mkString("\n")} - |});""".stripMargin - - - } -} - diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala index 13357db728701..d616b47117f1c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingTab.scala @@ -28,7 +28,7 @@ import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext, sparkUI: SparkUI) extends SparkUITab(sparkUI, "streaming") with Logging { - private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" + private val STATIC_RESOURCE_DIR = "org/apache/spark/ui/static" val parent = sparkUI val listener = ssc.progressListener @@ -36,12 +36,12 @@ private[spark] class StreamingTab(val ssc: StreamingContext, sparkUI: SparkUI) attachPage(new 
StreamingPage(this)) attachPage(new BatchPage(this)) - def attach() { + def attach(): Unit = { parent.attachTab(this) parent.addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } - def detach() { + def detach(): Unit = { parent.detachTab(this) parent.detachHandler("/static/streaming") } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala index c21912ab2816c..dc1af0a940ec7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/UIUtils.scala @@ -17,14 +17,14 @@ package org.apache.spark.streaming.ui -import java.text.SimpleDateFormat -import java.util.{Locale, TimeZone} import java.util.concurrent.TimeUnit import scala.xml.Node import org.apache.commons.text.StringEscapeUtils +import org.apache.spark.ui.{ UIUtils => SparkUIUtils } + private[streaming] object UIUtils { /** @@ -78,59 +78,6 @@ private[streaming] object UIUtils { case TimeUnit.DAYS => milliseconds / 1000.0 / 60.0 / 60.0 / 24.0 } - // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. - private val batchTimeFormat = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = - new SimpleDateFormat("yyyy/MM/dd HH:mm:ss", Locale.US) - } - - private val batchTimeFormatWithMilliseconds = new ThreadLocal[SimpleDateFormat]() { - override def initialValue(): SimpleDateFormat = - new SimpleDateFormat("yyyy/MM/dd HH:mm:ss.SSS", Locale.US) - } - - /** - * If `batchInterval` is less than 1 second, format `batchTime` with milliseconds. Otherwise, - * format `batchTime` without milliseconds. - * - * @param batchTime the batch time to be formatted - * @param batchInterval the batch interval - * @param showYYYYMMSS if showing the `yyyy/MM/dd` part. 
If it's false, the return value wll be - * only `HH:mm:ss` or `HH:mm:ss.SSS` depending on `batchInterval` - * @param timezone only for test - */ - def formatBatchTime( - batchTime: Long, - batchInterval: Long, - showYYYYMMSS: Boolean = true, - timezone: TimeZone = null): String = { - val oldTimezones = - (batchTimeFormat.get.getTimeZone, batchTimeFormatWithMilliseconds.get.getTimeZone) - if (timezone != null) { - batchTimeFormat.get.setTimeZone(timezone) - batchTimeFormatWithMilliseconds.get.setTimeZone(timezone) - } - try { - val formattedBatchTime = - if (batchInterval < 1000) { - batchTimeFormatWithMilliseconds.get.format(batchTime) - } else { - // If batchInterval >= 1 second, don't show milliseconds - batchTimeFormat.get.format(batchTime) - } - if (showYYYYMMSS) { - formattedBatchTime - } else { - formattedBatchTime.substring(formattedBatchTime.indexOf(' ') + 1) - } - } finally { - if (timezone != null) { - batchTimeFormat.get.setTimeZone(oldTimezones._1) - batchTimeFormatWithMilliseconds.get.setTimeZone(oldTimezones._2) - } - } - } - def createOutputOperationFailureForUI(failure: String): String = { if (failure.startsWith("org.apache.spark.Spark")) { // SparkException or SparkDriverExecutionException @@ -164,19 +111,7 @@ private[streaming] object UIUtils { } else { failureReason } - val details = if (isMultiline) { - // scalastyle:off - - +details - ++ - - // scalastyle:on - } else { - "" - } + val details = SparkUIUtils.detailsUINode(isMultiline, failureDetails) if (rowspan == 1) { {failureReasonSummary}{details} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala index 21f3bbe40bfab..d33f83c819086 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala @@ -24,6 +24,7 @@ import 
java.util.concurrent.RejectedExecutionException import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ExecutionContextTaskSupport +import scala.collection.parallel.immutable.ParVector import scala.concurrent.{Await, ExecutionContext, Future} import org.apache.hadoop.conf.Configuration @@ -313,7 +314,7 @@ private[streaming] object FileBasedWriteAheadLog { val groupSize = taskSupport.parallelismLevel.max(8) source.grouped(groupSize).flatMap { group => - val parallelCollection = group.par + val parallelCollection = new ParVector(group.toVector) parallelCollection.tasksupport = taskSupport parallelCollection.map(handler) }.flatten diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogRandomReader.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogRandomReader.scala index 56d4977da0b51..7af018f6d7561 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogRandomReader.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogRandomReader.scala @@ -48,7 +48,7 @@ private[streaming] class FileBasedWriteAheadLogRandomReader(path: String, conf: instream.close() } - private def assertOpen() { + private def assertOpen(): Unit = { HdfsUtils.checkState(!closed, "Stream is closed. 
Create a new Reader to read from the file.") } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogWriter.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogWriter.scala index 1f5c1d4369b53..40d8865b146db 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogWriter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLogWriter.scala @@ -53,13 +53,13 @@ private[streaming] class FileBasedWriteAheadLogWriter(path: String, hadoopConf: stream.close() } - private def flush() { + private def flush(): Unit = { stream.hflush() // Useful for local file system where hflush/sync does not work (HADOOP-7844) stream.getWrappedStream.flush() } - private def assertOpen() { + private def assertOpen(): Unit = { HdfsUtils.checkState(!closed, "Stream is closed. Create a new Writer to write to file.") } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 8cb68b2be4ecf..146577214de17 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -40,7 +40,7 @@ private[streaming] object HdfsUtils { } } else { // we dont' want to use hdfs erasure coding, as that lacks support for append and hflush - SparkHadoopUtil.createNonECFile(dfs, dfsPath) + SparkHadoopUtil.createFile(dfs, dfsPath, false) } } stream @@ -62,7 +62,7 @@ private[streaming] object HdfsUtils { } } - def checkState(state: Boolean, errorMsg: => String) { + def checkState(state: Boolean, errorMsg: => String): Unit = { if (!state) { throw new IllegalStateException(errorMsg) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala 
index 342f20f47a39e..af1f19e9cd98b 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala @@ -36,17 +36,17 @@ class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) private var lastSyncTime = System.nanoTime private var bytesWrittenSinceSync = 0L - override def write(b: Int) { + override def write(b: Int): Unit = { waitToWrite(1) out.write(b) } - override def write(bytes: Array[Byte]) { + override def write(bytes: Array[Byte]): Unit = { write(bytes, 0, bytes.length) } @tailrec - override final def write(bytes: Array[Byte], offset: Int, length: Int) { + override final def write(bytes: Array[Byte], offset: Int, length: Int): Unit = { val writeSize = math.min(length - offset, CHUNK_SIZE) if (writeSize > 0) { waitToWrite(writeSize) @@ -55,16 +55,16 @@ class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) } } - override def flush() { + override def flush(): Unit = { out.flush() } - override def close() { + override def close(): Unit = { out.close() } @tailrec - private def waitToWrite(numBytes: Int) { + private def waitToWrite(numBytes: Int): Unit = { val now = System.nanoTime val elapsedNanosecs = math.max(now - lastSyncTime, 1) val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala index eb9996ece3779..9cdfdb8374322 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala @@ -96,7 +96,7 @@ object RawTextHelper { * Warms up the SparkContext in master and slave by running tasks to force JIT kick in * before real workload starts. 
*/ - def warmUp(sc: SparkContext) { + def warmUp(sc: SparkContext): Unit = { for (i <- 0 to 1) { sc.parallelize(1 to 200000, 1000) .map(_ % 1331).map(_.toString) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala index 9667af97f03bc..5d4fcf8bd1596 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala @@ -34,7 +34,7 @@ import org.apache.spark.util.IntParam */ private[streaming] object RawTextSender extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender ") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala index 62e681e3e9646..3ffb2c12fb2dc 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RecurringTimer.scala @@ -26,7 +26,7 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: private val thread = new Thread("RecurringTimer - " + name) { setDaemon(true) - override def run() { loop } + override def run(): Unit = { loop } } @volatile private var prevTime = -1L @@ -100,7 +100,7 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: /** * Repeatedly call the callback every interval. 
*/ - private def loop() { + private def loop(): Unit = { try { while (!stopped) { triggerActionForNextInterval() @@ -115,11 +115,11 @@ class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: private[streaming] object RecurringTimer extends Logging { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { var lastRecurTime = 0L val period = 1000 - def onRecur(time: Long) { + def onRecur(time: Long): Unit = { val currentTime = System.currentTimeMillis() logInfo("" + currentTime + ": " + (currentTime - lastRecurTime)) lastRecurTime = currentTime diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java index 3f4e6ddb216ec..7037de1526c9c 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaWriteAheadLogSuite.java @@ -120,6 +120,6 @@ public void testCustomWAL() { while (dataIterator.hasNext()) { readData.add(JavaUtils.bytesToString(dataIterator.next())); } - Assert.assertEquals(readData, Arrays.asList("data3", "data4")); + Assert.assertEquals(Arrays.asList("data3", "data4"), readData); } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala index 287a43ac689ed..742eae50e159b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala @@ -732,7 +732,7 @@ class BasicOperationsSuite extends TestSuiteBase { val blockRdds = new mutable.HashMap[Time, BlockRDD[_]] val persistentRddIds = new mutable.HashMap[Time, Int] - def collectRddInfo() { // get all RDD info required for verification + def collectRddInfo(): Unit = { // get all RDD info required for verification 
networkStream.generatedRDDs.foreach { case (time, rdd) => blockRdds(time) = rdd.asInstanceOf[BlockRDD[_]] } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 55fdd4c82ac75..238ef1e2367a0 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -39,8 +39,7 @@ import org.apache.spark.internal.config._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.scheduler._ -import org.apache.spark.util.{Clock, ManualClock, MutableURLClassLoader, ResetSystemProperties, - Utils} +import org.apache.spark.util.{Clock, ManualClock, MutableURLClassLoader, ResetSystemProperties, Utils} /** * A input stream that records the times of restore() invoked @@ -55,7 +54,7 @@ class CheckpointInputDStream(_ssc: StreamingContext) extends InputDStream[Int](_ class FileInputDStreamCheckpointData extends DStreamCheckpointData(this) { @transient var restoredTimes = 0 - override def restore() { + override def restore(): Unit = { restoredTimes += 1 super.restore() } @@ -85,7 +84,7 @@ trait DStreamCheckpointTester { self: SparkFunSuite => numBatchesBeforeRestart: Int, batchDuration: Duration = Milliseconds(500), stopSparkContextAfterTest: Boolean = true - ) { + ): Unit = { require(numBatchesBeforeRestart < expectedOutput.size, "Number of batches before context restart less than number of expected output " + "(i.e. number of total batches to run)") @@ -206,24 +205,21 @@ trait DStreamCheckpointTester { self: SparkFunSuite => * the checkpointing of a DStream's RDDs as well as the checkpointing of * the whole DStream graph. 
*/ -class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester +class CheckpointSuite extends TestSuiteBase with LocalStreamingContext with DStreamCheckpointTester with ResetSystemProperties { - var ssc: StreamingContext = null - override def batchDuration: Duration = Milliseconds(500) - override def beforeFunction() { - super.beforeFunction() + override def beforeEach(): Unit = { + super.beforeEach() Utils.deleteRecursively(new File(checkpointDir)) } - override def afterFunction() { + override def afterEach(): Unit = { try { - if (ssc != null) { ssc.stop() } Utils.deleteRecursively(new File(checkpointDir)) } finally { - super.afterFunction() + super.afterEach() } } @@ -241,8 +237,8 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester val stateStreamCheckpointInterval = Seconds(1) val fs = FileSystem.getLocal(new Configuration()) // this ensure checkpointing occurs at least once - val firstNumBatches = (stateStreamCheckpointInterval / batchDuration).toLong * 2 - val secondNumBatches = firstNumBatches + val firstNumBatches = (stateStreamCheckpointInterval / batchDuration).toLong + val secondNumBatches = firstNumBatches * 2 // Setup the streams val input = (1 to 10).map(_ => Seq("a")).toSeq @@ -255,17 +251,28 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester .checkpoint(stateStreamCheckpointInterval) .map(t => (t._1, t._2)) } - var ssc = setupStreams(input, operation) + ssc = setupStreams(input, operation) var stateStream = ssc.graph.getOutputStreams().head.dependencies.head.dependencies.head + def waitForCompletionOfBatch(numBatches: Long): Unit = { + eventually(timeout(10.seconds), interval(50.millis)) { + val lastProcessed = ssc.scheduler.jobGenerator.lastProcessedBatch + assert(lastProcessed != null && + lastProcessed >= Time(batchDuration.milliseconds * numBatches)) + } + } + // Run till a time such that at least one RDD in the stream should have been checkpointed, // then check whether some RDD has 
been checkpointed or not ssc.start() advanceTimeWithRealDelay(ssc, firstNumBatches) + waitForCompletionOfBatch(firstNumBatches) + logInfo("Checkpoint data of state stream = \n" + stateStream.checkpointData) - assert(!stateStream.checkpointData.currentCheckpointFiles.isEmpty, + var currCheckpointFiles = stateStream.checkpointData.currentCheckpointFiles + assert(!currCheckpointFiles.isEmpty, "No checkpointed RDDs in state stream before first failure") - stateStream.checkpointData.currentCheckpointFiles.foreach { + currCheckpointFiles.foreach { case (time, file) => assert(fs.exists(new Path(file)), "Checkpoint file '" + file +"' for time " + time + " for state stream before first failure does not exist") @@ -273,8 +280,10 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester // Run till a further time such that previous checkpoint files in the stream would be deleted // and check whether the earlier checkpoint files are deleted - val checkpointFiles = stateStream.checkpointData.currentCheckpointFiles.map(x => new File(x._2)) - advanceTimeWithRealDelay(ssc, secondNumBatches) + currCheckpointFiles = stateStream.checkpointData.currentCheckpointFiles + val checkpointFiles = currCheckpointFiles.map(x => new File(x._2)) + advanceTimeWithRealDelay(ssc, secondNumBatches - firstNumBatches) + waitForCompletionOfBatch(secondNumBatches) checkpointFiles.foreach(file => assert(!file.exists, "Checkpoint file '" + file + "' was not deleted")) ssc.stop() @@ -287,14 +296,15 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester assert(!stateStream.generatedRDDs.isEmpty, "No restored RDDs in state stream after recovery from first failure") - // Run one batch to generate a new checkpoint file and check whether some RDD // is present in the checkpoint data or not ssc.start() advanceTimeWithRealDelay(ssc, 1) - assert(!stateStream.checkpointData.currentCheckpointFiles.isEmpty, + waitForCompletionOfBatch(secondNumBatches + 1) + currCheckpointFiles = 
stateStream.checkpointData.currentCheckpointFiles + assert(!currCheckpointFiles.isEmpty, "No checkpointed RDDs in state stream before second failure") - stateStream.checkpointData.currentCheckpointFiles.foreach { + currCheckpointFiles.foreach { case (time, file) => assert(fs.exists(new Path(file)), "Checkpoint file '" + file +"' for time " + time + " for state stream before seconds failure does not exist") @@ -410,6 +420,33 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester assert(restoredConf1.get("spark.driver.port") !== "9999") } + test("SPARK-30199 get ui port and blockmanager port") { + val conf = Map("spark.ui.port" -> "30001", "spark.blockManager.port" -> "30002") + conf.foreach { case (k, v) => System.setProperty(k, v) } + ssc = new StreamingContext(master, framework, batchDuration) + conf.foreach { case (k, v) => assert(ssc.conf.get(k) === v) } + + val cp = new Checkpoint(ssc, Time(1000)) + ssc.stop() + + // Serialize/deserialize to simulate write to storage and reading it back + val newCp = Utils.deserialize[Checkpoint](Utils.serialize(cp)) + + val newCpConf = newCp.createSparkConf() + conf.foreach { case (k, v) => assert(newCpConf.contains(k) && newCpConf.get(k) === v) } + + // Check if all the parameters have been restored + ssc = new StreamingContext(null, newCp, null) + conf.foreach { case (k, v) => assert(ssc.conf.get(k) === v) } + ssc.stop() + + // If port numbers are not set in system property, these parameters should not be presented + // in the newly recovered conf. + conf.foreach(kv => System.clearProperty(kv._1)) + val newCpConf1 = newCp.createSparkConf() + conf.foreach { case (k, _) => assert(!newCpConf1.contains(k)) } + } + // This tests whether the system can recover from a master failure with simple // non-stateful operations. This assumes as reliable, replayable input // source - TestInputDStream. 
@@ -847,6 +884,23 @@ class CheckpointSuite extends TestSuiteBase with DStreamCheckpointTester checkpointWriter.stop() } + test("SPARK-28912: Fix MatchError in getCheckpointFiles") { + withTempDir { tempDir => + val fs = FileSystem.get(tempDir.toURI, new Configuration) + val checkpointDir = tempDir.getAbsolutePath + "/checkpoint-01" + + assert(Checkpoint.getCheckpointFiles(checkpointDir, Some(fs)).length === 0) + + // Ignore files whose parent path match. + fs.create(new Path(checkpointDir, "this-is-matched-before-due-to-parent-path")).close() + assert(Checkpoint.getCheckpointFiles(checkpointDir, Some(fs)).length === 0) + + // Ignore directories whose names match. + fs.mkdirs(new Path(checkpointDir, "checkpoint-1000000000")) + assert(Checkpoint.getCheckpointFiles(checkpointDir, Some(fs)).length === 0) + } + } + test("SPARK-6847: stack overflow when updateStateByKey is followed by a checkpointed dstream") { // In this test, there are two updateStateByKey operators. The RDD DAG is as follows: // diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala index 2ab600ab817e0..0576bf560f30e 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamClosureSuite.scala @@ -29,24 +29,14 @@ import org.apache.spark.util.ReturnStatementInClosureException /** * Test that closures passed to DStream operations are actually cleaned. 
*/ -class DStreamClosureSuite extends SparkFunSuite with BeforeAndAfterAll { - private var ssc: StreamingContext = null +class DStreamClosureSuite extends SparkFunSuite with LocalStreamingContext with BeforeAndAfterAll { + override protected def beforeEach(): Unit = { + super.beforeEach() - override def beforeAll(): Unit = { - super.beforeAll() val sc = new SparkContext("local", "test") ssc = new StreamingContext(sc, Seconds(1)) } - override def afterAll(): Unit = { - try { - ssc.stop(stopSparkContext = true) - ssc = null - } finally { - super.afterAll() - } - } - test("user provided closures are actually cleaned") { val dstream = new DummyInputDStream(ssc) val pairDstream = dstream.map { i => (i, i) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala index 94f1bcebc3a39..36036fcd44b04 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/DStreamScopeSuite.scala @@ -19,39 +19,38 @@ package org.apache.spark.streaming import scala.collection.mutable.ArrayBuffer -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} - import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.streaming.dstream.DStream -import org.apache.spark.streaming.ui.UIUtils +import org.apache.spark.ui.{UIUtils => SparkUIUtils} import org.apache.spark.util.ManualClock /** * Tests whether scope information is passed from DStream operations to RDDs correctly. 
*/ -class DStreamScopeSuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterAll { - private var ssc: StreamingContext = null - private val batchDuration: Duration = Seconds(1) +class DStreamScopeSuite + extends SparkFunSuite + with LocalStreamingContext { + + override def beforeEach(): Unit = { + super.beforeEach() - override def beforeAll(): Unit = { - super.beforeAll() val conf = new SparkConf().setMaster("local").setAppName("test") conf.set("spark.streaming.clock", classOf[ManualClock].getName()) + val batchDuration: Duration = Seconds(1) ssc = new StreamingContext(new SparkContext(conf), batchDuration) + + assertPropertiesNotSet() } - override def afterAll(): Unit = { + override def afterEach(): Unit = { try { - ssc.stop(stopSparkContext = true) + assertPropertiesNotSet() } finally { - super.afterAll() + super.afterEach() } } - before { assertPropertiesNotSet() } - after { assertPropertiesNotSet() } - test("dstream without scope") { val dummyStream = new DummyDStream(ssc) dummyStream.initialize(Time(0)) @@ -213,7 +212,7 @@ class DStreamScopeSuite extends SparkFunSuite with BeforeAndAfter with BeforeAnd rddScope: RDDOperationScope, batchTime: Long): Unit = { val (baseScopeId, baseScopeName) = (baseScope.id, baseScope.name) - val formattedBatchTime = UIUtils.formatBatchTime( + val formattedBatchTime = SparkUIUtils.formatBatchTime( batchTime, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false) assert(rddScope.id === s"${baseScopeId}_$batchTime") assert(rddScope.name.replaceAll("\\n", " ") === s"$baseScopeName @ $formattedBatchTime") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala index 0792770442055..53ef840864bce 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/InputStreamsSuite.scala @@ -449,9 +449,9 @@ class InputStreamsSuite 
extends TestSuiteBase with BeforeAndAfter { withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => class TestInputDStream extends InputDStream[String](ssc) { - def start() {} + def start(): Unit = {} - def stop() {} + def stop(): Unit = {} def compute(validTime: Time): Option[RDD[String]] = None } @@ -473,7 +473,7 @@ class InputStreamsSuite extends TestSuiteBase with BeforeAndAfter { } } - def testFileStream(newFilesOnly: Boolean) { + def testFileStream(newFilesOnly: Boolean): Unit = { withTempDir { testDir => val batchDuration = Seconds(2) // Create a file that exists before the StreamingContext is created: @@ -537,7 +537,7 @@ class TestServer(portToBind: Int = 0) extends Logging with Assertions { private val startLatch = new CountDownLatch(1) val servingThread = new Thread() { - override def run() { + override def run(): Unit = { try { while (true) { logInfo("Accepting connections on port " + port) @@ -608,9 +608,9 @@ class TestServer(portToBind: Int = 0) extends Logging with Assertions { } } - def send(msg: String) { queue.put(msg) } + def send(msg: String): Unit = { queue.put(msg) } - def stop() { servingThread.interrupt() } + def stop(): Unit = { servingThread.interrupt() } def port: Int = serverSocket.getLocalPort } @@ -621,10 +621,10 @@ class MultiThreadTestReceiver(numThreads: Int, numRecordsPerThread: Int) lazy val executorPool = Executors.newFixedThreadPool(numThreads) lazy val finishCount = new AtomicInteger(0) - def onStart() { + def onStart(): Unit = { (1 to numThreads).map(threadId => { val runnable = new Runnable { - def run() { + def run(): Unit = { (1 to numRecordsPerThread).foreach(i => store(threadId * numRecordsPerThread + i) ) if (finishCount.incrementAndGet == numThreads) { @@ -637,7 +637,7 @@ class MultiThreadTestReceiver(numThreads: Int, numRecordsPerThread: Int) }) } - def onStop() { + def onStop(): Unit = { executorPool.shutdown() } } diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/LocalStreamingContext.scala b/streaming/src/test/scala/org/apache/spark/streaming/LocalStreamingContext.scala new file mode 100644 index 0000000000000..5bf24a9705dc9 --- /dev/null +++ b/streaming/src/test/scala/org/apache/spark/streaming/LocalStreamingContext.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming + +import org.scalatest.{BeforeAndAfterEach, Suite} + +import org.apache.spark.SparkContext +import org.apache.spark.internal.Logging + +/** + * Manages a local `ssc` `StreamingContext` variable, correctly stopping it after each test. + * Note that it also stops active SparkContext if `stopSparkContext` is set to true (default). + * In most cases you may want to leave it, to isolate environment for SparkContext in each test. 
+ */ +trait LocalStreamingContext extends BeforeAndAfterEach { self: Suite => + + @transient var ssc: StreamingContext = _ + @transient var stopSparkContext: Boolean = true + + override def afterEach(): Unit = { + try { + resetStreamingContext() + } finally { + super.afterEach() + } + } + + def resetStreamingContext(): Unit = { + LocalStreamingContext.stop(ssc, stopSparkContext) + ssc = null + } +} + +object LocalStreamingContext extends Logging { + def stop(ssc: StreamingContext, stopSparkContext: Boolean): Unit = { + try { + if (ssc != null) { + ssc.stop(stopSparkContext = stopSparkContext) + } + } finally { + if (stopSparkContext) { + ensureNoActiveSparkContext() + } + } + } + + /** + * Clean up active SparkContext: try to stop first if there's an active SparkContext. + * If it fails to stop, log warning message and clear active SparkContext to avoid + * interfere between tests. + */ + def ensureNoActiveSparkContext(): Unit = { + // if SparkContext is still active, try to clean up + SparkContext.getActive match { + case Some(sc) => + try { + sc.stop() + } catch { + case e: Throwable => + logError("Exception trying to stop SparkContext, clear active SparkContext...", e) + SparkContext.clearActiveContext() + throw e + } + case _ => + } + } + +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala index 06c0c2aa97ee1..b2b8d2f41fc80 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MapWithStateSuite.scala @@ -23,46 +23,36 @@ import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.reflect.ClassTag -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} import org.scalatest.PrivateMethodTester._ import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.streaming.dstream.{DStream, 
InternalMapWithStateDStream, MapWithStateDStream, MapWithStateDStreamImpl} import org.apache.spark.util.{ManualClock, Utils} -class MapWithStateSuite extends SparkFunSuite - with DStreamCheckpointTester with BeforeAndAfterAll with BeforeAndAfter { +class MapWithStateSuite extends SparkFunSuite with LocalStreamingContext + with DStreamCheckpointTester { private var sc: SparkContext = null protected var checkpointDir: File = null protected val batchDuration = Seconds(1) - before { - StreamingContext.getActive().foreach { _.stop(stopSparkContext = false) } - checkpointDir = Utils.createTempDir(namePrefix = "checkpoint") - } + override def beforeEach(): Unit = { + super.beforeEach() - after { - StreamingContext.getActive().foreach { _.stop(stopSparkContext = false) } - if (checkpointDir != null) { - Utils.deleteRecursively(checkpointDir) - } - } - - override def beforeAll(): Unit = { - super.beforeAll() val conf = new SparkConf().setMaster("local").setAppName("MapWithStateSuite") conf.set("spark.streaming.clock", classOf[ManualClock].getName()) sc = new SparkContext(conf) + + checkpointDir = Utils.createTempDir(namePrefix = "checkpoint") } - override def afterAll(): Unit = { + override def afterEach(): Unit = { try { - if (sc != null) { - sc.stop() + if (checkpointDir != null) { + Utils.deleteRecursively(checkpointDir) } } finally { - super.afterAll() + super.afterEach() } } @@ -446,7 +436,8 @@ class MapWithStateSuite extends SparkFunSuite } test("mapWithState - checkpoint durations") { - val privateMethod = PrivateMethod[InternalMapWithStateDStream[_, _, _, _]]('internalStream) + val privateMethod = + PrivateMethod[InternalMapWithStateDStream[_, _, _, _]](Symbol("internalStream")) def testCheckpointDuration( batchDuration: Duration, @@ -571,7 +562,7 @@ class MapWithStateSuite extends SparkFunSuite (collectedOutputs.asScala.toSeq, collectedStateSnapshots.asScala.toSeq) } - private def assert[U](expected: Seq[Seq[U]], collected: Seq[Seq[U]], typ: String) { + private def 
assert[U](expected: Seq[Seq[U]], collected: Seq[Seq[U]], typ: String): Unit = { val debugString = "\nExpected:\n" + expected.mkString("\n") + "\nCollected:\n" + collected.mkString("\n") assert(expected.size === collected.size, diff --git a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala index cf8dd10571f47..d0a5ababc7cac 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/MasterFailureTest.scala @@ -30,6 +30,7 @@ import scala.util.Random import com.google.common.io.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.scalatest.Assertions._ import org.apache.spark.internal.Logging import org.apache.spark.streaming.dstream.DStream @@ -42,7 +43,7 @@ object MasterFailureTest extends Logging { @volatile var killCount = 0 @volatile var setupCalled = false - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { // scalastyle:off println if (args.size < 2) { println( @@ -64,7 +65,7 @@ object MasterFailureTest extends Logging { // scalastyle:on println } - def testMap(directory: String, numBatches: Int, batchDuration: Duration) { + def testMap(directory: String, numBatches: Int, batchDuration: Duration): Unit = { // Input: time=1 ==> [ 1 ] , time=2 ==> [ 2 ] , time=3 ==> [ 3 ] , ... val input = (1 to numBatches).map(_.toString).toSeq // Expected output: time=1 ==> [ 1 ] , time=2 ==> [ 2 ] , time=3 ==> [ 3 ] , ... @@ -86,7 +87,7 @@ object MasterFailureTest extends Logging { } - def testUpdateStateByKey(directory: String, numBatches: Int, batchDuration: Duration) { + def testUpdateStateByKey(directory: String, numBatches: Int, batchDuration: Duration): Unit = { // Input: time=1 ==> [ a ] , time=2 ==> [ a, a ] , time=3 ==> [ a, a, a ] , ... 
val input = (1 to numBatches).map(i => (1 to i).map(_ => "a").mkString(" ")).toSeq // Expected output: time=1 ==> [ (a, 1) ] , time=2 ==> [ (a, 3) ] , time=3 ==> [ (a,6) ] , ... @@ -293,7 +294,7 @@ object MasterFailureTest extends Logging { * duplicate batch outputs of values from the `output`. As a result, the * expected output should not have consecutive batches with the same values as output. */ - private def verifyOutput[T: ClassTag](output: Seq[T], expectedOutput: Seq[T]) { + private def verifyOutput[T: ClassTag](output: Seq[T], expectedOutput: Seq[T]): Unit = { // Verify whether expected outputs do not consecutive batches with same output for (i <- 0 until expectedOutput.size - 1) { assert(expectedOutput(i) != expectedOutput(i + 1), @@ -315,7 +316,7 @@ object MasterFailureTest extends Logging { } /** Resets counter to prepare for the test */ - private def reset() { + private def reset(): Unit = { killed = false killCount = 0 setupCalled = false @@ -328,7 +329,7 @@ object MasterFailureTest extends Logging { private[streaming] class KillingThread(ssc: StreamingContext, maxKillWaitTime: Long) extends Thread with Logging { - override def run() { + override def run(): Unit = { try { // If it is the first killing, then allow the first checkpoint to be created var minKillWaitTime = if (MasterFailureTest.killCount == 0) 5000 else 2000 @@ -362,7 +363,7 @@ private[streaming] class FileGeneratingThread(input: Seq[String], testDir: Path, interval: Long) extends Thread with Logging { - override def run() { + override def run(): Unit = { val localTestDir = Utils.createTempDir() var fs = testDir.getFileSystem(new Configuration()) val maxTries = 3 diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala index c8f424af9af01..0976494b6d094 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala +++ 
b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.streaming import java.io.File import java.nio.ByteBuffer +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ import scala.reflect.ClassTag @@ -87,9 +88,12 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) rpcEnv = RpcEnv.create("test", "localhost", 0, conf, securityMgr) conf.set("spark.driver.port", rpcEnv.address.port.toString) + val blockManagerInfo = new mutable.HashMap[BlockManagerId, BlockManagerInfo]() blockManagerMaster = new BlockManagerMaster(rpcEnv.setupEndpoint("blockmanager", new BlockManagerMasterEndpoint(rpcEnv, true, conf, - new LiveListenerBus(conf), None)), conf, true) + new LiveListenerBus(conf), None, blockManagerInfo)), + rpcEnv.setupEndpoint("blockmanagerHeartbeat", + new BlockManagerMasterHeartbeatEndpoint(rpcEnv, true, blockManagerInfo)), conf, true) storageLevel = StorageLevel.MEMORY_ONLY_SER blockManager = createBlockManager(blockManagerSize, conf) @@ -242,7 +246,8 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) } } - private def testCountWithBlockManagerBasedBlockHandler(isBlockManagerBasedBlockHandler: Boolean) { + private def testCountWithBlockManagerBasedBlockHandler( + isBlockManagerBasedBlockHandler: Boolean): Unit = { // ByteBufferBlock-MEMORY_ONLY testRecordcount(isBlockManagerBasedBlockHandler, StorageLevel.MEMORY_ONLY, ByteBufferBlock(ByteBuffer.wrap(Array.tabulate(100)(i => i.toByte))), blockManager, None) @@ -298,7 +303,7 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) receivedBlock: ReceivedBlock, bManager: BlockManager, expectedNumRecords: Option[Long] - ) { + ): Unit = { blockManager = bManager storageLevel = sLevel var bId: StreamBlockId = null @@ -335,10 +340,11 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) * using the given 
verification function */ private def testBlockStoring(receivedBlockHandler: ReceivedBlockHandler) - (verifyFunc: (Seq[String], Seq[StreamBlockId], Seq[ReceivedBlockStoreResult]) => Unit) { + (verifyFunc: (Seq[String], Seq[StreamBlockId], Seq[ReceivedBlockStoreResult]) => Unit) + : Unit = { val data = Seq.tabulate(100) { _.toString } - def storeAndVerify(blocks: Seq[ReceivedBlock]) { + def storeAndVerify(blocks: Seq[ReceivedBlock]): Unit = { blocks should not be empty val (blockIds, storeResults) = storeBlocks(receivedBlockHandler, blocks) withClue(s"Testing with ${blocks.head.getClass.getSimpleName}s:") { @@ -361,7 +367,7 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) } /** Test error handling when blocks that cannot be stored */ - private def testErrorHandling(receivedBlockHandler: ReceivedBlockHandler) { + private def testErrorHandling(receivedBlockHandler: ReceivedBlockHandler): Unit = { // Handle error in iterator (e.g. divide-by-zero error) intercept[Exception] { val iterator = (10 to (-10, -1)).toIterator.map { _ / 0 } @@ -376,12 +382,14 @@ abstract class BaseReceivedBlockHandlerSuite(enableEncryption: Boolean) } /** Instantiate a BlockManagerBasedBlockHandler and run a code with it */ - private def withBlockManagerBasedBlockHandler(body: BlockManagerBasedBlockHandler => Unit) { + private def withBlockManagerBasedBlockHandler( + body: BlockManagerBasedBlockHandler => Unit): Unit = { body(new BlockManagerBasedBlockHandler(blockManager, storageLevel)) } /** Instantiate a WriteAheadLogBasedBlockHandler and run a code with it */ - private def withWriteAheadLogBasedBlockHandler(body: WriteAheadLogBasedBlockHandler => Unit) { + private def withWriteAheadLogBasedBlockHandler( + body: WriteAheadLogBasedBlockHandler => Unit): Unit = { require(WriteAheadLogUtils.getRollingIntervalSecs(conf, isDriver = false) === 1) val receivedBlockHandler = new WriteAheadLogBasedBlockHandler(blockManager, serializerManager, 1, storageLevel, conf, 
hadoopConf, tempDirectory.toString, manualClock) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala index 0b15f00eba499..368411cc2214b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockTrackerSuite.scala @@ -184,7 +184,7 @@ class ReceivedBlockTrackerSuite // Set the time increment level to twice the rotation interval so that every increment creates // a new log file - def incrementTime() { + def incrementTime(): Unit = { val timeIncrementMillis = 2000L manualClock.advance(timeIncrementMillis) } @@ -197,7 +197,7 @@ class ReceivedBlockTrackerSuite } // Print the data present in the log ahead files in the log directory - def printLogFiles(message: String) { + def printLogFiles(message: String): Unit = { val fileContents = getWriteAheadLogFiles().map { file => (s"\n>>>>> $file: <<<<<\n${getWrittenLogData(file).mkString("\n")}") }.mkString("\n") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala index 0349e11224cfc..5e2ce25c7c441 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala @@ -19,8 +19,6 @@ package org.apache.spark.streaming import scala.util.Random -import org.scalatest.BeforeAndAfterAll - import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.rdd.BlockRDD import org.apache.spark.storage.{StorageLevel, StreamBlockId} @@ -30,15 +28,9 @@ import org.apache.spark.streaming.receiver.{BlockManagerBasedStoreResult, Receiv import org.apache.spark.streaming.scheduler.ReceivedBlockInfo import 
org.apache.spark.streaming.util.{WriteAheadLogRecordHandle, WriteAheadLogUtils} -class ReceiverInputDStreamSuite extends TestSuiteBase with BeforeAndAfterAll { - - override def afterAll(): Unit = { - try { - StreamingContext.getActive().foreach(_.stop()) - } finally { - super.afterAll() - } - } +class ReceiverInputDStreamSuite + extends TestSuiteBase + with LocalStreamingContext { testWithoutWAL("createBlockRDD creates empty BlockRDD when no block info") { receiverStream => val rdd = receiverStream.createBlockRDD(Time(0), Seq.empty) @@ -127,7 +119,7 @@ class ReceiverInputDStreamSuite extends TestSuiteBase with BeforeAndAfterAll { conf.setMaster("local[4]").setAppName("ReceiverInputDStreamSuite") conf.set(WriteAheadLogUtils.RECEIVER_WAL_ENABLE_CONF_KEY, enableWAL.toString) require(WriteAheadLogUtils.enableReceiverLog(conf) === enableWAL) - val ssc = new StreamingContext(conf, Seconds(1)) + ssc = new StreamingContext(conf, Seconds(1)) val receiverStream = new ReceiverInputDStream[Int](ssc) { override def getReceiver(): Receiver[Int] = null } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 6b664b7a7dfd4..b07fd733953db 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -52,7 +52,7 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { // Thread that runs the executor val executingThread = new Thread() { - override def run() { + override def run(): Unit = { executor.start() executorStarted.release(1) executor.awaitTermination() @@ -73,7 +73,7 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { executorStarted.acquire() // Verify that receiver was started - assert(receiver.onStartCalled) + assert(receiver.callsRecorder.calls === Seq("onStart")) assert(executor.isReceiverStarted) 
assert(receiver.isStarted) assert(!receiver.isStopped()) @@ -106,19 +106,22 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { assert(executor.errors.head.eq(exception)) // Verify restarting actually stops and starts the receiver - receiver.restart("restarting", null, 600) - eventually(timeout(300.milliseconds), interval(10.milliseconds)) { - // receiver will be stopped async - assert(receiver.isStopped) - assert(receiver.onStopCalled) - } - eventually(timeout(1.second), interval(10.milliseconds)) { - // receiver will be started async - assert(receiver.onStartCalled) - assert(executor.isReceiverStarted) + executor.callsRecorder.reset() + receiver.callsRecorder.reset() + receiver.restart("restarting", null, 100) + eventually(timeout(10.seconds), interval(10.milliseconds)) { + // below verification ensures for now receiver is already restarted assert(receiver.isStarted) assert(!receiver.isStopped) assert(receiver.receiving) + + // both receiver supervisor and receiver should be stopped first, and started + assert(executor.callsRecorder.calls === Seq("onReceiverStop", "onReceiverStart")) + assert(receiver.callsRecorder.calls === Seq("onStop", "onStart")) + + // check whether the delay between stop and start is respected + assert(executor.callsRecorder.timestamps.reverse.reduceLeft { _ - _ } >= 100) + assert(receiver.callsRecorder.timestamps.reverse.reduceLeft { _ - _ } >= 100) } // Verify that stopping actually stops the thread @@ -229,7 +232,7 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { } } - def printLogFiles(message: String, files: Seq[String]) { + def printLogFiles(message: String, files: Seq[String]): Unit = { logInfo(s"$message (${files.size} files):\n" + files.mkString("\n")) } @@ -290,42 +293,53 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { val arrayBuffers = new ArrayBuffer[ArrayBuffer[_]] val errors = new ArrayBuffer[Throwable] + // tracks calls of 
"onReceiverStart", "onReceiverStop" + val callsRecorder = new MethodsCallRecorder() + /** Check if all data structures are clean */ def isAllEmpty: Boolean = { singles.isEmpty && byteBuffers.isEmpty && iterators.isEmpty && arrayBuffers.isEmpty && errors.isEmpty } - def pushSingle(data: Any) { + def pushSingle(data: Any): Unit = { singles += data } def pushBytes( bytes: ByteBuffer, optionalMetadata: Option[Any], - optionalBlockId: Option[StreamBlockId]) { + optionalBlockId: Option[StreamBlockId]): Unit = { byteBuffers += bytes } def pushIterator( iterator: Iterator[_], optionalMetadata: Option[Any], - optionalBlockId: Option[StreamBlockId]) { + optionalBlockId: Option[StreamBlockId]): Unit = { iterators += iterator } def pushArrayBuffer( arrayBuffer: ArrayBuffer[_], optionalMetadata: Option[Any], - optionalBlockId: Option[StreamBlockId]) { + optionalBlockId: Option[StreamBlockId]): Unit = { arrayBuffers += arrayBuffer } - def reportError(message: String, throwable: Throwable) { + def reportError(message: String, throwable: Throwable): Unit = { errors += throwable } - override protected def onReceiverStart(): Boolean = true + override protected def onReceiverStart(): Boolean = { + callsRecorder.record() + true + } + + override protected def onReceiverStop(message: String, error: Option[Throwable]): Unit = { + callsRecorder.record() + super.onReceiverStop(message, error) + } override def createBlockGenerator( blockGeneratorListener: BlockGeneratorListener): BlockGenerator = { @@ -341,17 +355,17 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { val arrayBuffers = new ArrayBuffer[ArrayBuffer[Int]] val errors = new ArrayBuffer[Throwable] - def onAddData(data: Any, metadata: Any) { } + def onAddData(data: Any, metadata: Any): Unit = { } - def onGenerateBlock(blockId: StreamBlockId) { } + def onGenerateBlock(blockId: StreamBlockId): Unit = { } - def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]) { + def onPushBlock(blockId: 
StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = { val bufferOfInts = arrayBuffer.map(_.asInstanceOf[Int]) arrayBuffers += bufferOfInts Thread.sleep(0) } - def onError(message: String, throwable: Throwable) { + def onError(message: String, throwable: Throwable): Unit = { errors += throwable } } @@ -363,36 +377,55 @@ class ReceiverSuite extends TestSuiteBase with TimeLimits with Serializable { class FakeReceiver(sendData: Boolean = false) extends Receiver[Int](StorageLevel.MEMORY_ONLY) { @volatile var otherThread: Thread = null @volatile var receiving = false - @volatile var onStartCalled = false - @volatile var onStopCalled = false - def onStart() { + // tracks calls of "onStart", "onStop" + @transient lazy val callsRecorder = new MethodsCallRecorder() + + def onStart(): Unit = { otherThread = new Thread() { - override def run() { + override def run(): Unit = { receiving = true - var count = 0 - while(!isStopped()) { - if (sendData) { - store(count) - count += 1 + try { + var count = 0 + while(!isStopped()) { + if (sendData) { + store(count) + count += 1 + } + Thread.sleep(10) } - Thread.sleep(10) + } finally { + receiving = false } } } - onStartCalled = true + callsRecorder.record() otherThread.start() } - def onStop() { - onStopCalled = true + def onStop(): Unit = { + callsRecorder.record() otherThread.join() } +} + +class MethodsCallRecorder { + // tracks calling methods as (timestamp, methodName) + private val records = new ArrayBuffer[(Long, String)] + + def record(): Unit = records.append((System.currentTimeMillis(), callerMethodName)) + + def reset(): Unit = records.clear() - def reset() { - receiving = false - onStartCalled = false - onStopCalled = false + def callsWithTimestamp: scala.collection.immutable.Seq[(Long, String)] = records.toList + + def calls: scala.collection.immutable.Seq[String] = records.map(_._2).toList + + def timestamps: scala.collection.immutable.Seq[Long] = records.map(_._1).toList + + private def callerMethodName: String = { + val 
stackTrace = new Throwable().getStackTrace + // it should return method name of two levels deeper + stackTrace(2).getMethodName } } - diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala index c4424b3cff877..1d6637861511f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingContextSuite.scala @@ -26,7 +26,7 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Queue import org.apache.commons.io.FileUtils -import org.scalatest.{Assertions, BeforeAndAfter, PrivateMethodTester} +import org.scalatest.{Assertions, PrivateMethodTester} import org.scalatest.concurrent.{Signaler, ThreadSignaler, TimeLimits} import org.scalatest.concurrent.Eventually._ import org.scalatest.exceptions.TestFailedDueToTimeoutException @@ -44,7 +44,11 @@ import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.{ManualClock, Utils} -class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeLimits with Logging { +class StreamingContextSuite + extends SparkFunSuite + with LocalStreamingContext + with TimeLimits + with Logging { // Necessary to make ScalaTest 3.x interrupt a thread on the JVM like ScalaTest 2.2.x implicit val signaler: Signaler = ThreadSignaler @@ -56,20 +60,6 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL val envPair = "key" -> "value" val conf = new SparkConf().setMaster(master).setAppName(appName) - var sc: SparkContext = null - var ssc: StreamingContext = null - - after { - if (ssc != null) { - ssc.stop() - ssc = null - } - if (sc != null) { - sc.stop() - sc = null - } - } - test("from no conf constructor") { ssc = new StreamingContext(master, appName, batchDuration) assert(ssc.sparkContext.conf.get("spark.master") === master) @@ -95,7 +85,7 @@ 
class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL } test("from existing SparkContext") { - sc = new SparkContext(master, appName) + val sc = new SparkContext(master, appName) ssc = new StreamingContext(sc, batchDuration) } @@ -272,7 +262,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL // Explicitly do not stop SparkContext ssc = new StreamingContext(conf, batchDuration) - sc = ssc.sparkContext + var sc = ssc.sparkContext addInputStream(ssc).register() ssc.start() ssc.stop(stopSparkContext = false) @@ -306,7 +296,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL test("stop gracefully") { val conf = new SparkConf().setMaster(master).setAppName(appName) conf.set("spark.dummyTimeConfig", "3600s") - sc = new SparkContext(conf) + val sc = new SparkContext(conf) for (i <- 1 to 4) { logInfo("==================================\n\n\n") ssc = new StreamingContext(sc, Milliseconds(100)) @@ -338,7 +328,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL // This is not a deterministic unit. But if this unit test is flaky, then there is definitely // something wrong. 
See SPARK-5681 val conf = new SparkConf().setMaster(master).setAppName(appName) - sc = new SparkContext(conf) + val sc = new SparkContext(conf) ssc = new StreamingContext(sc, Milliseconds(100)) val input = ssc.receiverStream(new TestReceiver) input.foreachRDD(_ => {}) @@ -352,11 +342,10 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL test("stop slow receiver gracefully") { val conf = new SparkConf().setMaster(master).setAppName(appName) conf.set("spark.streaming.gracefulStopTimeout", "20000s") - sc = new SparkContext(conf) + val sc = new SparkContext(conf) logInfo("==================================\n\n\n") ssc = new StreamingContext(sc, Milliseconds(100)) var runningCount = 0 - SlowTestReceiver.receivedAllRecords = false // Create test receiver that sleeps in onStop() val totalNumRecords = 15 val recordsPerSecond = 1 @@ -368,6 +357,9 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL } ssc.start() ssc.awaitTerminationOrTimeout(500) + eventually(timeout(10.seconds), interval(10.millis)) { + assert(SlowTestReceiver.initialized) + } ssc.stop(stopSparkContext = false, stopGracefully = true) logInfo("Running count = " + runningCount) assert(runningCount > 0) @@ -445,7 +437,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL // test whether wait exits if context is stopped failAfter(10.seconds) { // 10 seconds because spark takes a long time to shutdown t = new Thread() { - override def run() { + override def run(): Unit = { Thread.sleep(500) ssc.stop() } @@ -512,7 +504,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL // test whether awaitTerminationOrTimeout() return true if context is stopped failAfter(10.seconds) { // 10 seconds because spark takes a long time to shutdown t = new Thread() { - override def run() { + override def run(): Unit = { Thread.sleep(500) ssc.stop() } @@ -591,7 +583,7 @@ class StreamingContextSuite extends 
SparkFunSuite with BeforeAndAfter with TimeL // getOrCreate should recover StreamingContext with existing SparkContext testGetOrCreate { - sc = new SparkContext(conf) + val sc = new SparkContext(conf) ssc = StreamingContext.getOrCreate(checkpointPath, () => creatingFunction()) assert(ssc != null, "no context created") assert(!newContextCreated, "old context not recovered") @@ -603,7 +595,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL require(StreamingContext.getActive().isEmpty, "context exists from before") var newContextCreated = false - def creatingFunc(): StreamingContext = { + def creatingFunc(sc: SparkContext)(): StreamingContext = { newContextCreated = true val newSsc = new StreamingContext(sc, batchDuration) val input = addInputStream(newSsc) @@ -627,8 +619,8 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL // getActiveOrCreate should create new context and getActive should return it only // after starting the context testGetActiveOrCreate { - sc = new SparkContext(conf) - ssc = StreamingContext.getActiveOrCreate(creatingFunc _) + val sc = new SparkContext(conf) + ssc = StreamingContext.getActiveOrCreate(creatingFunc(sc)) assert(ssc != null, "no context created") assert(newContextCreated, "new context not created") assert(StreamingContext.getActive().isEmpty, @@ -636,25 +628,25 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL ssc.start() assert(StreamingContext.getActive() === Some(ssc), "active context not returned") - assert(StreamingContext.getActiveOrCreate(creatingFunc _) === ssc, + assert(StreamingContext.getActiveOrCreate(creatingFunc(sc)) === ssc, "active context not returned") ssc.stop() assert(StreamingContext.getActive().isEmpty, "inactive context returned") - assert(StreamingContext.getActiveOrCreate(creatingFunc _) !== ssc, + assert(StreamingContext.getActiveOrCreate(creatingFunc(sc)) !== ssc, "inactive context returned") } // 
getActiveOrCreate and getActive should return independently created context after activating testGetActiveOrCreate { - sc = new SparkContext(conf) - ssc = creatingFunc() // Create + val sc = new SparkContext(conf) + ssc = creatingFunc(sc) // Create assert(StreamingContext.getActive().isEmpty, "new initialized context returned before starting") ssc.start() assert(StreamingContext.getActive() === Some(ssc), "active context not returned") - assert(StreamingContext.getActiveOrCreate(creatingFunc _) === ssc, + assert(StreamingContext.getActiveOrCreate(creatingFunc(sc)) === ssc, "active context not returned") ssc.stop() assert(StreamingContext.getActive().isEmpty, @@ -736,7 +728,7 @@ class StreamingContextSuite extends SparkFunSuite with BeforeAndAfter with TimeL } test("multiple streaming contexts") { - sc = new SparkContext( + val sc = new SparkContext( conf.clone.set("spark.streaming.clock", "org.apache.spark.util.ManualClock")) ssc = new StreamingContext(sc, Seconds(1)) val input = addInputStream(ssc) @@ -930,9 +922,9 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging var receivingThreadOption: Option[Thread] = None - def onStart() { + def onStart(): Unit = { val thread = new Thread() { - override def run() { + override def run(): Unit = { logInfo("Receiving started") while (!isStopped) { store(TestReceiver.counter.getAndIncrement) @@ -944,7 +936,7 @@ class TestReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging thread.start() } - def onStop() { + def onStop(): Unit = { // no clean to be done, the receiving thread should stop on it own, so just wait for it. 
receivingThreadOption.foreach(_.join()) } @@ -959,26 +951,28 @@ class SlowTestReceiver(totalRecords: Int, recordsPerSecond: Int) extends Receiver[Int](StorageLevel.MEMORY_ONLY) with Logging { var receivingThreadOption: Option[Thread] = None + @volatile var receivedAllRecords = false - def onStart() { + def onStart(): Unit = { val thread = new Thread() { - override def run() { + override def run(): Unit = { logInfo("Receiving started") for(i <- 1 to totalRecords) { Thread.sleep(1000 / recordsPerSecond) store(i) } - SlowTestReceiver.receivedAllRecords = true + receivedAllRecords = true logInfo(s"Received all $totalRecords records") } } receivingThreadOption = Some(thread) thread.start() + SlowTestReceiver.initialized = true } - def onStop() { + def onStop(): Unit = { // Simulate slow receiver by waiting for all records to be produced - while (!SlowTestReceiver.receivedAllRecords) { + while (!receivedAllRecords) { Thread.sleep(100) } // no clean to be done, the receiving thread should stop on it own @@ -986,12 +980,12 @@ class SlowTestReceiver(totalRecords: Int, recordsPerSecond: Int) } object SlowTestReceiver { - var receivedAllRecords = false + var initialized = false } /** Streaming application for testing DStream and RDD creation sites */ -package object testPackage extends Assertions { - def test() { +object testPackage extends Assertions { + def test(): Unit = { val conf = new SparkConf().setMaster("local").setAppName("CreationSite test") val ssc = new StreamingContext(conf, Milliseconds(100)) try { @@ -1032,11 +1026,11 @@ package object testPackage extends Assertions { * This includes methods to access private methods and fields in StreamingContext and MetricsSystem */ private object StreamingContextSuite extends PrivateMethodTester { - private val _sources = PrivateMethod[ArrayBuffer[Source]]('sources) + private val _sources = PrivateMethod[ArrayBuffer[Source]](Symbol("sources")) private def getSources(metricsSystem: MetricsSystem): ArrayBuffer[Source] = { 
metricsSystem.invokePrivate(_sources()) } - private val _streamingSource = PrivateMethod[StreamingSource]('streamingSource) + private val _streamingSource = PrivateMethod[StreamingSource](Symbol("streamingSource")) private def getStreamingSource(streamingContext: StreamingContext): StreamingSource = { streamingContext.invokePrivate(_streamingSource()) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala index 62fd43302b9d7..679c58dbae92b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala @@ -36,20 +36,11 @@ import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler._ -class StreamingListenerSuite extends TestSuiteBase with Matchers { +class StreamingListenerSuite extends TestSuiteBase with LocalStreamingContext with Matchers { val input = (1 to 4).map(Seq(_)).toSeq val operation = (d: DStream[Int]) => d.map(x => x) - var ssc: StreamingContext = _ - - override def afterFunction() { - super.afterFunction() - if (ssc != null) { - ssc.stop() - } - } - // To make sure that the processing start and end times in collected // information are different for successive batches override def batchDuration: Duration = Milliseconds(100) @@ -236,7 +227,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers { // Post a Streaming event after stopping StreamingContext val receiverInfoStopped = ReceiverInfo(0, "test", false, "localhost", "0") ssc.scheduler.listenerBus.post(StreamingListenerReceiverStopped(receiverInfoStopped)) - ssc.sparkContext.listenerBus.waitUntilEmpty(1000) + ssc.sparkContext.listenerBus.waitUntilEmpty() // The StreamingListener should not receive any event verifyNoMoreInteractions(streamingListener) } @@ -288,15 
+279,15 @@ class BatchInfoCollector extends StreamingListener { val batchInfosStarted = new ConcurrentLinkedQueue[BatchInfo] val batchInfosSubmitted = new ConcurrentLinkedQueue[BatchInfo] - override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted) { + override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = { batchInfosSubmitted.add(batchSubmitted.batchInfo) } - override def onBatchStarted(batchStarted: StreamingListenerBatchStarted) { + override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = { batchInfosStarted.add(batchStarted.batchInfo) } - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { batchInfosCompleted.add(batchCompleted.batchInfo) } } @@ -307,15 +298,15 @@ class ReceiverInfoCollector extends StreamingListener { val stoppedReceiverStreamIds = new ConcurrentLinkedQueue[Int] val receiverErrors = new ConcurrentLinkedQueue[(Int, String, String)] - override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted) { + override def onReceiverStarted(receiverStarted: StreamingListenerReceiverStarted): Unit = { startedReceiverStreamIds.add(receiverStarted.receiverInfo.streamId) } - override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped) { + override def onReceiverStopped(receiverStopped: StreamingListenerReceiverStopped): Unit = { stoppedReceiverStreamIds.add(receiverStopped.receiverInfo.streamId) } - override def onReceiverError(receiverError: StreamingListenerReceiverError) { + override def onReceiverError(receiverError: StreamingListenerReceiverError): Unit = { receiverErrors.add(((receiverError.receiverInfo.streamId, receiverError.receiverInfo.lastErrorMessage, receiverError.receiverInfo.lastError))) } @@ -338,7 +329,7 @@ class OutputOperationInfoCollector extends StreamingListener { } class StreamingListenerSuiteReceiver 
extends Receiver[Any](StorageLevel.MEMORY_ONLY) with Logging { - def onStart() { + def onStart(): Unit = { Future { logInfo("Started receiver and sleeping") Thread.sleep(10) @@ -349,7 +340,7 @@ class StreamingListenerSuiteReceiver extends Receiver[Any](StorageLevel.MEMORY_O stop("test stop error") } } - def onStop() { } + def onStop(): Unit = { } } /** @@ -377,7 +368,7 @@ class StreamingContextStoppingCollector(val ssc: StreamingContext) extends Strea private var isFirstBatch = true - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { if (isFirstBatch) { // We should only call `ssc.stop()` in the first batch. Otherwise, it's possible that the main // thread is calling `ssc.stop()`, while StreamingContextStoppingCollector is also calling diff --git a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala index f2ae77896a5d3..55c2950261a07 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/TestSuiteBase.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.ClassTag -import org.scalatest.BeforeAndAfter +import org.scalatest.BeforeAndAfterEach import org.scalatest.concurrent.Eventually.timeout import org.scalatest.concurrent.PatienceConfiguration import org.scalatest.time.{Seconds => ScalaTestSeconds, Span} @@ -62,9 +62,9 @@ private[streaming] class DummyInputDStream(ssc: StreamingContext) extends InputD class TestInputStream[T: ClassTag](_ssc: StreamingContext, input: Seq[Seq[T]], numPartitions: Int) extends InputDStream[T](_ssc) { - def start() {} + def start(): Unit = {} - def stop() {} + def stop(): Unit = {} def compute(validTime: Time): Option[RDD[T]] = { logInfo("Computing RDD for time " + 
validTime) @@ -211,7 +211,7 @@ class BatchCounter(ssc: StreamingContext) { * This is the base trait for Spark Streaming testsuites. This provides basic functionality * to run user-defined set of input on user-defined stream operations, and verify the output. */ -trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { +trait TestSuiteBase extends SparkFunSuite with BeforeAndAfterEach with Logging { // Name of the framework for Spark context def framework: String = this.getClass.getSimpleName @@ -250,8 +250,8 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { val eventuallyTimeout: PatienceConfiguration.Timeout = timeout(Span(10, ScalaTestSeconds)) // Default before function for any streaming test suite. Override this - // if you want to add your stuff to "before" (i.e., don't call before { } ) - def beforeFunction() { + // if you want to add your stuff to "beforeEach" + def beforeFunction(): Unit = { if (useManualClock) { logInfo("Using manual clock") conf.set("spark.streaming.clock", "org.apache.spark.util.ManualClock") @@ -262,13 +262,24 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { } // Default after function for any streaming test suite. 
Override this - // if you want to add your stuff to "after" (i.e., don't call after { } ) - def afterFunction() { + // if you want to add your stuff to "afterEach" + def afterFunction(): Unit = { System.clearProperty("spark.streaming.clock") } - before(beforeFunction) - after(afterFunction) + override def beforeEach(): Unit = { + super.beforeEach() + beforeFunction() + } + + override def afterEach(): Unit = { + try { + afterFunction() + } finally { + super.afterEach() + } + + } /** * Run a block of code with the given StreamingContext and automatically @@ -278,12 +289,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { try { block(ssc) } finally { - try { - ssc.stop(stopSparkContext = true) - } catch { - case e: Exception => - logError("Error stopping StreamingContext", e) - } + LocalStreamingContext.stop(ssc, stopSparkContext = true) } } @@ -452,7 +458,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { output: Seq[Seq[V]], expectedOutput: Seq[Seq[V]], useSet: Boolean - ) { + ): Unit = { logInfo("--------------------------------") logInfo("output.size = " + output.size) logInfo("output") @@ -492,7 +498,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { operation: DStream[U] => DStream[V], expectedOutput: Seq[Seq[V]], useSet: Boolean = false - ) { + ): Unit = { testOperation[U, V](input, operation, expectedOutput, -1, useSet) } @@ -511,7 +517,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { expectedOutput: Seq[Seq[V]], numBatches: Int, useSet: Boolean - ) { + ): Unit = { val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size withStreamingContext(setupStreams[U, V](input, operation)) { ssc => val output = runStreams[V](ssc, numBatches_, expectedOutput.size) @@ -529,7 +535,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { operation: (DStream[U], DStream[V]) => DStream[W], expectedOutput: 
Seq[Seq[W]], useSet: Boolean - ) { + ): Unit = { testOperation[U, V, W](input1, input2, operation, expectedOutput, -1, useSet) } @@ -550,7 +556,7 @@ trait TestSuiteBase extends SparkFunSuite with BeforeAndAfter with Logging { expectedOutput: Seq[Seq[W]], numBatches: Int, useSet: Boolean - ) { + ): Unit = { val numBatches_ = if (numBatches > 0) numBatches else expectedOutput.size withStreamingContext(setupStreams[U, V, W](input1, input2, operation)) { ssc => val output = runStreams[W](ssc, numBatches_, expectedOutput.size) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala index 1d34221fde4f4..bdc9e9ee2aed1 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala @@ -23,8 +23,8 @@ import org.openqa.selenium.WebDriver import org.openqa.selenium.htmlunit.HtmlUnitDriver import org.scalatest._ import org.scalatest.concurrent.Eventually._ -import org.scalatest.selenium.WebBrowser import org.scalatest.time.SpanSugar._ +import org.scalatestplus.selenium.WebBrowser import org.apache.spark._ import org.apache.spark.internal.config.UI.UI_ENABLED @@ -97,7 +97,7 @@ class UISeleniumSuite val sparkUI = ssc.sparkContext.ui.get - sparkUI.getHandlers.count(_.getContextPath.contains("/streaming")) should be (5) + sparkUI.getDelegatingHandlers.count(_.getContextPath.contains("/streaming")) should be (5) eventually(timeout(10.seconds), interval(50.milliseconds)) { go to (sparkUI.webUrl.stripSuffix("/")) @@ -151,8 +151,9 @@ class UISeleniumSuite summaryText should contain ("Total delay:") findAll(cssSelector("""#batch-job-table th""")).map(_.text).toSeq should be { - List("Output Op Id", "Description", "Output Op Duration", "Status", "Job Id", - "Job Duration", "Stages: Succeeded/Total", "Tasks (for all stages): Succeeded/Total", + List("Output Op Id", "Description", 
"Output Op Duration (?)", "Status", "Job Id", + "Job Duration (?)", "Stages: Succeeded/Total", + "Tasks (for all stages): Succeeded/Total", "Error") } @@ -163,7 +164,7 @@ class UISeleniumSuite // Check job ids val jobIdCells = findAll(cssSelector( """#batch-job-table a""")).toSeq - jobIdCells.map(_.text) should be (List("0", "1", "2", "3")) + jobIdCells.map(_.text).filter(_.forall(_.isDigit)) should be (List("0", "1", "2", "3")) val jobLinks = jobIdCells.flatMap(_.attribute("href")) jobLinks.size should be (4) @@ -198,7 +199,7 @@ class UISeleniumSuite ssc.stop(false) - sparkUI.getHandlers.count(_.getContextPath.contains("/streaming")) should be (0) + sparkUI.getDelegatingHandlers.count(_.getContextPath.contains("/streaming")) should be (0) eventually(timeout(10.seconds), interval(50.milliseconds)) { go to (sparkUI.webUrl.stripSuffix("/")) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/WindowOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/WindowOperationsSuite.scala index c7d085ec0799b..468a52226682e 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/WindowOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/WindowOperationsSuite.scala @@ -146,15 +146,16 @@ class WindowOperationsSuite extends TestSuiteBase { test("window - persistence level") { val input = Seq( Seq(0), Seq(1), Seq(2), Seq(3), Seq(4), Seq(5)) - val ssc = new StreamingContext(conf, batchDuration) - val inputStream = new TestInputStream[Int](ssc, input, 1) - val windowStream1 = inputStream.window(batchDuration * 2) - assert(windowStream1.storageLevel === StorageLevel.NONE) - assert(inputStream.storageLevel === StorageLevel.MEMORY_ONLY_SER) - windowStream1.persist(StorageLevel.MEMORY_ONLY) - assert(windowStream1.storageLevel === StorageLevel.NONE) - assert(inputStream.storageLevel === StorageLevel.MEMORY_ONLY) - ssc.stop() + + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => + val inputStream = 
new TestInputStream[Int](ssc, input, 1) + val windowStream1 = inputStream.window(batchDuration * 2) + assert(windowStream1.storageLevel === StorageLevel.NONE) + assert(inputStream.storageLevel === StorageLevel.MEMORY_ONLY_SER) + windowStream1.persist(StorageLevel.MEMORY_ONLY) + assert(windowStream1.storageLevel === StorageLevel.NONE) + assert(inputStream.storageLevel === StorageLevel.MEMORY_ONLY) + } } // Testing naive reduceByKeyAndWindow (without invertible function) @@ -276,7 +277,7 @@ class WindowOperationsSuite extends TestSuiteBase { expectedOutput: Seq[Seq[Int]], windowDuration: Duration = Seconds(2), slideDuration: Duration = Seconds(1) - ) { + ): Unit = { test("window - " + name) { val numBatches = expectedOutput.size * (slideDuration / batchDuration).toInt val operation = (s: DStream[Int]) => s.window(windowDuration, slideDuration) @@ -290,7 +291,7 @@ class WindowOperationsSuite extends TestSuiteBase { expectedOutput: Seq[Seq[(String, Int)]], windowDuration: Duration = Seconds(2), slideDuration: Duration = Seconds(1) - ) { + ): Unit = { test("reduceByKeyAndWindow - " + name) { logInfo("reduceByKeyAndWindow - " + name) val numBatches = expectedOutput.size * (slideDuration / batchDuration).toInt @@ -307,7 +308,7 @@ class WindowOperationsSuite extends TestSuiteBase { expectedOutput: Seq[Seq[(String, Int)]], windowDuration: Duration = Seconds(2), slideDuration: Duration = Seconds(1) - ) { + ): Unit = { test("reduceByKeyAndWindow with inverse function - " + name) { logInfo("reduceByKeyAndWindow with inverse function - " + name) val numBatches = expectedOutput.size * (slideDuration / batchDuration).toInt @@ -325,7 +326,7 @@ class WindowOperationsSuite extends TestSuiteBase { expectedOutput: Seq[Seq[(String, Int)]], windowDuration: Duration = Seconds(2), slideDuration: Duration = Seconds(1) - ) { + ): Unit = { test("reduceByKeyAndWindow with inverse and filter functions - " + name) { logInfo("reduceByKeyAndWindow with inverse and filter functions - " + name) val 
numBatches = expectedOutput.size * (slideDuration / batchDuration).toInt diff --git a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala index aa69be7ca9939..86a8dc47098af 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDDSuite.scala @@ -164,7 +164,7 @@ class WriteAheadLogBackedBlockRDDSuite testIsBlockValid: Boolean = false, testBlockRemove: Boolean = false, testStoreInBM: Boolean = false - ) { + ): Unit = { require(numPartitionsInBM <= numPartitions, "Can't put more partitions in BlockManager than that in RDD") require(numPartitionsInWAL <= numPartitions, diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala index a8b00558b40a7..65efa10bfcf92 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ExecutorAllocationManagerSuite.scala @@ -19,26 +19,26 @@ package org.apache.spark.streaming.scheduler import org.mockito.ArgumentMatchers.{eq => meq} import org.mockito.Mockito.{never, reset, times, verify, when} -import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, PrivateMethodTester} +import org.scalatest.{BeforeAndAfterEach, PrivateMethodTester} import org.scalatest.concurrent.Eventually.{eventually, timeout} -import org.scalatest.mockito.MockitoSugar import org.scalatest.time.SpanSugar._ +import org.scalatestplus.mockito.MockitoSugar -import org.apache.spark.{ExecutorAllocationClient, SparkConf, SparkFunSuite} +import org.apache.spark.{ExecutorAllocationClient, SparkConf} import 
org.apache.spark.internal.config.{DYN_ALLOCATION_ENABLED, DYN_ALLOCATION_TESTING} import org.apache.spark.internal.config.Streaming._ -import org.apache.spark.streaming.{DummyInputDStream, Seconds, StreamingContext} +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.streaming.{DummyInputDStream, Seconds, StreamingContext, TestSuiteBase} import org.apache.spark.util.{ManualClock, Utils} - -class ExecutorAllocationManagerSuite extends SparkFunSuite - with BeforeAndAfter with BeforeAndAfterAll with MockitoSugar with PrivateMethodTester { +class ExecutorAllocationManagerSuite extends TestSuiteBase + with MockitoSugar with PrivateMethodTester { private val batchDurationMillis = 1000L private var allocationClient: ExecutorAllocationClient = null private var clock: StreamManualClock = null - before { + override def beforeEach(): Unit = { allocationClient = mock[ExecutorAllocationClient] clock = new StreamManualClock() } @@ -72,10 +72,15 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite if (expectedRequestedTotalExecs.nonEmpty) { require(expectedRequestedTotalExecs.get > 0) verify(allocationClient, times(1)).requestTotalExecutors( - meq(expectedRequestedTotalExecs.get), meq(0), meq(Map.empty)) + meq(Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> + expectedRequestedTotalExecs.get)), + meq(Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0)), + meq(Map.empty)) } else { - verify(allocationClient, never).requestTotalExecutors(0, 0, Map.empty) - } + verify(allocationClient, never).requestTotalExecutors( + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID -> 0), + Map.empty)} } /** Verify that a particular executor was killed */ @@ -140,8 +145,11 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite reset(allocationClient) when(allocationClient.getExecutorIds()).thenReturn((1 to numExecs).map(_.toString)) requestExecutors(allocationManager, numNewExecs) - 
verify(allocationClient, times(1)).requestTotalExecutors( - meq(expectedRequestedTotalExecs), meq(0), meq(Map.empty)) + val defaultProfId = ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID + verify(allocationClient, times(1)). + requestTotalExecutors( + meq(Map(defaultProfId -> expectedRequestedTotalExecs)), + meq(Map(defaultProfId -> 0)), meq(Map.empty)) } withAllocationManager(numReceivers = 1) { case (_, allocationManager) => @@ -364,11 +372,11 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite } } - private val _addBatchProcTime = PrivateMethod[Unit]('addBatchProcTime) - private val _requestExecutors = PrivateMethod[Unit]('requestExecutors) - private val _killExecutor = PrivateMethod[Unit]('killExecutor) + private val _addBatchProcTime = PrivateMethod[Unit](Symbol("addBatchProcTime")) + private val _requestExecutors = PrivateMethod[Unit](Symbol("requestExecutors")) + private val _killExecutor = PrivateMethod[Unit](Symbol("killExecutor")) private val _executorAllocationManager = - PrivateMethod[Option[ExecutorAllocationManager]]('executorAllocationManager) + PrivateMethod[Option[ExecutorAllocationManager]](Symbol("executorAllocationManager")) private def addBatchProcTime(manager: ExecutorAllocationManager, timeMs: Long): Unit = { manager invokePrivate _addBatchProcTime(timeMs) @@ -392,13 +400,9 @@ class ExecutorAllocationManagerSuite extends SparkFunSuite .setAppName(this.getClass.getSimpleName) .set("spark.streaming.dynamicAllocation.testing", "true") // to test dynamic allocation - var ssc: StreamingContext = null - try { - ssc = new StreamingContext(conf, Seconds(1)) + withStreamingContext(new StreamingContext(conf, Seconds(1))) { ssc => new DummyInputDStream(ssc).foreachRDD(_ => { }) body(ssc) - } finally { - if (ssc != null) ssc.stop() } } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala index 
a7e365649d3e8..cc393425ca6f0 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/InputInfoTrackerSuite.scala @@ -17,27 +17,15 @@ package org.apache.spark.streaming.scheduler -import org.scalatest.BeforeAndAfter - import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.streaming.{Duration, StreamingContext, Time} - -class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { +import org.apache.spark.streaming.{Duration, LocalStreamingContext, StreamingContext, Time} - private var ssc: StreamingContext = _ +class InputInfoTrackerSuite extends SparkFunSuite with LocalStreamingContext { - before { + override def beforeEach(): Unit = { + super.beforeEach() val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") - if (ssc == null) { - ssc = new StreamingContext(conf, Duration(1000)) - } - } - - after { - if (ssc != null) { - ssc.stop() - ssc = null - } + ssc = new StreamingContext(conf, Duration(1000)) } test("test report and get InputInfo from InputInfoTracker") { diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala index f0e502727402e..227a02eece65b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/JobGeneratorSuite.scala @@ -93,14 +93,14 @@ class JobGeneratorSuite extends TestSuiteBase { } // Wait for new blocks to be received - def waitForNewReceivedBlocks() { + def waitForNewReceivedBlocks(): Unit = { eventually(testTimeout) { assert(receiverTracker.hasUnallocatedBlocks) } } // Wait for received blocks to be allocated to a batch - def waitForBlocksToBeAllocatedToBatch(batchTime: Long) { + def waitForBlocksToBeAllocatedToBatch(batchTime: Long): 
Unit = { eventually(testTimeout) { assert(getBlocksOfBatch(batchTime).nonEmpty) } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala index 37ca0ce2f6a30..b5a45fc317d0e 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/RateControllerSuite.scala @@ -30,8 +30,7 @@ class RateControllerSuite extends TestSuiteBase { override def batchDuration: Duration = Milliseconds(50) test("RateController - rate controller publishes updates after batches complete") { - val ssc = new StreamingContext(conf, batchDuration) - withStreamingContext(ssc) { ssc => + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => val dstream = new RateTestInputDStream(ssc) dstream.register() ssc.start() @@ -43,8 +42,7 @@ class RateControllerSuite extends TestSuiteBase { } test("ReceiverRateController - published rates reach receivers") { - val ssc = new StreamingContext(conf, batchDuration) - withStreamingContext(ssc) { ssc => + withStreamingContext(new StreamingContext(conf, batchDuration)) { ssc => val estimator = new ConstantEstimator(100) val dstream = new RateTestInputDStream(ssc) { override val rateController = diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index fec20f0429ff0..1a0154600bf3c 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -205,9 +205,9 @@ class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { var receivingThreadOption: Option[Thread] = None - def onStart() { + def onStart(): Unit = { val thread = 
new Thread() { - override def run() { + override def run(): Unit = { while (!StoppableReceiver.shouldStop) { Thread.sleep(10) } @@ -217,7 +217,7 @@ class StoppableReceiver extends Receiver[Int](StorageLevel.MEMORY_ONLY) { thread.start() } - def onStop() { + def onStop(): Unit = { StoppableReceiver.shouldStop = true receivingThreadOption.foreach(_.join()) // Reset it so as to restart it diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala index 56b400850fdd4..10f92f9386173 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala @@ -22,24 +22,18 @@ import java.util.Properties import org.scalatest.Matchers import org.apache.spark.scheduler.SparkListenerJobStart -import org.apache.spark.streaming._ +import org.apache.spark.streaming.{LocalStreamingContext, _} import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.scheduler._ -class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { +class StreamingJobProgressListenerSuite + extends TestSuiteBase + with LocalStreamingContext + with Matchers { val input = (1 to 4).map(Seq(_)).toSeq val operation = (d: DStream[Int]) => d.map(x => x) - var ssc: StreamingContext = _ - - override def afterFunction() { - super.afterFunction() - if (ssc != null) { - ssc.stop() - } - } - private def createJobStart( batchTime: Time, outputOpId: Int, jobId: Int): SparkListenerJobStart = { val properties = new Properties() diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala index d3ca2b58f36c2..576083723f8bd 100644 --- 
a/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/UIUtilsSuite.scala @@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit import org.scalatest.Matchers import org.apache.spark.SparkFunSuite +import org.apache.spark.ui.{UIUtils => SparkUIUtils} class UIUtilsSuite extends SparkFunSuite with Matchers{ @@ -70,10 +71,13 @@ class UIUtilsSuite extends SparkFunSuite with Matchers{ test("formatBatchTime") { val tzForTest = TimeZone.getTimeZone("America/Los_Angeles") val batchTime = 1431637480452L // Thu May 14 14:04:40 PDT 2015 - assert("2015/05/14 14:04:40" === UIUtils.formatBatchTime(batchTime, 1000, timezone = tzForTest)) + assert("2015/05/14 14:04:40" === + SparkUIUtils.formatBatchTime(batchTime, 1000, timezone = tzForTest)) assert("2015/05/14 14:04:40.452" === - UIUtils.formatBatchTime(batchTime, 999, timezone = tzForTest)) - assert("14:04:40" === UIUtils.formatBatchTime(batchTime, 1000, false, timezone = tzForTest)) - assert("14:04:40.452" === UIUtils.formatBatchTime(batchTime, 999, false, timezone = tzForTest)) + SparkUIUtils.formatBatchTime(batchTime, 999, timezone = tzForTest)) + assert("14:04:40" === + SparkUIUtils.formatBatchTime(batchTime, 1000, false, timezone = tzForTest)) + assert("14:04:40.452" === + SparkUIUtils.formatBatchTime(batchTime, 999, false, timezone = tzForTest)) } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala index 25b70a3d089ee..a11dac4d41caa 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/RecurringTimerSuite.scala @@ -69,7 +69,7 @@ class RecurringTimerSuite extends SparkFunSuite with PrivateMethodTester { } } thread.start() - val stopped = PrivateMethod[RecurringTimer]('stopped) + val stopped = 
PrivateMethod[RecurringTimer](Symbol("stopped")) // Make sure the `stopped` field has been changed eventually(timeout(10.seconds), interval(10.millis)) { assert(timer.invokePrivate(stopped()) === true) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala index 8d2fa7d515e2f..bb60d6fa7bf78 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/util/WriteAheadLogSuite.scala @@ -34,9 +34,10 @@ import org.mockito.ArgumentCaptor import org.mockito.ArgumentMatchers.{any, anyLong, eq => meq} import org.mockito.Mockito.{times, verify, when} import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach, PrivateMethodTester} +import org.scalatest.Assertions._ import org.scalatest.concurrent.Eventually import org.scalatest.concurrent.Eventually._ -import org.scalatest.mockito.MockitoSugar +import org.scalatestplus.mockito.MockitoSugar import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} import org.apache.spark.streaming.scheduler._ @@ -256,12 +257,12 @@ class FileBasedWriteAheadLogSuite counter.increment() // block so that other threads also launch latch.await(10, TimeUnit.SECONDS) - override def completion() { counter.decrement() } + override def completion(): Unit = { counter.decrement() } } } @volatile var collected: Seq[Int] = Nil val t = new Thread() { - override def run() { + override def run(): Unit = { // run the calculation on a separate thread so that we can release the latch val iterator = FileBasedWriteAheadLog.seqToParIterator[Int, Int](executionContext, testSeq, handle) @@ -434,7 +435,7 @@ class BatchedWriteAheadLogSuite extends CommonWriteAheadLogTests( private var walBatchingExecutionContext: ExecutionContextExecutorService = _ private val sparkConf = new SparkConf() - private val queueLength = PrivateMethod[Int]('getQueueLength) + 
private val queueLength = PrivateMethod[Int](Symbol("getQueueLength")) override def beforeEach(): Unit = { super.beforeEach() diff --git a/tools/pom.xml b/tools/pom.xml index 6286fad403c83..e380e869f55c7 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -44,7 +44,7 @@ org.clapper classutil_${scala.binary.version} - 1.1.2 + 1.5.1 diff --git a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala index c9058ff409893..f9bc499961ad7 100644 --- a/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala +++ b/tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala @@ -115,7 +115,7 @@ object GenerateMIMAIgnore { ).filter(x => isPackagePrivate(x)).map(_.fullName) ++ getInnerFunctions(classSymbol) } - def main(args: Array[String]) { + def main(args: Array[String]): Unit = { import scala.tools.nsc.io.File val (privateClasses, privateMembers) = privateWithin("org.apache.spark") val previousContents = Try(File(".generated-mima-class-excludes").lines()).